Replace packages branch literal by an expression

Apply optimisations from #30
Remove reverting of the i2c-tools package - fixed upstream
2023-12-01 23:14:04 +02:00 · 2023-11-27 05:54:42 +02:00 · 2023-11-22 01:28:47 +02:00 · 2023-11-07 00:25:56 +02:00 · 2023-10-26 13:13:43 +08:00 · 2023-10-24 04:52:07 +03:00
33 changed files with 11484 additions and 41 deletions
--- a/.buildbot/openwrt/Dockerfile
+++ b/.buildbot/openwrt/Dockerfile
@ -1,27 +1,28 @@
 FROM ubuntu:focal

-ENV OPENWRT_VERSION=22.03.2
-ENV OPENWRT_TARGET=bcm27xx
-ENV OPENWRT_DEVICE=bcm2711
-ENV OPENWRT_CC=gcc-11.2.0_musl
-
 RUN apt -y update

 RUN DEBIAN_FRONTEND=noninteractive TZ=UTC \
  apt -yq install \
  build-essential rsync git-core subversion mercurial libssl-dev \
  libncurses5-dev unzip  gawk zlib1g-dev libncursesw5-dev zlib1g-dev \
-  gettext xsltproc wget unzip python
+  gettext xsltproc wget unzip python python3-distutils
+
+# Board build parameters
+ENV OPENWRT_VERSION=22.03.2
+ENV OPENWRT_TARGET=bcm27xx
+ENV OPENWRT_DEVICE=bcm2711
+ENV OPENWRT_CC=gcc-11.2.0_musl

 RUN mkdir /Downloads

 RUN wget -qP /Downloads \
-     "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/bcm27xx/bcm2711/openwrt-imagebuilder-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}.Linux-x86_64.tar.xz"
+     "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/${OPENWRT_TARGET}/${OPENWRT_DEVICE}/openwrt-imagebuilder-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}.Linux-x86_64.tar.xz"

 RUN wget -qP /Downloads \
-     "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/bcm27xx/bcm2711/openwrt-sdk-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}_${OPENWRT_CC}.Linux-x86_64.tar.xz"
+     "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/${OPENWRT_TARGET}/${OPENWRT_DEVICE}/openwrt-sdk-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}_${OPENWRT_CC}.Linux-x86_64.tar.xz"

-RUN wget -qP /Downloads \
-     "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/bcm27xx/bcm2711/openwrt-toolchain-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}_${OPENWRT_CC}.Linux-x86_64.tar.xz"
+# RUN wget -qP /Downloads \
+#      "https://downloads.openwrt.org/releases/${OPENWRT_VERSION}/targets/${OPENWRT_TARGET}/${OPENWRT_DEVICE}/openwrt-toolchain-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}_${OPENWRT_CC}.Linux-x86_64.tar.xz"

 ADD . .
--- a/.buildbot/openwrt/build.sh
+++ b/.buildbot/openwrt/build.sh
@ -11,6 +11,9 @@

 set -x

+renice -n 19 $$
+MAKE="make -j$(nproc)"
+
 tar -xJf /Downloads/openwrt-sdk-${OPENWRT_VERSION}-*.tar.xz

 cp -r feed openwrt-sdk-*/
@ -21,6 +24,10 @@ cd openwrt-sdk-*/
 cp feeds.conf.default feeds.conf
 cat ${pwd}/feeds.conf >> feeds.conf

+OPENWRT_BRANCH=$(echo ${OPENWRT_VERSION} | cut -d. -f 1,2)
+
+sed -i "s/\(packages.git\)\^.*/\1;openwrt-${OPENWRT_BRANCH}/g" feeds.conf
+
 ./scripts/feeds update -a
 ./scripts/feeds install -a

@ -28,21 +35,31 @@ make defconfig
 # Enable collectd network encryption
 echo "CONFIG_PACKAGE_COLLECTD_ENCRYPTED_NETWORK=y" >> .config

-make package/asterisk-chan-quectel/compile
-make package/collectd/compile
-# no signing key
-make package/index
+${MAKE} package/i2c-tools/compile
+make package/kmod-i2c-mux-pinctrl/compile
+make package/kmod-nvme/compile
+make package/kmod-rtc-pcf85063/compile
+make package/nvme-cli/compile
+make package/ansible-core/compile
+${MAKE} package/asterisk-chan-quectel/compile
+${MAKE} package/collectd/compile

 sdkdir=$(pwd)
 cd ${pwd}

 tar -xJf /Downloads/openwrt-imagebuilder-${OPENWRT_VERSION}-*.tar.xz

+PROFILE=rpi-4
+BINPATH=bin/targets/${OPENWRT_TARGET}/${OPENWRT_DEVICE}
+IMAGE_PATH=${BINPATH}/openwrt-${OPENWRT_VERSION}-${OPENWRT_TARGET}-${OPENWRT_DEVICE}-${PROFILE}
+PKG_ARCH=aarch64_cortex-a72
+
 cd openwrt-imagebuilder-*/
-cp ${sdkdir}/bin/targets/bcm27xx/bcm2711/packages/*.ipk packages/
-cp ${sdkdir}/bin/packages/aarch64_cortex-a72/packages/*.ipk packages/
-cp ${sdkdir}/bin/packages/aarch64_cortex-a72/telephony/*.ipk packages/
-cp ${sdkdir}/bin/packages/aarch64_cortex-a72/quectel/*.ipk packages/
+cp ${sdkdir}/${BINPATH}/packages/*.ipk packages/
+cp ${sdkdir}/bin/packages/${PKG_ARCH}/base/*.ipk packages/
+cp ${sdkdir}/bin/packages/${PKG_ARCH}/packages/*.ipk packages/
+cp ${sdkdir}/bin/packages/${PKG_ARCH}/telephony/*.ipk packages/
+cp ${sdkdir}/bin/packages/${PKG_ARCH}/local/*.ipk packages/

 make info

@ -51,24 +68,13 @@ echo "CONFIG_TARGET_ROOTFS_PARTSIZE=308" >> .config
 # Disable ext4 images
 echo "CONFIG_TARGET_ROOTFS_EXT4FS=n" >> .config

-# Setup files
-mkdir -p files/etc/dropbear
-chmod 0750 files/etc/dropbear
-cp ${pwd}/authorized_keys files/etc/dropbear
-chmod 0400 files/etc/dropbear/authorized_keys
-mkdir -p files/etc/uci-defaults
-cp ${pwd}/defaults/* files/etc/uci-defaults
-
-PACKAGES="kmod-nf-nathelper-extra kmod-rtc-ds1307 \
+PACKAGES="kmod-i2c-bcm2835 kmod-i2c-mux kmod-i2c-mux-pinctrl \
+          kmod-nf-nathelper-extra \
+	  kmod-rtc-ds1307 kmod-rtc-pcf8563 kmod-rtc-pcf85063 \
 	  kmod-usb-audio kmod-usb-net-rtl8152 \
 	  alsa-lib pciutils usbutils \
 	  kmod-usb-net-qmi-wwan libqmi qmi-utils uqmi luci-proto-qmi \
 	  shadow-usermod \
-	  asterisk \
-	  asterisk-app-sms asterisk-pjsip asterisk-bridge-simple \
-	  asterisk-codec-alaw asterisk-codec-ulaw \
-	  asterisk-res-rtp-asterisk \
-	  asterisk-chan-quectel \
 	  ca-certificates \
 	  collectd collectd-mod-contextswitch collectd-mod-cpu \
 	  collectd-mod-curl collectd-mod-df \
@ -87,23 +93,87 @@ PACKAGES="kmod-nf-nathelper-extra kmod-rtc-ds1307 \
 	  collectd-mod-thermal collectd-mod-uptime \
 	  collectd-mod-users collectd-mod-vmem \
 	  collectd-mod-wireless \
-	  curl etherwake fail2ban \
+	  curl etherwake fail2ban hwclock i2c-tools \
 	  luci-app-mwan3 luci-app-upnp \
 	  luci-proto-wireguard \
 	  luci-ssl-nginx luci-app-acme \
-	  python3-packages python3-yaml \
+	  python3-ansible-core python3-ansible-core-src \
+	  python3-yaml \
 	  openssh-sftp-server tcpdump \
 	  iptables-nft ip6tables-nft \
 	  wireguard-tools wget-ssl \
 	 "

-make image PROFILE=rpi-4 \
-     PACKAGES="${PACKAGES}" DISABLED_SERVICES="dropbear" FILES="files"
+BOOTSOURCE=target/linux/${OPENWRT_TARGET}/image
+BOOTCONFIG=${BOOTSOURCE}/config.txt
+KERNELSOURCE=build_dir/target-${PKG_ARCH}_musl/linux-${OPENWRT_TARGET}_${OPENWRT_DEVICE}/linux-5.10.146
+OVERLAYSOURCE=${KERNELSOURCE}/arch/arm64/boot/dts/overlays

-make manifest PROFILE=rpi-4 PACKAGES="${PACKAGES}"
+# Setup files
+mkdir -p files/etc/dropbear
+chmod 0750 files/etc/dropbear
+cp ${pwd}/authorized_keys files/etc/dropbear
+chmod 0400 files/etc/dropbear/authorized_keys
+mkdir -p files/etc/docker
+chmod 0750 files/etc/docker
+cp ${pwd}/daemon.json files/etc/docker
+mkdir -p files/etc/uci-defaults
+cp ${pwd}/defaults/* files/etc/uci-defaults
+
+cp ${pwd}/overlays/dualeth.txt ${BOOTSOURCE}/current.txt
+
+cp ${BOOTCONFIG} .
+
+cat << "EOF" >> ${BOOTCONFIG}
+dtparam=i2c1=on
+dtparam=spi=on
+dtparam=i2s=on
+dtoverlay=i2c-rtc,ds1307
+
+include current.txt
+EOF
+
+pushd ${BOOTSOURCE}
+git apply ${pwd}/patches/bootconfig-add.patch
+popd
+
+make image PROFILE=${PROFILE} EXTRA_IMAGE_NAME="dualeth" \
+     PACKAGES="${PACKAGES}" DISABLED_SERVICES="dropbear" FILES="files" || exit 1
+
+
+sed -i "s|,i2c_csi_dsi||g" ${BOOTSOURCE}/current.txt
+sed -i "s/CONFIG_TARGET_ROOTFS_PARTSIZE=.*/CONFIG_TARGET_ROOTFS_PARTSIZE=768/g" .config
+
+make image PROFILE=${PROFILE} EXTRA_IMAGE_NAME="waveshare" \
+     PACKAGES=" \
+	${PACKAGES} cryptsetup kmod-ata-ahci smartmontools hdparm fdisk parted \
+	kmod-hwmon-drivetemp btrfs-progs kmod-fs-btrfs kmod-nvme nvme-cli \
+        docker dockerd docker-compose block-mount" \
+     DISABLED_SERVICES="dropbear" FILES="files" || exit 1
+
+
+cp ${pwd}/overlays/sensing.txt ${BOOTSOURCE}/current.txt
+cp ${pwd}/overlays/*.dtbo ${OVERLAYSOURCE}
+cp config.txt ${BOOTCONFIG}
+echo "include current.txt" >> ${BOOTCONFIG}
+pushd ${KERNELSOURCE}
+git apply ${pwd}/patches/overlay-add.patch
+popd
+
+sed -i "s/CONFIG_TARGET_ROOTFS_PARTSIZE=.*/CONFIG_TARGET_ROOTFS_PARTSIZE=308/g" .config
+
+make image PROFILE=${PROFILE} EXTRA_IMAGE_NAME="sensing" \
+     PACKAGES="${PACKAGES} asterisk \
+	  asterisk-app-sms asterisk-pjsip asterisk-bridge-simple \
+	  asterisk-codec-alaw asterisk-codec-ulaw \
+	  asterisk-res-rtp-asterisk \
+	  asterisk-chan-quectel" \
+     DISABLED_SERVICES="dropbear" FILES="files" || exit 1
+
+
+cat ${BINPATH}/*.manifest

 out=../../out

 mkdir -p ${out}
-mv bin/targets/bcm27xx/bcm2711/*.gz ${out}
-mv packages ${out}
+mv ${BINPATH}/*.gz ${out}
--- a/daemon.json
+++ b/daemon.json
@ -0,0 +1,6 @@
+{
+    "log-level": "warn",
+    "userland-proxy": false,
+    "iptables": true,
+    "storage-driver": "btrfs"
+}
--- a/defaults/48-docker
+++ b/defaults/48-docker
@ -0,0 +1,14 @@
+VERSION=1
+
+[ "$(uci -q get defaults.version.docker)" -ge "$VERSION" ] && exit 0
+
+# save version
+/sbin/uci set defaults.version.docker="$VERSION"
+
+/sbin/uci -q batch << EOF
+set defaults.version.docker="$VERSION"
+commit defaults
+set docker.globals=globals
+set docker.globals.alt_config_file=/etc/docker/daemon.json
+commit docker
+EOF
--- a/defaults/49-network
+++ b/defaults/49-network
@ -1,6 +1,6 @@
 VERSION=1

-[ "$(uci -q get defaults.version.network)" -ge "$VERSION" ] & exit 
+[ "$(uci -q get defaults.version.network)" -ge "$VERSION" ] && exit

 # save version
 /sbin/uci set defaults.version.network="$VERSION"
--- a/feed/ansible-core/Makefile
+++ b/feed/ansible-core/Makefile
@ -0,0 +1,40 @@
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+#
+
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=ansible-core
+PKG_VERSION:=2.14.2
+PKG_RELEASE:=$(AUTORELEASE)
+
+PYPI_NAME:=$(PKG_NAME)
+PKG_HASH:=47f0d4b4125b58edba6435a47f37cbe6a18da54594d18f812958bb0cb58d4e65
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=GPL-3.0-or-later
+
+include $(TOPDIR)/feeds/packages/lang/python/pypi.mk
+include $(INCLUDE_DIR)/package.mk
+include $(TOPDIR)/feeds/packages/lang/python/python3-package.mk
+
+define Package/python3-$(PKG_NAME)
+  SECTION:=lang
+  CATEGORY:=Languages
+  SUBMENU:=Python
+  TITLE:=Radically simple IT automation
+  URL:=https://ansible.com/
+  DEPENDS:=+python3 +python3-logging +python3-setuptools +python3-cryptography \
+	+python3-jinja2 +python3-yaml +python3-packaging \
+	+python3-git +python3-resolvelib
+endef
+
+define Package/python3-$(PKG_NAME)/description
+   Ansible is a radically simple IT automation system. It handles configuration
+   management, application deployment, cloud provisioning, ad-hoc task
+   execution, network automation, and multi-node orchestration. Ansible makes
+   complex changes like zero-downtime rolling updates with load balancers easy.
+endef
+
+$(eval $(call Py3Package,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)-src))
--- a/feed/gitdb/Makefile
+++ b/feed/gitdb/Makefile
@ -0,0 +1,38 @@
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+#
+
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=gitdb
+PKG_VERSION:=4.0.10
+PKG_RELEASE:=$(AUTORELEASE)
+
+PYPI_NAME:=$(PKG_NAME)
+PKG_HASH:=6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=BSD
+
+include $(TOPDIR)/feeds/packages/lang/python/pypi.mk
+include $(INCLUDE_DIR)/package.mk
+include $(TOPDIR)/feeds/packages/lang/python/python3-package.mk
+
+define Package/python3-$(PKG_NAME)
+  SECTION:=lang
+  CATEGORY:=Languages
+  SUBMENU:=Python
+  TITLE:=Git Object Database
+  URL:=https://github.com/gitpython-developers/gitdb
+  DEPENDS:=+python3 +python3-setuptools +python3-smmap
+endef
+
+define Package/python3-$(PKG_NAME)/description
+   GitDB allows you to access bare git repositories for reading and writing.
+   It aims at allowing full access to loose objects as well as packs with
+   performance and scalability in mind. It operates exclusively on streams,
+   allowing to handle large objects with a small memory footprint.
+endef
+
+$(eval $(call Py3Package,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)-src))
--- a/feed/gitpython/Makefile
+++ b/feed/gitpython/Makefile
@ -0,0 +1,36 @@
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+#
+
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=git
+PKG_VERSION:=3.1.31
+PKG_RELEASE:=$(AUTORELEASE)
+
+PYPI_NAME:=GitPython
+PKG_HASH:=8ce3bcf69adfdf7c7d503e78fd3b1c492af782d58893b650adb2ac8912ddd573
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=BSD-3-Clause
+
+include $(TOPDIR)/feeds/packages/lang/python/pypi.mk
+include $(INCLUDE_DIR)/package.mk
+include $(TOPDIR)/feeds/packages/lang/python/python3-package.mk
+
+define Package/python3-$(PKG_NAME)
+  SECTION:=lang
+  CATEGORY:=Languages
+  SUBMENU:=Python
+  TITLE:=GitPython is a Python library used to interact with Git repositories
+  URL:=https://github.com/gitpython-developers/GitPython
+  DEPENDS:=+python3 +python3-setuptools +python3-gitdb
+endef
+
+define Package/python3-$(PKG_NAME)/description
+   GitPython is a python library used to interact with git repositories,
+   high-level like git-porcelain, or low-level like git-plumbing.
+endef
+
+$(eval $(call Py3Package,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)-src))
--- a/feed/kmod-i2c-mux-pinctrl/Makefile
+++ b/feed/kmod-i2c-mux-pinctrl/Makefile
@ -0,0 +1,25 @@
+include $(TOPDIR)/rules.mk
+include $(INCLUDE_DIR)/kernel.mk
+
+PKG_NAME:=i2c-mux-pinctrl
+
+include $(INCLUDE_DIR)/package.mk
+
+define KernelPackage/$(PKG_NAME)
+  SUBMENU:=I2C support
+  TITLE:=pinctrl-based I2C multiplexer
+  DEPENDS:=+kmod-i2c-mux
+  AUTOLOAD:=$(call AutoLoad,51,i2c-mux-pinctrl,1)
+  FILES:=$(PKG_BUILD_DIR)/i2c-mux-pinctrl.ko
+  KCONFIG:=
+endef
+
+define KernelPackage/$(PKG_NAME)/description
+ Kernel modules for GENERIC_PINCTRL I2C bus mux/switching devices
+endef
+
+EXTRA_KCONFIG:= CONFIG_I2C_MUX_PINCTRL=m
+
+include ../kmod.mk
+
+$(eval $(call KernelPackage,$(PKG_NAME)))
--- a/feed/kmod-i2c-mux-pinctrl/src/Makefile
+++ b/feed/kmod-i2c-mux-pinctrl/src/Makefile
@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for multiplexer I2C chip drivers.
+
+obj-$(CONFIG_I2C_MUX_GPIO)	+= i2c-mux-gpio.o
+obj-$(CONFIG_I2C_MUX_PINCTRL)	+= i2c-mux-pinctrl.o
+
+ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG
--- a/feed/kmod-i2c-mux-pinctrl/src/gpiolib.h
+++ b/feed/kmod-i2c-mux-pinctrl/src/gpiolib.h
@ -0,0 +1,184 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Internal GPIO functions.
+ *
+ * Copyright (C) 2013, Intel Corporation
+ * Author: Mika Westerberg <mika.westerberg@linux.intel.com>
+ */
+
+#ifndef GPIOLIB_H
+#define GPIOLIB_H
+
+#include <linux/gpio/driver.h>
+#include <linux/gpio/consumer.h> /* for enum gpiod_flags */
+#include <linux/err.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/cdev.h>
+
+#define GPIOCHIP_NAME	"gpiochip"
+
+/**
+ * struct gpio_device - internal state container for GPIO devices
+ * @id: numerical ID number for the GPIO chip
+ * @dev: the GPIO device struct
+ * @chrdev: character device for the GPIO device
+ * @mockdev: class device used by the deprecated sysfs interface (may be
+ * NULL)
+ * @owner: helps prevent removal of modules exporting active GPIOs
+ * @chip: pointer to the corresponding gpiochip, holding static
+ * data for this device
+ * @descs: array of ngpio descriptors.
+ * @ngpio: the number of GPIO lines on this GPIO device, equal to the size
+ * of the @descs array.
+ * @base: GPIO base in the DEPRECATED global Linux GPIO numberspace, assigned
+ * at device creation time.
+ * @label: a descriptive name for the GPIO device, such as the part number
+ * or name of the IP component in a System on Chip.
+ * @data: per-instance data assigned by the driver
+ * @list: links gpio_device:s together for traversal
+ *
+ * This state container holds most of the runtime variable data
+ * for a GPIO device and can hold references and live on after the
+ * GPIO chip has been removed, if it is still being used from
+ * userspace.
+ */
+struct gpio_device {
+	int			id;
+	struct device		dev;
+	struct cdev		chrdev;
+	struct device		*mockdev;
+	struct module		*owner;
+	struct gpio_chip	*chip;
+	struct gpio_desc	*descs;
+	int			base;
+	u16			ngpio;
+	const char		*label;
+	void			*data;
+	struct list_head        list;
+	struct blocking_notifier_head notifier;
+
+#ifdef CONFIG_PINCTRL
+	/*
+	 * If CONFIG_PINCTRL is enabled, then gpio controllers can optionally
+	 * describe the actual pin range which they serve in an SoC. This
+	 * information would be used by pinctrl subsystem to configure
+	 * corresponding pins for gpio usage.
+	 */
+	struct list_head pin_ranges;
+#endif
+};
+
+/* gpio suffixes used for ACPI and device tree lookup */
+static __maybe_unused const char * const gpio_suffixes[] = { "gpios", "gpio" };
+
+struct gpio_array {
+	struct gpio_desc	**desc;
+	unsigned int		size;
+	struct gpio_chip	*chip;
+	unsigned long		*get_mask;
+	unsigned long		*set_mask;
+	unsigned long		invert_mask[];
+};
+
+struct gpio_desc *gpiochip_get_desc(struct gpio_chip *gc, unsigned int hwnum);
+int gpiod_get_array_value_complex(bool raw, bool can_sleep,
+				  unsigned int array_size,
+				  struct gpio_desc **desc_array,
+				  struct gpio_array *array_info,
+				  unsigned long *value_bitmap);
+int gpiod_set_array_value_complex(bool raw, bool can_sleep,
+				  unsigned int array_size,
+				  struct gpio_desc **desc_array,
+				  struct gpio_array *array_info,
+				  unsigned long *value_bitmap);
+
+extern spinlock_t gpio_lock;
+extern struct list_head gpio_devices;
+
+struct gpio_desc {
+	struct gpio_device	*gdev;
+	unsigned long		flags;
+/* flag symbols are bit numbers */
+#define FLAG_REQUESTED	0
+#define FLAG_IS_OUT	1
+#define FLAG_EXPORT	2	/* protected by sysfs_lock */
+#define FLAG_SYSFS	3	/* exported via /sys/class/gpio/control */
+#define FLAG_ACTIVE_LOW	6	/* value has active low */
+#define FLAG_OPEN_DRAIN	7	/* Gpio is open drain type */
+#define FLAG_OPEN_SOURCE 8	/* Gpio is open source type */
+#define FLAG_USED_AS_IRQ 9	/* GPIO is connected to an IRQ */
+#define FLAG_IRQ_IS_ENABLED 10	/* GPIO is connected to an enabled IRQ */
+#define FLAG_IS_HOGGED	11	/* GPIO is hogged */
+#define FLAG_TRANSITORY 12	/* GPIO may lose value in sleep or reset */
+#define FLAG_PULL_UP    13	/* GPIO has pull up enabled */
+#define FLAG_PULL_DOWN  14	/* GPIO has pull down enabled */
+#define FLAG_BIAS_DISABLE    15	/* GPIO has pull disabled */
+#define FLAG_EDGE_RISING     16	/* GPIO CDEV detects rising edge events */
+#define FLAG_EDGE_FALLING    17	/* GPIO CDEV detects falling edge events */
+
+	/* Connection label */
+	const char		*label;
+	/* Name of the GPIO */
+	const char		*name;
+#ifdef CONFIG_OF_DYNAMIC
+	struct device_node	*hog;
+#endif
+#ifdef CONFIG_GPIO_CDEV
+	/* debounce period in microseconds */
+	unsigned int		debounce_period_us;
+#endif
+};
+
+int gpiod_request(struct gpio_desc *desc, const char *label);
+void gpiod_free(struct gpio_desc *desc);
+int gpiod_configure_flags(struct gpio_desc *desc, const char *con_id,
+		unsigned long lflags, enum gpiod_flags dflags);
+int gpiod_hog(struct gpio_desc *desc, const char *name,
+		unsigned long lflags, enum gpiod_flags dflags);
+
+/*
+ * Return the GPIO number of the passed descriptor relative to its chip
+ */
+static inline int gpio_chip_hwgpio(const struct gpio_desc *desc)
+{
+	return desc - &desc->gdev->descs[0];
+}
+
+/* With descriptor prefix */
+
+#define gpiod_emerg(desc, fmt, ...)					       \
+	pr_emerg("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?",\
+		 ##__VA_ARGS__)
+#define gpiod_crit(desc, fmt, ...)					       \
+	pr_crit("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?", \
+		 ##__VA_ARGS__)
+#define gpiod_err(desc, fmt, ...)					       \
+	pr_err("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?",  \
+		 ##__VA_ARGS__)
+#define gpiod_warn(desc, fmt, ...)					       \
+	pr_warn("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?", \
+		 ##__VA_ARGS__)
+#define gpiod_info(desc, fmt, ...)					       \
+	pr_info("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?", \
+		 ##__VA_ARGS__)
+#define gpiod_dbg(desc, fmt, ...)					       \
+	pr_debug("gpio-%d (%s): " fmt, desc_to_gpio(desc), desc->label ? : "?",\
+		 ##__VA_ARGS__)
+
+/* With chip prefix */
+
+#define chip_emerg(gc, fmt, ...)					\
+	dev_emerg(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+#define chip_crit(gc, fmt, ...)					\
+	dev_crit(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+#define chip_err(gc, fmt, ...)					\
+	dev_err(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+#define chip_warn(gc, fmt, ...)					\
+	dev_warn(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+#define chip_info(gc, fmt, ...)					\
+	dev_info(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+#define chip_dbg(gc, fmt, ...)					\
+	dev_dbg(&gc->gpiodev->dev, "(%s): " fmt, gc->label, ##__VA_ARGS__)
+
+#endif /* GPIOLIB_H */
--- a/feed/kmod-i2c-mux-pinctrl/src/i2c-mux-gpio.c
+++ b/feed/kmod-i2c-mux-pinctrl/src/i2c-mux-gpio.c
@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * I2C multiplexer using GPIO API
+ *
+ * Peter Korsgaard <peter.korsgaard@barco.com>
+ */
+
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/platform_data/i2c-mux-gpio.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/bits.h>
+#include <linux/gpio/consumer.h>
+/* FIXME: stop poking around inside gpiolib */
+#include "gpiolib.h"
+
+struct gpiomux {
+	struct i2c_mux_gpio_platform_data data;
+	int ngpios;
+	struct gpio_desc **gpios;
+};
+
+static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val)
+{
+	DECLARE_BITMAP(values, BITS_PER_TYPE(val));
+
+	values[0] = val;
+
+	gpiod_set_array_value_cansleep(mux->ngpios, mux->gpios, NULL, values);
+}
+
+static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan)
+{
+	struct gpiomux *mux = i2c_mux_priv(muxc);
+
+	i2c_mux_gpio_set(mux, chan);
+
+	return 0;
+}
+
+static int i2c_mux_gpio_deselect(struct i2c_mux_core *muxc, u32 chan)
+{
+	struct gpiomux *mux = i2c_mux_priv(muxc);
+
+	i2c_mux_gpio_set(mux, mux->data.idle);
+
+	return 0;
+}
+
+#ifdef CONFIG_OF
+static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
+					struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	struct device_node *adapter_np, *child;
+	struct i2c_adapter *adapter;
+	unsigned *values;
+	int i = 0;
+
+	if (!np)
+		return -ENODEV;
+
+	adapter_np = of_parse_phandle(np, "i2c-parent", 0);
+	if (!adapter_np) {
+		dev_err(&pdev->dev, "Cannot parse i2c-parent\n");
+		return -ENODEV;
+	}
+	adapter = of_find_i2c_adapter_by_node(adapter_np);
+	of_node_put(adapter_np);
+	if (!adapter)
+		return -EPROBE_DEFER;
+
+	mux->data.parent = i2c_adapter_id(adapter);
+	put_device(&adapter->dev);
+
+	mux->data.n_values = of_get_child_count(np);
+
+	values = devm_kcalloc(&pdev->dev,
+			      mux->data.n_values, sizeof(*mux->data.values),
+			      GFP_KERNEL);
+	if (!values) {
+		dev_err(&pdev->dev, "Cannot allocate values array");
+		return -ENOMEM;
+	}
+
+	for_each_child_of_node(np, child) {
+		of_property_read_u32(child, "reg", values + i);
+		i++;
+	}
+	mux->data.values = values;
+
+	if (of_property_read_u32(np, "idle-state", &mux->data.idle))
+		mux->data.idle = I2C_MUX_GPIO_NO_IDLE;
+
+	return 0;
+}
+#else
+static int i2c_mux_gpio_probe_dt(struct gpiomux *mux,
+					struct platform_device *pdev)
+{
+	return 0;
+}
+#endif
+
+static int i2c_mux_gpio_probe(struct platform_device *pdev)
+{
+	struct i2c_mux_core *muxc;
+	struct gpiomux *mux;
+	struct i2c_adapter *parent;
+	struct i2c_adapter *root;
+	unsigned initial_state;
+	int i, ngpios, ret;
+
+	mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL);
+	if (!mux)
+		return -ENOMEM;
+
+	if (!dev_get_platdata(&pdev->dev)) {
+		ret = i2c_mux_gpio_probe_dt(mux, pdev);
+		if (ret < 0)
+			return ret;
+	} else {
+		memcpy(&mux->data, dev_get_platdata(&pdev->dev),
+			sizeof(mux->data));
+	}
+
+	ngpios = gpiod_count(&pdev->dev, "mux");
+	if (ngpios <= 0) {
+		dev_err(&pdev->dev, "no valid gpios provided\n");
+		return ngpios ?: -EINVAL;
+	}
+	mux->ngpios = ngpios;
+
+	parent = i2c_get_adapter(mux->data.parent);
+	if (!parent)
+		return -EPROBE_DEFER;
+
+	muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values,
+			     ngpios * sizeof(*mux->gpios), 0,
+			     i2c_mux_gpio_select, NULL);
+	if (!muxc) {
+		ret = -ENOMEM;
+		goto alloc_failed;
+	}
+	mux->gpios = muxc->priv;
+	muxc->priv = mux;
+
+	platform_set_drvdata(pdev, muxc);
+
+	root = i2c_root_adapter(&parent->dev);
+
+	muxc->mux_locked = true;
+
+	if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) {
+		initial_state = mux->data.idle;
+		muxc->deselect = i2c_mux_gpio_deselect;
+	} else {
+		initial_state = mux->data.values[0];
+	}
+
+	for (i = 0; i < ngpios; i++) {
+		struct device *gpio_dev;
+		struct gpio_desc *gpiod;
+		enum gpiod_flags flag;
+
+		if (initial_state & BIT(i))
+			flag = GPIOD_OUT_HIGH;
+		else
+			flag = GPIOD_OUT_LOW;
+		gpiod = devm_gpiod_get_index(&pdev->dev, "mux", i, flag);
+		if (IS_ERR(gpiod)) {
+			ret = PTR_ERR(gpiod);
+			goto alloc_failed;
+		}
+
+		mux->gpios[i] = gpiod;
+
+		if (!muxc->mux_locked)
+			continue;
+
+		/* FIXME: find a proper way to access the GPIO device */
+		gpio_dev = &gpiod->gdev->dev;
+		muxc->mux_locked = i2c_root_adapter(gpio_dev) == root;
+	}
+
+	if (muxc->mux_locked)
+		dev_info(&pdev->dev, "mux-locked i2c mux\n");
+
+	for (i = 0; i < mux->data.n_values; i++) {
+		u32 nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
+		unsigned int class = mux->data.classes ? mux->data.classes[i] : 0;
+
+		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], class);
+		if (ret)
+			goto add_adapter_failed;
+	}
+
+	dev_info(&pdev->dev, "%d port mux on %s adapter\n",
+		 mux->data.n_values, parent->name);
+
+	return 0;
+
+add_adapter_failed:
+	i2c_mux_del_adapters(muxc);
+alloc_failed:
+	i2c_put_adapter(parent);
+
+	return ret;
+}
+
+static int i2c_mux_gpio_remove(struct platform_device *pdev)
+{
+	struct i2c_mux_core *muxc = platform_get_drvdata(pdev);
+
+	i2c_mux_del_adapters(muxc);
+	i2c_put_adapter(muxc->parent);
+
+	return 0;
+}
+
+static const struct of_device_id i2c_mux_gpio_of_match[] = {
+	{ .compatible = "i2c-mux-gpio", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_mux_gpio_of_match);
+
+static struct platform_driver i2c_mux_gpio_driver = {
+	.probe	= i2c_mux_gpio_probe,
+	.remove	= i2c_mux_gpio_remove,
+	.driver	= {
+		.name	= "i2c-mux-gpio",
+		.of_match_table = i2c_mux_gpio_of_match,
+	},
+};
+
+module_platform_driver(i2c_mux_gpio_driver);
+
+MODULE_DESCRIPTION("GPIO-based I2C multiplexer driver");
+MODULE_AUTHOR("Peter Korsgaard <peter.korsgaard@barco.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:i2c-mux-gpio");
--- a/feed/kmod-i2c-mux-pinctrl/src/i2c-mux-pinctrl.c
+++ b/feed/kmod-i2c-mux-pinctrl/src/i2c-mux-pinctrl.c
@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * I2C multiplexer using pinctrl API
+ *
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ */
+
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/module.h>
+#include <linux/pinctrl/consumer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include "pinctrl_core.h"
+
+struct i2c_mux_pinctrl {
+	struct pinctrl *pinctrl;
+	struct pinctrl_state *states[];
+};
+
+static int i2c_mux_pinctrl_select(struct i2c_mux_core *muxc, u32 chan)
+{
+	struct i2c_mux_pinctrl *mux = i2c_mux_priv(muxc);
+
+	return pinctrl_select_state(mux->pinctrl, mux->states[chan]);
+}
+
+static int i2c_mux_pinctrl_deselect(struct i2c_mux_core *muxc, u32 chan)
+{
+	return i2c_mux_pinctrl_select(muxc, muxc->num_adapters);
+}
+
+static struct i2c_adapter *i2c_mux_pinctrl_root_adapter(
+	struct pinctrl_state *state)
+{
+	struct i2c_adapter *root = NULL;
+	struct pinctrl_setting *setting;
+	struct i2c_adapter *pin_root;
+
+	list_for_each_entry(setting, &state->settings, node) {
+		pin_root = i2c_root_adapter(setting->pctldev->dev);
+		if (!pin_root)
+			return NULL;
+		if (!root)
+			root = pin_root;
+		else if (root != pin_root)
+			return NULL;
+	}
+
+	return root;
+}
+
+static struct i2c_adapter *i2c_mux_pinctrl_parent_adapter(struct device *dev)
+{
+	struct device_node *np = dev->of_node;
+	struct device_node *parent_np;
+	struct i2c_adapter *parent;
+
+	parent_np = of_parse_phandle(np, "i2c-parent", 0);
+	if (!parent_np) {
+		dev_err(dev, "Cannot parse i2c-parent\n");
+		return ERR_PTR(-ENODEV);
+	}
+	parent = of_find_i2c_adapter_by_node(parent_np);
+	of_node_put(parent_np);
+	if (!parent)
+		return ERR_PTR(-EPROBE_DEFER);
+
+	return parent;
+}
+
+static int i2c_mux_pinctrl_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct device_node *np = dev->of_node;
+	struct i2c_mux_core *muxc;
+	struct i2c_mux_pinctrl *mux;
+	struct i2c_adapter *parent;
+	struct i2c_adapter *root;
+	int num_names, i, ret;
+	const char *name;
+
+	num_names = of_property_count_strings(np, "pinctrl-names");
+	if (num_names < 0) {
+		dev_err(dev, "Cannot parse pinctrl-names: %d\n",
+			num_names);
+		return num_names;
+	}
+
+	parent = i2c_mux_pinctrl_parent_adapter(dev);
+	if (IS_ERR(parent))
+		return PTR_ERR(parent);
+
+	muxc = i2c_mux_alloc(parent, dev, num_names,
+			     struct_size(mux, states, num_names),
+			     0, i2c_mux_pinctrl_select, NULL);
+	if (!muxc) {
+		ret = -ENOMEM;
+		goto err_put_parent;
+	}
+	mux = i2c_mux_priv(muxc);
+
+	platform_set_drvdata(pdev, muxc);
+
+	mux->pinctrl = devm_pinctrl_get(dev);
+	if (IS_ERR(mux->pinctrl)) {
+		ret = PTR_ERR(mux->pinctrl);
+		dev_err(dev, "Cannot get pinctrl: %d\n", ret);
+		goto err_put_parent;
+	}
+
+	for (i = 0; i < num_names; i++) {
+		ret = of_property_read_string_index(np, "pinctrl-names", i,
+						    &name);
+		if (ret < 0) {
+			dev_err(dev, "Cannot parse pinctrl-names: %d\n", ret);
+			goto err_put_parent;
+		}
+
+		mux->states[i] = pinctrl_lookup_state(mux->pinctrl, name);
+		if (IS_ERR(mux->states[i])) {
+			ret = PTR_ERR(mux->states[i]);
+			dev_err(dev, "Cannot look up pinctrl state %s: %d\n",
+				name, ret);
+			goto err_put_parent;
+		}
+
+		if (strcmp(name, "idle"))
+			continue;
+
+		if (i != num_names - 1) {
+			dev_err(dev, "idle state must be last\n");
+			ret = -EINVAL;
+			goto err_put_parent;
+		}
+		muxc->deselect = i2c_mux_pinctrl_deselect;
+	}
+
+	root = i2c_root_adapter(&muxc->parent->dev);
+
+	muxc->mux_locked = true;
+	for (i = 0; i < num_names; i++) {
+		if (root != i2c_mux_pinctrl_root_adapter(mux->states[i])) {
+			muxc->mux_locked = false;
+			break;
+		}
+	}
+	if (muxc->mux_locked)
+		dev_info(dev, "mux-locked i2c mux\n");
+
+	/* Do not add any adapter for the idle state (if it's there at all). */
+	for (i = 0; i < num_names - !!muxc->deselect; i++) {
+		ret = i2c_mux_add_adapter(muxc, 0, i, 0);
+		if (ret)
+			goto err_del_adapter;
+	}
+
+	return 0;
+
+err_del_adapter:
+	i2c_mux_del_adapters(muxc);
+err_put_parent:
+	i2c_put_adapter(parent);
+
+	return ret;
+}
+
+static int i2c_mux_pinctrl_remove(struct platform_device *pdev)
+{
+	struct i2c_mux_core *muxc = platform_get_drvdata(pdev);
+
+	i2c_mux_del_adapters(muxc);
+	i2c_put_adapter(muxc->parent);
+
+	return 0;
+}
+
+static const struct of_device_id i2c_mux_pinctrl_of_match[] = {
+	{ .compatible = "i2c-mux-pinctrl", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, i2c_mux_pinctrl_of_match);
+
+static struct platform_driver i2c_mux_pinctrl_driver = {
+	.driver	= {
+		.name	= "i2c-mux-pinctrl",
+		.of_match_table = of_match_ptr(i2c_mux_pinctrl_of_match),
+	},
+	.probe	= i2c_mux_pinctrl_probe,
+	.remove	= i2c_mux_pinctrl_remove,
+};
+module_platform_driver(i2c_mux_pinctrl_driver);
+
+MODULE_DESCRIPTION("pinctrl-based I2C multiplexer driver");
+MODULE_AUTHOR("Stephen Warren <swarren@nvidia.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:i2c-mux-pinctrl");
--- a/feed/kmod-i2c-mux-pinctrl/src/pinctrl_core.h
+++ b/feed/kmod-i2c-mux-pinctrl/src/pinctrl_core.h
@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Core private header for the pin control subsystem
+ *
+ * Copyright (C) 2011 ST-Ericsson SA
+ * Written on behalf of Linaro for ST-Ericsson
+ *
+ * Author: Linus Walleij <linus.walleij@linaro.org>
+ */
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/pinctrl/pinconf.h>
+#include <linux/pinctrl/machine.h>
+
+struct pinctrl_gpio_range;
+
+/**
+ * struct pinctrl_dev - pin control class device
+ * @node: node to include this pin controller in the global pin controller list
+ * @desc: the pin controller descriptor supplied when initializing this pin
+ *	controller
+ * @pin_desc_tree: each pin descriptor for this pin controller is stored in
+ *	this radix tree
+ * @pin_group_tree: optionally each pin group can be stored in this radix tree
+ * @num_groups: optionally number of groups can be kept here
+ * @pin_function_tree: optionally each function can be stored in this radix tree
+ * @num_functions: optionally number of functions can be kept here
+ * @gpio_ranges: a list of GPIO ranges that is handled by this pin controller,
+ *	ranges are added to this list at runtime
+ * @dev: the device entry for this pin controller
+ * @owner: module providing the pin controller, used for refcounting
+ * @driver_data: driver data for drivers registering to the pin controller
+ *	subsystem
+ * @p: result of pinctrl_get() for this device
+ * @hog_default: default state for pins hogged by this device
+ * @hog_sleep: sleep state for pins hogged by this device
+ * @mutex: mutex taken on each pin controller specific action
+ * @device_root: debugfs root for this device
+ */
+struct pinctrl_dev {
+	struct list_head node;
+	struct pinctrl_desc *desc;
+	struct radix_tree_root pin_desc_tree;
+#ifdef CONFIG_GENERIC_PINCTRL_GROUPS
+	struct radix_tree_root pin_group_tree;
+	unsigned int num_groups;
+#endif
+#ifdef CONFIG_GENERIC_PINMUX_FUNCTIONS
+	struct radix_tree_root pin_function_tree;
+	unsigned int num_functions;
+#endif
+	struct list_head gpio_ranges;
+	struct device *dev;
+	struct module *owner;
+	void *driver_data;
+	struct pinctrl *p;
+	struct pinctrl_state *hog_default;
+	struct pinctrl_state *hog_sleep;
+	struct mutex mutex;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *device_root;
+#endif
+};
+
+/**
+ * struct pinctrl - per-device pin control state holder
+ * @node: global list node
+ * @dev: the device using this pin control handle
+ * @states: a list of states for this device
+ * @state: the current state
+ * @dt_maps: the mapping table chunks dynamically parsed from device tree for
+ *	this device, if any
+ * @users: reference count
+ */
+struct pinctrl {
+	struct list_head node;
+	struct device *dev;
+	struct list_head states;
+	struct pinctrl_state *state;
+	struct list_head dt_maps;
+	struct kref users;
+};
+
+/**
+ * struct pinctrl_state - a pinctrl state for a device
+ * @node: list node for struct pinctrl's @states field
+ * @name: the name of this state
+ * @settings: a list of settings for this state
+ */
+struct pinctrl_state {
+	struct list_head node;
+	const char *name;
+	struct list_head settings;
+};
+
+/**
+ * struct pinctrl_setting_mux - setting data for MAP_TYPE_MUX_GROUP
+ * @group: the group selector to program
+ * @func: the function selector to program
+ */
+struct pinctrl_setting_mux {
+	unsigned group;
+	unsigned func;
+};
+
+/**
+ * struct pinctrl_setting_configs - setting data for MAP_TYPE_CONFIGS_*
+ * @group_or_pin: the group selector or pin ID to program
+ * @configs: a pointer to an array of config parameters/values to program into
+ *	hardware. Each individual pin controller defines the format and meaning
+ *	of config parameters.
+ * @num_configs: the number of entries in array @configs
+ */
+struct pinctrl_setting_configs {
+	unsigned group_or_pin;
+	unsigned long *configs;
+	unsigned num_configs;
+};
+
+/**
+ * struct pinctrl_setting - an individual mux or config setting
+ * @node: list node for struct pinctrl_settings's @settings field
+ * @type: the type of setting
+ * @pctldev: pin control device handling to be programmed. Not used for
+ *   PIN_MAP_TYPE_DUMMY_STATE.
+ * @dev_name: the name of the device using this state
+ * @data: Data specific to the setting type
+ */
+struct pinctrl_setting {
+	struct list_head node;
+	enum pinctrl_map_type type;
+	struct pinctrl_dev *pctldev;
+	const char *dev_name;
+	union {
+		struct pinctrl_setting_mux mux;
+		struct pinctrl_setting_configs configs;
+	} data;
+};
+
+/**
+ * struct pin_desc - pin descriptor for each physical pin in the arch
+ * @pctldev: corresponding pin control device
+ * @name: a name for the pin, e.g. the name of the pin/pad/finger on a
+ *	datasheet or such
+ * @dynamic_name: if the name of this pin was dynamically allocated
+ * @drv_data: driver-defined per-pin data. pinctrl core does not touch this
+ * @mux_usecount: If zero, the pin is not claimed, and @owner should be NULL.
+ *	If non-zero, this pin is claimed by @owner. This field is an integer
+ *	rather than a boolean, since pinctrl_get() might process multiple
+ *	mapping table entries that refer to, and hence claim, the same group
+ *	or pin, and each of these will increment the @usecount.
+ * @mux_owner: The name of device that called pinctrl_get().
+ * @mux_setting: The most recent selected mux setting for this pin, if any.
+ * @gpio_owner: If pinctrl_gpio_request() was called for this pin, this is
+ *	the name of the GPIO that "owns" this pin.
+ */
+struct pin_desc {
+	struct pinctrl_dev *pctldev;
+	const char *name;
+	bool dynamic_name;
+	void *drv_data;
+	/* These fields only added when supporting pinmux drivers */
+#ifdef CONFIG_PINMUX
+	unsigned mux_usecount;
+	const char *mux_owner;
+	const struct pinctrl_setting_mux *mux_setting;
+	const char *gpio_owner;
+#endif
+};
+
+/**
+ * struct pinctrl_maps - a list item containing part of the mapping table
+ * @node: mapping table list node
+ * @maps: array of mapping table entries
+ * @num_maps: the number of entries in @maps
+ */
+struct pinctrl_maps {
+	struct list_head node;
+	const struct pinctrl_map *maps;
+	unsigned num_maps;
+};
+
+#ifdef CONFIG_GENERIC_PINCTRL_GROUPS
+
+/**
+ * struct group_desc - generic pin group descriptor
+ * @name: name of the pin group
+ * @pins: array of pins that belong to the group
+ * @num_pins: number of pins in the group
+ * @data: pin controller driver specific data
+ */
+struct group_desc {
+	const char *name;
+	int *pins;
+	int num_pins;
+	void *data;
+};
+
+int pinctrl_generic_get_group_count(struct pinctrl_dev *pctldev);
+
+const char *pinctrl_generic_get_group_name(struct pinctrl_dev *pctldev,
+					   unsigned int group_selector);
+
+int pinctrl_generic_get_group_pins(struct pinctrl_dev *pctldev,
+				   unsigned int group_selector,
+				   const unsigned int **pins,
+				   unsigned int *npins);
+
+struct group_desc *pinctrl_generic_get_group(struct pinctrl_dev *pctldev,
+					     unsigned int group_selector);
+
+int pinctrl_generic_add_group(struct pinctrl_dev *pctldev, const char *name,
+			      int *gpins, int ngpins, void *data);
+
+int pinctrl_generic_remove_group(struct pinctrl_dev *pctldev,
+				 unsigned int group_selector);
+
+#endif	/* CONFIG_GENERIC_PINCTRL_GROUPS */
+
+struct pinctrl_dev *get_pinctrl_dev_from_devname(const char *dev_name);
+struct pinctrl_dev *get_pinctrl_dev_from_of_node(struct device_node *np);
+int pin_get_from_name(struct pinctrl_dev *pctldev, const char *name);
+const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin);
+int pinctrl_get_group_selector(struct pinctrl_dev *pctldev,
+			       const char *pin_group);
+
+static inline struct pin_desc *pin_desc_get(struct pinctrl_dev *pctldev,
+					    unsigned int pin)
+{
+	return radix_tree_lookup(&pctldev->pin_desc_tree, pin);
+}
+
+extern struct pinctrl_gpio_range *
+pinctrl_find_gpio_range_from_pin_nolock(struct pinctrl_dev *pctldev,
+					unsigned int pin);
+
+extern int pinctrl_force_sleep(struct pinctrl_dev *pctldev);
+extern int pinctrl_force_default(struct pinctrl_dev *pctldev);
+
+extern struct mutex pinctrl_maps_mutex;
+extern struct list_head pinctrl_maps;
+
+#define for_each_maps(_maps_node_, _i_, _map_) \
+	list_for_each_entry(_maps_node_, &pinctrl_maps, node) \
+		for (_i_ = 0, _map_ = &_maps_node_->maps[_i_]; \
+			_i_ < _maps_node_->num_maps; \
+			_i_++, _map_ = &_maps_node_->maps[_i_])
--- a/feed/kmod-nvme/Makefile
+++ b/feed/kmod-nvme/Makefile
@ -0,0 +1,32 @@
+include $(TOPDIR)/rules.mk
+include $(INCLUDE_DIR)/kernel.mk
+
+PKG_NAME:=nvme
+
+include $(INCLUDE_DIR)/package.mk
+
+define KernelPackage/$(PKG_NAME)
+  SUBMENU:=$(BLOCK_MENU)
+  TITLE:=NVM Express block device
+  DEPENDS:=@PCI_SUPPORT
+  FILES:= \
+	$(PKG_BUILD_DIR)/nvme-core.ko \
+	$(PKG_BUILD_DIR)/nvme.ko
+  AUTOLOAD:=$(call AutoLoad,30,nvme-core nvme)
+  KCONFIG:=
+endef
+
+define KernelPackage/nvme/description
+  Kernel module for NVM Express solid state drives directly
+  connected to the PCI or PCI Express bus.
+endef
+
+EXTRA_KCONFIG:= \
+	CONFIG_NVME_CORE=m \
+	CONFIG_BLK_DEV_NVME=m \
+	CONFIG_NVME_MULTIPATH=n \
+	CONFIG_NVME_HWMON=n
+
+include ../kmod.mk
+
+$(eval $(call KernelPackage,$(PKG_NAME)))
--- a/feed/kmod-nvme/src/Makefile
+++ b/feed/kmod-nvme/src/Makefile
@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y				+= -I$(src)
+
+obj-$(CONFIG_NVME_CORE)			+= nvme-core.o
+obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
+obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
+obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
+obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
+obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
+
+nvme-core-y				:= core.o
+nvme-core-$(CONFIG_TRACING)		+= trace.o
+nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o
+nvme-core-$(CONFIG_NVM)			+= lightnvm.o
+nvme-core-$(CONFIG_BLK_DEV_ZONED)	+= zns.o
+nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS)	+= fault_inject.o
+nvme-core-$(CONFIG_NVME_HWMON)		+= hwmon.o
+
+nvme-y					+= pci.o
+
+nvme-fabrics-y				+= fabrics.o
+
+nvme-rdma-y				+= rdma.o
+
+nvme-fc-y				+= fc.o
+
+nvme-tcp-y				+= tcp.o
--- a/feed/kmod-nvme/src/core.c
+++ b/feed/kmod-nvme/src/core.c
@ -0,0 +1,4831 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/pr.h>
+#include <linux/ptrace.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/pm_qos.h>
+#include <asm/unaligned.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#define NVME_MINORS		(1U << MINORBITS)
+
+unsigned int admin_timeout = 60;
+module_param(admin_timeout, uint, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
+EXPORT_SYMBOL_GPL(admin_timeout);
+
+unsigned int nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
+MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
+EXPORT_SYMBOL_GPL(nvme_io_timeout);
+
+static unsigned char shutdown_timeout = 5;
+module_param(shutdown_timeout, byte, 0644);
+MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
+
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
+MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
+
+static unsigned long default_ps_max_latency_us = 100000;
+module_param(default_ps_max_latency_us, ulong, 0644);
+MODULE_PARM_DESC(default_ps_max_latency_us,
+		 "max power saving latency for new devices; use PM QOS to change per device");
+
+static bool force_apst;
+module_param(force_apst, bool, 0644);
+MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
+
+static bool streams;
+module_param(streams, bool, 0644);
+MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+
+/*
+ * nvme_wq - hosts nvme related works that are not reset or delete
+ * nvme_reset_wq - hosts nvme reset works
+ * nvme_delete_wq - hosts nvme delete works
+ *
+ * nvme_wq will host works such as scan, aen handling, fw activation,
+ * keep-alive, periodic reconnects etc. nvme_reset_wq
+ * runs reset works which also flush works hosted on nvme_wq for
+ * serialization purposes. nvme_delete_wq host controller deletion
+ * works which flush reset works for serialization.
+ */
+struct workqueue_struct *nvme_wq;
+EXPORT_SYMBOL_GPL(nvme_wq);
+
+struct workqueue_struct *nvme_reset_wq;
+EXPORT_SYMBOL_GPL(nvme_reset_wq);
+
+struct workqueue_struct *nvme_delete_wq;
+EXPORT_SYMBOL_GPL(nvme_delete_wq);
+
+static LIST_HEAD(nvme_subsystems);
+static DEFINE_MUTEX(nvme_subsystems_lock);
+
+static DEFINE_IDA(nvme_instance_ida);
+static dev_t nvme_chr_devt;
+static struct class *nvme_class;
+static struct class *nvme_subsys_class;
+
+static void nvme_put_subsystem(struct nvme_subsystem *subsys);
+static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
+					   unsigned nsid);
+
+static void nvme_update_bdev_size(struct gendisk *disk)
+{
+	struct block_device *bdev = bdget_disk(disk, 0);
+
+	if (bdev) {
+		bd_set_nr_sectors(bdev, get_capacity(disk));
+		bdput(bdev);
+	}
+}
+
+/*
+ * Prepare a queue for teardown.
+ *
+ * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
+ * the capacity to 0 after that to avoid blocking dispatchers that may be
+ * holding bd_butex.  This will end buffered writers dirtying pages that can't
+ * be synced.
+ */
+static void nvme_set_queue_dying(struct nvme_ns *ns)
+{
+	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+		return;
+
+	blk_set_queue_dying(ns->queue);
+	blk_mq_unquiesce_queue(ns->queue);
+
+	set_capacity(ns->disk, 0);
+	nvme_update_bdev_size(ns->disk);
+}
+
+static void nvme_queue_scan(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * Only new queue scan work when admin and IO queues are both alive
+	 */
+	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
+		queue_work(nvme_wq, &ctrl->scan_work);
+}
+
+/*
+ * Use this function to proceed with scheduling reset_work for a controller
+ * that had previously been set to the resetting state. This is intended for
+ * code paths that can't be interrupted by other reset attempts. A hot removal
+ * may prevent this from succeeding.
+ */
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->state != NVME_CTRL_RESETTING)
+		return -EBUSY;
+	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
+		return -EBUSY;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
+
+int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
+{
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		return -EBUSY;
+	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
+		return -EBUSY;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
+
+int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+{
+	int ret;
+
+	ret = nvme_reset_ctrl(ctrl);
+	if (!ret) {
+		flush_work(&ctrl->reset_work);
+		if (ctrl->state != NVME_CTRL_LIVE)
+			ret = -ENETRESET;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
+
+static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	dev_info(ctrl->device,
+		 "Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
+
+	flush_work(&ctrl->reset_work);
+	nvme_stop_ctrl(ctrl);
+	nvme_remove_namespaces(ctrl);
+	ctrl->ops->delete_ctrl(ctrl);
+	nvme_uninit_ctrl(ctrl);
+}
+
+static void nvme_delete_ctrl_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, delete_work);
+
+	nvme_do_delete_ctrl(ctrl);
+}
+
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+		return -EBUSY;
+	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
+		return -EBUSY;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
+
+static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * Keep a reference until nvme_do_delete_ctrl() complete,
+	 * since ->delete_ctrl can free the controller.
+	 */
+	nvme_get_ctrl(ctrl);
+	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+		nvme_do_delete_ctrl(ctrl);
+	nvme_put_ctrl(ctrl);
+}
+
+static blk_status_t nvme_error_status(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_SUCCESS:
+		return BLK_STS_OK;
+	case NVME_SC_CAP_EXCEEDED:
+		return BLK_STS_NOSPC;
+	case NVME_SC_LBA_RANGE:
+	case NVME_SC_CMD_INTERRUPTED:
+	case NVME_SC_NS_NOT_READY:
+		return BLK_STS_TARGET;
+	case NVME_SC_BAD_ATTRIBUTES:
+	case NVME_SC_ONCS_NOT_SUPPORTED:
+	case NVME_SC_INVALID_OPCODE:
+	case NVME_SC_INVALID_FIELD:
+	case NVME_SC_INVALID_NS:
+		return BLK_STS_NOTSUPP;
+	case NVME_SC_WRITE_FAULT:
+	case NVME_SC_READ_ERROR:
+	case NVME_SC_UNWRITTEN_BLOCK:
+	case NVME_SC_ACCESS_DENIED:
+	case NVME_SC_READ_ONLY:
+	case NVME_SC_COMPARE_FAILED:
+		return BLK_STS_MEDIUM;
+	case NVME_SC_GUARD_CHECK:
+	case NVME_SC_APPTAG_CHECK:
+	case NVME_SC_REFTAG_CHECK:
+	case NVME_SC_INVALID_PI:
+		return BLK_STS_PROTECTION;
+	case NVME_SC_RESERVATION_CONFLICT:
+		return BLK_STS_NEXUS;
+	case NVME_SC_HOST_PATH_ERROR:
+		return BLK_STS_TRANSPORT;
+	case NVME_SC_ZONE_TOO_MANY_ACTIVE:
+		return BLK_STS_ZONE_ACTIVE_RESOURCE;
+	case NVME_SC_ZONE_TOO_MANY_OPEN:
+		return BLK_STS_ZONE_OPEN_RESOURCE;
+	default:
+		return BLK_STS_IOERR;
+	}
+}
+
+static void nvme_retry_req(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	unsigned long delay = 0;
+	u16 crd;
+
+	/* The mask and shift result must be <= 3 */
+	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+	if (ns && crd)
+		delay = ns->ctrl->crdt[crd - 1] * 100;
+
+	nvme_req(req)->retries++;
+	blk_mq_requeue_request(req, false);
+	blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
+enum nvme_disposition {
+	COMPLETE,
+	RETRY,
+	FAILOVER,
+};
+
+static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
+{
+	if (likely(nvme_req(req)->status == 0))
+		return COMPLETE;
+
+	if (blk_noretry_request(req) ||
+	    (nvme_req(req)->status & NVME_SC_DNR) ||
+	    nvme_req(req)->retries >= nvme_max_retries)
+		return COMPLETE;
+
+	if (req->cmd_flags & REQ_NVME_MPATH) {
+		if (nvme_is_path_error(nvme_req(req)->status) ||
+		    blk_queue_dying(req->q))
+			return FAILOVER;
+	} else {
+		if (blk_queue_dying(req->q))
+			return COMPLETE;
+	}
+
+	return RETRY;
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
+
+	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+	    req_op(req) == REQ_OP_ZONE_APPEND)
+		req->__sector = nvme_lba_to_sect(req->q->queuedata,
+			le64_to_cpu(nvme_req(req)->result.u64));
+
+	nvme_trace_bio_complete(req, status);
+	blk_mq_end_request(req, status);
+}
+
+void nvme_complete_rq(struct request *req)
+{
+	trace_nvme_complete_rq(req);
+	nvme_cleanup_cmd(req);
+
+	if (nvme_req(req)->ctrl->kas)
+		nvme_req(req)->ctrl->comp_seen = true;
+
+	switch (nvme_decide_disposition(req)) {
+	case COMPLETE:
+		nvme_end_req(req);
+		return;
+	case RETRY:
+		nvme_retry_req(req);
+		return;
+	case FAILOVER:
+		nvme_failover_req(req);
+		return;
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
+{
+	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
+				"Cancelling I/O %d", req->tag);
+
+	/* don't abort one completed request */
+	if (blk_mq_request_completed(req))
+		return true;
+
+	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
+	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+	blk_mq_complete_request(req);
+	return true;
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_request);
+
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->tagset) {
+		blk_mq_tagset_busy_iter(ctrl->tagset,
+				nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->tagset);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
+
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->admin_tagset) {
+		blk_mq_tagset_busy_iter(ctrl->admin_tagset,
+				nvme_cancel_request, ctrl);
+		blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
+
+bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
+		enum nvme_ctrl_state new_state)
+{
+	enum nvme_ctrl_state old_state;
+	unsigned long flags;
+	bool changed = false;
+
+	spin_lock_irqsave(&ctrl->lock, flags);
+
+	old_state = ctrl->state;
+	switch (new_state) {
+	case NVME_CTRL_LIVE:
+		switch (old_state) {
+		case NVME_CTRL_NEW:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	case NVME_CTRL_RESETTING:
+		switch (old_state) {
+		case NVME_CTRL_NEW:
+		case NVME_CTRL_LIVE:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	case NVME_CTRL_CONNECTING:
+		switch (old_state) {
+		case NVME_CTRL_NEW:
+		case NVME_CTRL_RESETTING:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	case NVME_CTRL_DELETING:
+		switch (old_state) {
+		case NVME_CTRL_LIVE:
+		case NVME_CTRL_RESETTING:
+		case NVME_CTRL_CONNECTING:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	case NVME_CTRL_DELETING_NOIO:
+		switch (old_state) {
+		case NVME_CTRL_DELETING:
+		case NVME_CTRL_DEAD:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	case NVME_CTRL_DEAD:
+		switch (old_state) {
+		case NVME_CTRL_DELETING:
+			changed = true;
+			fallthrough;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (changed) {
+		ctrl->state = new_state;
+		wake_up_all(&ctrl->state_wq);
+	}
+
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	if (changed && ctrl->state == NVME_CTRL_LIVE)
+		nvme_kick_requeue_lists(ctrl);
+	return changed;
+}
+EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
+
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+	switch (ctrl->state) {
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_LIVE:
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	case NVME_CTRL_DELETING:
+	case NVME_CTRL_DELETING_NOIO:
+	case NVME_CTRL_DEAD:
+		return true;
+	default:
+		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+		return true;
+	}
+}
+
+/*
+ * Waits for the controller state to be resetting, or returns false if it is
+ * not possible to ever transition to that state.
+ */
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
+{
+	wait_event(ctrl->state_wq,
+		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
+		   nvme_state_terminal(ctrl));
+	return ctrl->state == NVME_CTRL_RESETTING;
+}
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
+
+static void nvme_free_ns_head(struct kref *ref)
+{
+	struct nvme_ns_head *head =
+		container_of(ref, struct nvme_ns_head, ref);
+
+	nvme_mpath_remove_disk(head);
+	ida_simple_remove(&head->subsys->ns_ida, head->instance);
+	cleanup_srcu_struct(&head->srcu);
+	nvme_put_subsystem(head->subsys);
+	kfree(head);
+}
+
+static void nvme_put_ns_head(struct nvme_ns_head *head)
+{
+	kref_put(&head->ref, nvme_free_ns_head);
+}
+
+static void nvme_free_ns(struct kref *kref)
+{
+	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+	if (ns->ndev)
+		nvme_nvm_unregister(ns);
+
+	put_disk(ns->disk);
+	nvme_put_ns_head(ns->head);
+	nvme_put_ctrl(ns->ctrl);
+	kfree(ns);
+}
+
+void nvme_put_ns(struct nvme_ns *ns)
+{
+	kref_put(&ns->kref, nvme_free_ns);
+}
+EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU);
+
+static inline void nvme_clear_nvme_request(struct request *req)
+{
+	nvme_req(req)->retries = 0;
+	nvme_req(req)->flags = 0;
+	req->rq_flags |= RQF_DONTPREP;
+}
+
+static inline unsigned int nvme_req_op(struct nvme_command *cmd)
+{
+	return nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
+}
+
+static inline void nvme_init_request(struct request *req,
+		struct nvme_command *cmd)
+{
+	if (req->q->queuedata)
+		req->timeout = NVME_IO_TIMEOUT;
+	else /* no queuedata implies admin queue */
+		req->timeout = ADMIN_TIMEOUT;
+
+	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	nvme_clear_nvme_request(req);
+	nvme_req(req)->cmd = cmd;
+}
+
+struct request *nvme_alloc_request(struct request_queue *q,
+		struct nvme_command *cmd, blk_mq_req_flags_t flags)
+{
+	struct request *req;
+
+	req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags);
+	if (!IS_ERR(req))
+		nvme_init_request(req, cmd);
+	return req;
+}
+EXPORT_SYMBOL_GPL(nvme_alloc_request);
+
+struct request *nvme_alloc_request_qid(struct request_queue *q,
+		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
+{
+	struct request *req;
+
+	req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags,
+			qid ? qid - 1 : 0);
+	if (!IS_ERR(req))
+		nvme_init_request(req, cmd);
+	return req;
+}
+EXPORT_SYMBOL_GPL(nvme_alloc_request_qid);
+
+static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+
+	c.directive.opcode = nvme_admin_directive_send;
+	c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
+	c.directive.doper = NVME_DIR_SND_ID_OP_ENABLE;
+	c.directive.dtype = NVME_DIR_IDENTIFY;
+	c.directive.tdtype = NVME_DIR_STREAMS;
+	c.directive.endir = enable ? NVME_DIR_ENDIR : 0;
+
+	return nvme_submit_sync_cmd(ctrl->admin_q, &c, NULL, 0);
+}
+
+static int nvme_disable_streams(struct nvme_ctrl *ctrl)
+{
+	return nvme_toggle_streams(ctrl, false);
+}
+
+static int nvme_enable_streams(struct nvme_ctrl *ctrl)
+{
+	return nvme_toggle_streams(ctrl, true);
+}
+
+static int nvme_get_stream_params(struct nvme_ctrl *ctrl,
+				  struct streams_directive_params *s, u32 nsid)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	memset(s, 0, sizeof(*s));
+
+	c.directive.opcode = nvme_admin_directive_recv;
+	c.directive.nsid = cpu_to_le32(nsid);
+	c.directive.numd = cpu_to_le32(nvme_bytes_to_numd(sizeof(*s)));
+	c.directive.doper = NVME_DIR_RCV_ST_OP_PARAM;
+	c.directive.dtype = NVME_DIR_STREAMS;
+
+	return nvme_submit_sync_cmd(ctrl->admin_q, &c, s, sizeof(*s));
+}
+
+static int nvme_configure_directives(struct nvme_ctrl *ctrl)
+{
+	struct streams_directive_params s;
+	int ret;
+
+	if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES))
+		return 0;
+	if (!streams)
+		return 0;
+
+	ret = nvme_enable_streams(ctrl);
+	if (ret)
+		return ret;
+
+	ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
+	if (ret)
+		goto out_disable_stream;
+
+	ctrl->nssa = le16_to_cpu(s.nssa);
+	if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) {
+		dev_info(ctrl->device, "too few streams (%u) available\n",
+					ctrl->nssa);
+		goto out_disable_stream;
+	}
+
+	ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+	dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
+	return 0;
+
+out_disable_stream:
+	nvme_disable_streams(ctrl);
+	return ret;
+}
+
+/*
+ * Check if 'req' has a write hint associated with it. If it does, assign
+ * a valid namespace stream to the write.
+ */
+static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
+				     struct request *req, u16 *control,
+				     u32 *dsmgmt)
+{
+	enum rw_hint streamid = req->write_hint;
+
+	if (streamid == WRITE_LIFE_NOT_SET || streamid == WRITE_LIFE_NONE)
+		streamid = 0;
+	else {
+		streamid--;
+		if (WARN_ON_ONCE(streamid > ctrl->nr_streams))
+			return;
+
+		*control |= NVME_RW_DTYPE_STREAMS;
+		*dsmgmt |= streamid << 16;
+	}
+
+	if (streamid < ARRAY_SIZE(req->q->write_hints))
+		req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9;
+}
+
+static inline void nvme_setup_passthrough(struct request *req,
+		struct nvme_command *cmd)
+{
+	memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
+	/* passthru commands should let the driver set the SGL flags */
+	cmd->common.flags &= ~NVME_CMD_SGL_ALL;
+}
+
+static inline void nvme_setup_flush(struct nvme_ns *ns,
+		struct nvme_command *cmnd)
+{
+	cmnd->common.opcode = nvme_cmd_flush;
+	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
+}
+
+static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
+		struct nvme_command *cmnd)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req), n = 0;
+	struct nvme_dsm_range *range;
+	struct bio *bio;
+
+	/*
+	 * Some devices do not consider the DSM 'Number of Ranges' field when
+	 * determining how much data to DMA. Always allocate memory for maximum
+	 * number of segments to prevent device reading beyond end of buffer.
+	 */
+	static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
+
+	range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!range) {
+		/*
+		 * If we fail allocation our range, fallback to the controller
+		 * discard page. If that's also busy, it's safe to return
+		 * busy, as we know we can make progress once that's freed.
+		 */
+		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+			return BLK_STS_RESOURCE;
+
+		range = page_address(ns->ctrl->discard_page);
+	}
+
+	__rq_for_each_bio(bio, req) {
+		u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
+		u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
+
+		if (n < segments) {
+			range[n].cattr = cpu_to_le32(0);
+			range[n].nlb = cpu_to_le32(nlb);
+			range[n].slba = cpu_to_le64(slba);
+		}
+		n++;
+	}
+
+	if (WARN_ON_ONCE(n != segments)) {
+		if (virt_to_page(range) == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(range);
+		return BLK_STS_IOERR;
+	}
+
+	cmnd->dsm.opcode = nvme_cmd_dsm;
+	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
+	cmnd->dsm.nr = cpu_to_le32(segments - 1);
+	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = alloc_size;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return BLK_STS_OK;
+}
+
+static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd)
+{
+	if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+		return nvme_setup_discard(ns, req, cmnd);
+
+	cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
+	cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
+	cmnd->write_zeroes.slba =
+		cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+	cmnd->write_zeroes.length =
+		cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+	if (nvme_ns_has_pi(ns))
+		cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
+	else
+		cmnd->write_zeroes.control = 0;
+	return BLK_STS_OK;
+}
+
+static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd,
+		enum nvme_opcode op)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	u16 control = 0;
+	u32 dsmgmt = 0;
+
+	if (req->cmd_flags & REQ_FUA)
+		control |= NVME_RW_FUA;
+	if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+		control |= NVME_RW_LR;
+
+	if (req->cmd_flags & REQ_RAHEAD)
+		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+	cmnd->rw.opcode = op;
+	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
+	cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
+	cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+	if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
+		nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
+
+	if (ns->ms) {
+		/*
+		 * If formated with metadata, the block layer always provides a
+		 * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled.  Else
+		 * we enable the PRACT bit for protection information or set the
+		 * namespace capacity to zero to prevent any I/O.
+		 */
+		if (!blk_integrity_rq(req)) {
+			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+				return BLK_STS_NOTSUPP;
+			control |= NVME_RW_PRINFO_PRACT;
+		}
+
+		switch (ns->pi_type) {
+		case NVME_NS_DPS_PI_TYPE3:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD;
+			break;
+		case NVME_NS_DPS_PI_TYPE1:
+		case NVME_NS_DPS_PI_TYPE2:
+			control |= NVME_RW_PRINFO_PRCHK_GUARD |
+					NVME_RW_PRINFO_PRCHK_REF;
+			if (op == nvme_cmd_zone_append)
+				control |= NVME_RW_APPEND_PIREMAP;
+			cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
+			break;
+		}
+	}
+
+	cmnd->rw.control = cpu_to_le16(control);
+	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+	return 0;
+}
+
+void nvme_cleanup_cmd(struct request *req)
+{
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		struct nvme_ns *ns = req->rq_disk->private_data;
+		struct page *page = req->special_vec.bv_page;
+
+		if (page == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(page_address(page) + req->special_vec.bv_offset);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
+
+blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
+		struct nvme_command *cmd)
+{
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
+	blk_status_t ret = BLK_STS_OK;
+
+	if (!(req->rq_flags & RQF_DONTPREP))
+		nvme_clear_nvme_request(req);
+
+	memset(cmd, 0, sizeof(*cmd));
+	switch (req_op(req)) {
+	case REQ_OP_DRV_IN:
+	case REQ_OP_DRV_OUT:
+		nvme_setup_passthrough(req, cmd);
+		break;
+	case REQ_OP_FLUSH:
+		nvme_setup_flush(ns, cmd);
+		break;
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_ZONE_RESET:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+		break;
+	case REQ_OP_ZONE_OPEN:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		ret = nvme_setup_write_zeroes(ns, req, cmd);
+		break;
+	case REQ_OP_DISCARD:
+		ret = nvme_setup_discard(ns, req, cmd);
+		break;
+	case REQ_OP_READ:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+		break;
+	case REQ_OP_WRITE:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+		break;
+	case REQ_OP_ZONE_APPEND:
+		ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_STS_IOERR;
+	}
+
+	if (!(ctrl->quirks & NVME_QUIRK_SKIP_CID_GEN))
+		nvme_req(req)->genctr++;
+	cmd->common.command_id = nvme_cid(req);
+	trace_nvme_setup_cmd(req, cmd);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_setup_cmd);
+
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
+{
+	struct completion *waiting = rq->end_io_data;
+
+	rq->end_io_data = NULL;
+	complete(waiting);
+}
+
+static void nvme_execute_rq_polled(struct request_queue *q,
+		struct gendisk *bd_disk, struct request *rq, int at_head)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
+
+	rq->cmd_flags |= REQ_HIPRI;
+	rq->end_io_data = &wait;
+	blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+
+	while (!completion_done(&wait)) {
+		blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
+		cond_resched();
+	}
+}
+
+/*
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		union nvme_result *result, void *buffer, unsigned bufflen,
+		unsigned timeout, int qid, int at_head,
+		blk_mq_req_flags_t flags, bool poll)
+{
+	struct request *req;
+	int ret;
+
+	if (qid == NVME_QID_ANY)
+		req = nvme_alloc_request(q, cmd, flags);
+	else
+		req = nvme_alloc_request_qid(q, cmd, flags, qid);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	if (timeout)
+		req->timeout = timeout;
+
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+		if (ret)
+			goto out;
+	}
+
+	if (poll)
+		nvme_execute_rq_polled(req->q, NULL, req, at_head);
+	else
+		blk_execute_rq(req->q, NULL, req, at_head);
+	if (result)
+		*result = nvme_req(req)->result;
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		ret = -EINTR;
+	else
+		ret = nvme_req(req)->status;
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
+{
+	return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
+			NVME_QID_ANY, 0, 0, false);
+}
+EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
+
+static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
+		unsigned len, u32 seed, bool write)
+{
+	struct bio_integrity_payload *bip;
+	int ret = -ENOMEM;
+	void *buf;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = -EFAULT;
+	if (write && copy_from_user(buf, ubuf, len))
+		goto out_free_meta;
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+	if (IS_ERR(bip)) {
+		ret = PTR_ERR(bip);
+		goto out_free_meta;
+	}
+
+	bip->bip_iter.bi_size = len;
+	bip->bip_iter.bi_sector = seed;
+	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+			offset_in_page(buf));
+	if (ret == len)
+		return buf;
+	ret = -ENOMEM;
+out_free_meta:
+	kfree(buf);
+out:
+	return ERR_PTR(ret);
+}
+
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+	switch (opcode) {
+	case nvme_admin_format_nvm:
+		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC |
+			NVME_CMD_EFFECTS_CSE_MASK;
+	case nvme_admin_sanitize_nvm:
+		return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK;
+	default:
+		break;
+	}
+	return 0;
+}
+
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode)
+{
+	u32 effects = 0;
+
+	if (ns) {
+		if (ns->head->effects)
+			effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
+		if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
+			dev_warn(ctrl->device,
+				 "IO command:%02x has unhandled effects:%08x\n",
+				 opcode, effects);
+		return 0;
+	}
+
+	if (ctrl->effects)
+		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
+	effects |= nvme_known_admin_effects(opcode);
+
+	return effects;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU);
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			       u8 opcode)
+{
+	u32 effects = nvme_command_effects(ctrl, ns, opcode);
+
+	/*
+	 * For simplicity, IO to all namespaces is quiesced even if the command
+	 * effects say only one namespace is affected.
+	 */
+	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
+		mutex_lock(&ctrl->scan_lock);
+		mutex_lock(&ctrl->subsys->lock);
+		nvme_mpath_start_freeze(ctrl->subsys);
+		nvme_mpath_wait_freeze(ctrl->subsys);
+		nvme_start_freeze(ctrl);
+		nvme_wait_freeze(ctrl);
+	}
+	return effects;
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
+		nvme_unfreeze(ctrl);
+		nvme_mpath_unfreeze(ctrl->subsys);
+		mutex_unlock(&ctrl->subsys->lock);
+		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
+		mutex_unlock(&ctrl->scan_lock);
+	}
+	if (effects & NVME_CMD_EFFECTS_CCC)
+		nvme_init_identify(ctrl);
+	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
+		nvme_queue_scan(ctrl);
+		flush_work(&ctrl->scan_work);
+	}
+}
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+	struct nvme_command *cmd = nvme_req(rq)->cmd;
+	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
+	struct nvme_ns *ns = rq->q->queuedata;
+	struct gendisk *disk = ns ? ns->disk : NULL;
+	u32 effects;
+
+	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+	blk_execute_rq(rq->q, disk, rq, 0);
+	nvme_passthru_end(ctrl, effects);
+}
+EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
+
+static int nvme_submit_user_cmd(struct request_queue *q,
+		struct nvme_command *cmd, void __user *ubuffer,
+		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
+		u32 meta_seed, u64 *result, unsigned timeout)
+{
+	bool write = nvme_is_write(cmd);
+	struct nvme_ns *ns = q->queuedata;
+	struct gendisk *disk = ns ? ns->disk : NULL;
+	struct request *req;
+	struct bio *bio = NULL;
+	void *meta = NULL;
+	int ret;
+
+	req = nvme_alloc_request(q, cmd, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	if (timeout)
+		req->timeout = timeout;
+	nvme_req(req)->flags |= NVME_REQ_USERCMD;
+
+	if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
+				GFP_KERNEL);
+		if (ret)
+			goto out;
+		bio = req->bio;
+		bio->bi_disk = disk;
+		if (disk && meta_buffer && meta_len) {
+			meta = nvme_add_user_metadata(bio, meta_buffer, meta_len,
+					meta_seed, write);
+			if (IS_ERR(meta)) {
+				ret = PTR_ERR(meta);
+				goto out_unmap;
+			}
+			req->cmd_flags |= REQ_INTEGRITY;
+		}
+	}
+
+	nvme_execute_passthru_rq(req);
+	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+		ret = -EINTR;
+	else
+		ret = nvme_req(req)->status;
+	if (result)
+		*result = le64_to_cpu(nvme_req(req)->result.u64);
+	if (meta && !ret && !write) {
+		if (copy_to_user(meta_buffer, meta, meta_len))
+			ret = -EFAULT;
+	}
+	kfree(meta);
+ out_unmap:
+	if (bio)
+		blk_rq_unmap_user(bio);
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+
+static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
+{
+	struct nvme_ctrl *ctrl = rq->end_io_data;
+	unsigned long flags;
+	bool startka = false;
+
+	blk_mq_free_request(rq);
+
+	if (status) {
+		dev_err(ctrl->device,
+			"failed nvme_keep_alive_end_io error=%d\n",
+				status);
+		return;
+	}
+
+	ctrl->comp_seen = false;
+	spin_lock_irqsave(&ctrl->lock, flags);
+	if (ctrl->state == NVME_CTRL_LIVE ||
+	    ctrl->state == NVME_CTRL_CONNECTING)
+		startka = true;
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+	if (startka)
+		queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
+}
+
+static int nvme_keep_alive(struct nvme_ctrl *ctrl)
+{
+	struct request *rq;
+
+	rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd,
+			BLK_MQ_REQ_RESERVED);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+
+	rq->timeout = ctrl->kato * HZ;
+	rq->end_io_data = ctrl;
+
+	blk_execute_rq_nowait(rq->q, NULL, rq, 0, nvme_keep_alive_end_io);
+
+	return 0;
+}
+
+static void nvme_keep_alive_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+			struct nvme_ctrl, ka_work);
+	bool comp_seen = ctrl->comp_seen;
+
+	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+		dev_dbg(ctrl->device,
+			"reschedule traffic based keep-alive timer\n");
+		ctrl->comp_seen = false;
+		queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
+		return;
+	}
+
+	if (nvme_keep_alive(ctrl)) {
+		/* allocation failure, reset the controller */
+		dev_err(ctrl->device, "keep-alive failed\n");
+		nvme_reset_ctrl(ctrl);
+		return;
+	}
+}
+
+static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
+{
+	if (unlikely(ctrl->kato == 0))
+		return;
+
+	queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
+}
+
+void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
+{
+	if (unlikely(ctrl->kato == 0))
+		return;
+
+	cancel_delayed_work_sync(&ctrl->ka_work);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
+
+/*
+ * In NVMe 1.0 the CNS field was just a binary controller or namespace
+ * flag, thus sending any new CNS opcodes has a big chance of not working.
+ * Qemu unfortunately had that bug after reporting a 1.1 version compliance
+ * (but not for any later version).
+ */
+static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+{
+	if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
+		return ctrl->vs < NVME_VS(1, 2, 0);
+	return ctrl->vs < NVME_VS(1, 1, 0);
+}
+
+static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = NVME_ID_CNS_CTRL;
+
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+	return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
+static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
+		struct nvme_ns_id_desc *cur, bool *csi_seen)
+{
+	const char *warn_str = "ctrl returned bogus length:";
+	void *data = cur;
+
+	switch (cur->nidt) {
+	case NVME_NIDT_EUI64:
+		if (cur->nidl != NVME_NIDT_EUI64_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_EUI64_LEN;
+		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
+		return NVME_NIDT_EUI64_LEN;
+	case NVME_NIDT_NGUID:
+		if (cur->nidl != NVME_NIDT_NGUID_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_NGUID_LEN;
+		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
+		return NVME_NIDT_NGUID_LEN;
+	case NVME_NIDT_UUID:
+		if (cur->nidl != NVME_NIDT_UUID_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		if (ctrl->quirks & NVME_QUIRK_BOGUS_NID)
+			return NVME_NIDT_UUID_LEN;
+		uuid_copy(&ids->uuid, data + sizeof(*cur));
+		return NVME_NIDT_UUID_LEN;
+	case NVME_NIDT_CSI:
+		if (cur->nidl != NVME_NIDT_CSI_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+		*csi_seen = true;
+		return NVME_NIDT_CSI_LEN;
+	default:
+		/* Skip unknown types */
+		return cur->nidl;
+	}
+}
+
+static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
+		struct nvme_ns_ids *ids)
+{
+	struct nvme_command c = { };
+	bool csi_seen = false;
+	int status, pos, len;
+	void *data;
+
+	if (ctrl->vs < NVME_VS(1, 3, 0) && !nvme_multi_css(ctrl))
+		return 0;
+	if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
+		return 0;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
+
+	data = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
+				      NVME_IDENTIFY_DATA_SIZE);
+	if (status) {
+		dev_warn(ctrl->device,
+			"Identify Descriptors failed (%d)\n", status);
+		goto free_data;
+	}
+
+	for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
+		struct nvme_ns_id_desc *cur = data + pos;
+
+		if (cur->nidl == 0)
+			break;
+
+		len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
+		if (len < 0)
+			break;
+
+		len += sizeof(*cur);
+	}
+
+	if (nvme_multi_css(ctrl) && !csi_seen) {
+		dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+			 nsid);
+		status = -EINVAL;
+	}
+
+free_data:
+	kfree(data);
+	return status;
+}
+
+static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+			struct nvme_ns_ids *ids, struct nvme_id_ns **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.cns = NVME_ID_CNS_NS;
+
+	*id = kmalloc(sizeof(**id), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
+	if (error) {
+		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
+		goto out_free_id;
+	}
+
+	error = NVME_SC_INVALID_NS | NVME_SC_DNR;
+	if ((*id)->ncap == 0) /* namespace not allocated or attached */
+		goto out_free_id;
+
+
+	if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
+		dev_info(ctrl->device,
+			 "Ignoring bogus Namespace Identifiers\n");
+	} else {
+		if (ctrl->vs >= NVME_VS(1, 1, 0) &&
+		    !memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+			memcpy(ids->eui64, (*id)->eui64, sizeof(ids->eui64));
+		if (ctrl->vs >= NVME_VS(1, 2, 0) &&
+		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			memcpy(ids->nguid, (*id)->nguid, sizeof(ids->nguid));
+	}
+
+	return 0;
+
+out_free_id:
+	kfree(*id);
+	return error;
+}
+
+static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
+		unsigned int dword11, void *buffer, size_t buflen, u32 *result)
+{
+	union nvme_result res = { 0 };
+	struct nvme_command c;
+	int ret;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode = op;
+	c.features.fid = cpu_to_le32(fid);
+	c.features.dword11 = cpu_to_le32(dword11);
+
+	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
+			buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
+	if (ret >= 0 && result)
+		*result = le32_to_cpu(res.u32);
+	return ret;
+}
+
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result)
+{
+	return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
+			     buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_set_features);
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result)
+{
+	return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
+			     buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_get_features);
+
+int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
+{
+	u32 q_count = (*count - 1) | ((*count - 1) << 16);
+	u32 result;
+	int status, nr_io_queues;
+
+	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
+			&result);
+	if (status < 0)
+		return status;
+
+	/*
+	 * Degraded controllers might return an error when setting the queue
+	 * count.  We still want to be able to bring them online and offer
+	 * access to the admin queue, as that might be only way to fix them up.
+	 */
+	if (status > 0) {
+		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
+		*count = 0;
+	} else {
+		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
+		*count = min(*count, nr_io_queues);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_set_queue_count);
+
+#define NVME_AEN_SUPPORTED \
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
+	 NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
+
+static void nvme_enable_aen(struct nvme_ctrl *ctrl)
+{
+	u32 result, supported_aens = ctrl->oaes & NVME_AEN_SUPPORTED;
+	int status;
+
+	if (!supported_aens)
+		return;
+
+	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported_aens,
+			NULL, 0, &result);
+	if (status)
+		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
+			 supported_aens);
+
+	queue_work(nvme_wq, &ctrl->async_event_work);
+}
+
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+	if (in_compat_syscall())
+		ptrval = (compat_uptr_t)ptrval;
+	return (void __user *)ptrval;
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+	struct nvme_user_io io;
+	struct nvme_command c;
+	unsigned length, meta_len;
+	void __user *metadata;
+
+	if (copy_from_user(&io, uio, sizeof(io)))
+		return -EFAULT;
+	if (io.flags)
+		return -EINVAL;
+
+	switch (io.opcode) {
+	case nvme_cmd_write:
+	case nvme_cmd_read:
+	case nvme_cmd_compare:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	length = (io.nblocks + 1) << ns->lba_shift;
+
+	if ((io.control & NVME_RW_PRINFO_PRACT) &&
+	    ns->ms == sizeof(struct t10_pi_tuple)) {
+		/*
+		 * Protection information is stripped/inserted by the
+		 * controller.
+		 */
+		if (nvme_to_user_ptr(io.metadata))
+			return -EINVAL;
+		meta_len = 0;
+		metadata = NULL;
+	} else {
+		meta_len = (io.nblocks + 1) * ns->ms;
+		metadata = nvme_to_user_ptr(io.metadata);
+	}
+
+	if (ns->features & NVME_NS_EXT_LBAS) {
+		length += meta_len;
+		meta_len = 0;
+	} else if (meta_len) {
+		if ((io.metadata & 3) || !io.metadata)
+			return -EINVAL;
+	}
+
+	memset(&c, 0, sizeof(c));
+	c.rw.opcode = io.opcode;
+	c.rw.flags = io.flags;
+	c.rw.nsid = cpu_to_le32(ns->head->ns_id);
+	c.rw.slba = cpu_to_le64(io.slba);
+	c.rw.length = cpu_to_le16(io.nblocks);
+	c.rw.control = cpu_to_le16(io.control);
+	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+	c.rw.reftag = cpu_to_le32(io.reftag);
+	c.rw.apptag = cpu_to_le16(io.apptag);
+	c.rw.appmask = cpu_to_le16(io.appmask);
+
+	return nvme_submit_user_cmd(ns->queue, &c,
+			nvme_to_user_ptr(io.addr), length,
+			metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+}
+
+static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			struct nvme_passthru_cmd __user *ucmd)
+{
+	struct nvme_passthru_cmd cmd;
+	struct nvme_command c;
+	unsigned timeout = 0;
+	u64 result;
+	int status;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+		return -EFAULT;
+	if (cmd.flags)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd.opcode;
+	c.common.flags = cmd.flags;
+	c.common.nsid = cpu_to_le32(cmd.nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+			nvme_to_user_ptr(cmd.addr), cmd.data_len,
+			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+			0, &result, timeout);
+
+	if (status >= 0) {
+		if (put_user(result, &ucmd->result))
+			return -EFAULT;
+	}
+
+	return status;
+}
+
+static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			struct nvme_passthru_cmd64 __user *ucmd)
+{
+	struct nvme_passthru_cmd64 cmd;
+	struct nvme_command c;
+	unsigned timeout = 0;
+	int status;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+		return -EFAULT;
+	if (cmd.flags)
+		return -EINVAL;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd.opcode;
+	c.common.flags = cmd.flags;
+	c.common.nsid = cpu_to_le32(cmd.nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+			nvme_to_user_ptr(cmd.addr), cmd.data_len,
+			nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+			0, &cmd.result, timeout);
+
+	if (status >= 0) {
+		if (put_user(cmd.result, &ucmd->result))
+			return -EFAULT;
+	}
+
+	return status;
+}
+
+/*
+ * Issue ioctl requests on the first available path.  Note that unlike normal
+ * block layer requests we will not retry failed request on another controller.
+ */
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+		struct nvme_ns_head **head, int *srcu_idx)
+{
+#ifdef CONFIG_NVME_MULTIPATH
+	if (disk->fops == &nvme_ns_head_ops) {
+		struct nvme_ns *ns;
+
+		*head = disk->private_data;
+		*srcu_idx = srcu_read_lock(&(*head)->srcu);
+		ns = nvme_find_path(*head);
+		if (!ns)
+			srcu_read_unlock(&(*head)->srcu, *srcu_idx);
+		return ns;
+	}
+#endif
+	*head = NULL;
+	*srcu_idx = -1;
+	return disk->private_data;
+}
+
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+{
+	if (head)
+		srcu_read_unlock(&head->srcu, idx);
+}
+
+static bool is_ctrl_ioctl(unsigned int cmd)
+{
+	if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
+		return true;
+	if (is_sed_ioctl(cmd))
+		return true;
+	return false;
+}
+
+static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
+				  void __user *argp,
+				  struct nvme_ns_head *head,
+				  int srcu_idx)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	int ret;
+
+	nvme_get_ctrl(ns->ctrl);
+	nvme_put_ns_from_disk(head, srcu_idx);
+
+	switch (cmd) {
+	case NVME_IOCTL_ADMIN_CMD:
+		ret = nvme_user_cmd(ctrl, NULL, argp);
+		break;
+	case NVME_IOCTL_ADMIN64_CMD:
+		ret = nvme_user_cmd64(ctrl, NULL, argp);
+		break;
+	default:
+		ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+		break;
+	}
+	nvme_put_ctrl(ctrl);
+	return ret;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+		unsigned int cmd, unsigned long arg)
+{
+	struct nvme_ns_head *head = NULL;
+	void __user *argp = (void __user *)arg;
+	struct nvme_ns *ns;
+	int srcu_idx, ret;
+
+	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+	if (unlikely(!ns))
+		return -EWOULDBLOCK;
+
+	/*
+	 * Handle ioctls that apply to the controller instead of the namespace
+	 * seperately and drop the ns SRCU reference early.  This avoids a
+	 * deadlock when deleting namespaces using the passthrough interface.
+	 */
+	if (is_ctrl_ioctl(cmd))
+		return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+
+	switch (cmd) {
+	case NVME_IOCTL_ID:
+		force_successful_syscall_return();
+		ret = ns->head->ns_id;
+		break;
+	case NVME_IOCTL_IO_CMD:
+		ret = nvme_user_cmd(ns->ctrl, ns, argp);
+		break;
+	case NVME_IOCTL_SUBMIT_IO:
+		ret = nvme_submit_io(ns, argp);
+		break;
+	case NVME_IOCTL_IO64_CMD:
+		ret = nvme_user_cmd64(ns->ctrl, ns, argp);
+		break;
+	default:
+		if (ns->ndev)
+			ret = nvme_nvm_ioctl(ns, cmd, arg);
+		else
+			ret = -ENOTTY;
+	}
+
+	nvme_put_ns_from_disk(head, srcu_idx);
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+struct nvme_user_io32 {
+	__u8	opcode;
+	__u8	flags;
+	__u16	control;
+	__u16	nblocks;
+	__u16	rsvd;
+	__u64	metadata;
+	__u64	addr;
+	__u64	slba;
+	__u32	dsmgmt;
+	__u32	reftag;
+	__u16	apptag;
+	__u16	appmask;
+} __attribute__((__packed__));
+
+#define NVME_IOCTL_SUBMIT_IO32	_IOW('N', 0x42, struct nvme_user_io32)
+
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+		unsigned int cmd, unsigned long arg)
+{
+	/*
+	 * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
+	 * between 32 bit programs and 64 bit kernel.
+	 * The cause is that the results of sizeof(struct nvme_user_io),
+	 * which is used to define NVME_IOCTL_SUBMIT_IO,
+	 * are not same between 32 bit compiler and 64 bit compiler.
+	 * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
+	 * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
+	 * Other IOCTL numbers are same between 32 bit and 64 bit.
+	 * So there is nothing to do regarding to other IOCTL numbers.
+	 */
+	if (cmd == NVME_IOCTL_SUBMIT_IO32)
+		return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
+
+	return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl	NULL
+#endif /* CONFIG_COMPAT */
+
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+#ifdef CONFIG_NVME_MULTIPATH
+	/* should never be called due to GENHD_FL_HIDDEN */
+	if (WARN_ON_ONCE(ns->head->disk))
+		goto fail;
+#endif
+	if (!kref_get_unless_zero(&ns->kref))
+		goto fail;
+	if (!try_module_get(ns->ctrl->ops->module))
+		goto fail_put_ns;
+
+	return 0;
+
+fail_put_ns:
+	nvme_put_ns(ns);
+fail:
+	return -ENXIO;
+}
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+	struct nvme_ns *ns = disk->private_data;
+
+	module_put(ns->ctrl->ops->module);
+	nvme_put_ns(ns);
+}
+
+static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	/* some standard values */
+	geo->heads = 1 << 6;
+	geo->sectors = 1 << 5;
+	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
+	return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+				u32 max_integrity_segments)
+{
+	struct blk_integrity integrity;
+
+	memset(&integrity, 0, sizeof(integrity));
+	switch (pi_type) {
+	case NVME_NS_DPS_PI_TYPE3:
+		integrity.profile = &t10_pi_type3_crc;
+		integrity.tag_size = sizeof(u16) + sizeof(u32);
+		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+		break;
+	case NVME_NS_DPS_PI_TYPE1:
+	case NVME_NS_DPS_PI_TYPE2:
+		integrity.profile = &t10_pi_type1_crc;
+		integrity.tag_size = sizeof(u16);
+		integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
+		break;
+	default:
+		integrity.profile = NULL;
+		break;
+	}
+	integrity.tuple_size = ms;
+	blk_integrity_register(disk, &integrity);
+	blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
+}
+#else
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type,
+				u32 max_integrity_segments)
+{
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct request_queue *queue = disk->queue;
+	u32 size = queue_logical_block_size(queue);
+
+	if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue);
+		return;
+	}
+
+	if (ctrl->nr_streams && ns->sws && ns->sgs)
+		size *= ns->sws * ns->sgs;
+
+	BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
+			NVME_DSM_MAX_RANGES);
+
+	queue->limits.discard_alignment = 0;
+	queue->limits.discard_granularity = size;
+
+	/* If discard is already enabled, don't reset queue limits */
+	if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue))
+		return;
+
+	blk_queue_max_discard_sectors(queue, UINT_MAX);
+	blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+
+	if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+		blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
+}
+
+/*
+ * Even though NVMe spec explicitly states that MDTS is not applicable to the
+ * write-zeroes, we are cautious and limit the size to the controllers
+ * max_hw_sectors value, which is based on the MDTS field and possibly other
+ * limiting factors.
+ */
+static void nvme_config_write_zeroes(struct request_queue *q,
+		struct nvme_ctrl *ctrl)
+{
+	if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
+	    !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
+		blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
+}
+
+static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
+{
+	return !uuid_is_null(&ids->uuid) ||
+		memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
+		memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
+}
+
+static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
+{
+	return uuid_equal(&a->uuid, &b->uuid) &&
+		memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
+		memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+		a->csi == b->csi;
+}
+
+static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+				 u32 *phys_bs, u32 *io_opt)
+{
+	struct streams_directive_params s;
+	int ret;
+
+	if (!ctrl->nr_streams)
+		return 0;
+
+	ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id);
+	if (ret)
+		return ret;
+
+	ns->sws = le32_to_cpu(s.sws);
+	ns->sgs = le16_to_cpu(s.sgs);
+
+	if (ns->sws) {
+		*phys_bs = ns->sws * (1 << ns->lba_shift);
+		if (ns->sgs)
+			*io_opt = *phys_bs * ns->sgs;
+	}
+
+	return 0;
+}
+
+static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+
+	/*
+	 * The PI implementation requires the metadata size to be equal to the
+	 * t10 pi tuple size.
+	 */
+	ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+	if (ns->ms == sizeof(struct t10_pi_tuple))
+		ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+	else
+		ns->pi_type = 0;
+
+	ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+		return 0;
+	if (ctrl->ops->flags & NVME_F_FABRICS) {
+		/*
+		 * The NVMe over Fabrics specification only supports metadata as
+		 * part of the extended data LBA.  We rely on HCA/HBA support to
+		 * remap the separate metadata buffer from the block layer.
+		 */
+		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
+			return -EINVAL;
+		if (ctrl->max_integrity_segments)
+			ns->features |=
+				(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	} else {
+		/*
+		 * For PCIe controllers, we can't easily remap the separate
+		 * metadata buffer from the block layer and thus require a
+		 * separate metadata buffer for block layer metadata/PI support.
+		 * We allow extended LBAs for the passthrough interface, though.
+		 */
+		if (id->flbas & NVME_NS_FLBAS_META_EXT)
+			ns->features |= NVME_NS_EXT_LBAS;
+		else
+			ns->features |= NVME_NS_METADATA_SUPPORTED;
+	}
+
+	return 0;
+}
+
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+		struct request_queue *q)
+{
+	bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+
+	if (ctrl->max_hw_sectors) {
+		u32 max_segments =
+			(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+
+		max_segments = min_not_zero(max_segments, ctrl->max_segments);
+		blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+		blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+	}
+	blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
+	blk_queue_dma_alignment(q, 3);
+	blk_queue_write_cache(q, vwc, vwc);
+}
+
+static void nvme_update_disk_info(struct gendisk *disk,
+		struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
+	unsigned short bs = 1 << ns->lba_shift;
+	u32 atomic_bs, phys_bs, io_opt = 0;
+
+	/*
+	 * The block layer can't support LBA sizes larger than the page size
+	 * yet, so catch this early and don't allow block I/O.
+	 */
+	if (ns->lba_shift > PAGE_SHIFT) {
+		capacity = 0;
+		bs = (1 << 9);
+	}
+
+	blk_integrity_unregister(disk);
+
+	atomic_bs = phys_bs = bs;
+	nvme_setup_streams_ns(ns->ctrl, ns, &phys_bs, &io_opt);
+	if (id->nabo == 0) {
+		/*
+		 * Bit 1 indicates whether NAWUPF is defined for this namespace
+		 * and whether it should be used instead of AWUPF. If NAWUPF ==
+		 * 0 then AWUPF must be used instead.
+		 */
+		if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
+			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
+		else
+			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+	}
+
+	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
+		/* NPWG = Namespace Preferred Write Granularity */
+		phys_bs = bs * (1 + le16_to_cpu(id->npwg));
+		/* NOWS = Namespace Optimal Write Size */
+		io_opt = bs * (1 + le16_to_cpu(id->nows));
+	}
+
+	blk_queue_logical_block_size(disk->queue, bs);
+	/*
+	 * Linux filesystems assume writing a single physical block is
+	 * an atomic operation. Hence limit the physical block size to the
+	 * value of the Atomic Write Unit Power Fail parameter.
+	 */
+	blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
+	blk_queue_io_min(disk->queue, phys_bs);
+	blk_queue_io_opt(disk->queue, io_opt);
+
+	/*
+	 * Register a metadata profile for PI, or the plain non-integrity NVMe
+	 * metadata masquerading as Type 0 if supported, otherwise reject block
+	 * I/O to namespaces with metadata except when the namespace supports
+	 * PI, as it can strip/insert in that case.
+	 */
+	if (ns->ms) {
+		if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
+		    (ns->features & NVME_NS_METADATA_SUPPORTED))
+			nvme_init_integrity(disk, ns->ms, ns->pi_type,
+					    ns->ctrl->max_integrity_segments);
+		else if (!nvme_ns_has_pi(ns))
+			capacity = 0;
+	}
+
+	set_capacity_revalidate_and_notify(disk, capacity, false);
+
+	nvme_config_discard(disk, ns);
+	nvme_config_write_zeroes(disk->queue, ns->ctrl);
+
+	if (id->nsattr & NVME_NS_ATTR_RO)
+		set_disk_ro(disk, true);
+}
+
+static inline bool nvme_first_scan(struct gendisk *disk)
+{
+	/* nvme_alloc_ns() scans the disk prior to adding it */
+	return !(disk->flags & GENHD_FL_UP);
+}
+
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	u32 iob;
+
+	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+	    is_power_of_2(ctrl->max_hw_sectors))
+		iob = ctrl->max_hw_sectors;
+	else
+		iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
+	if (!iob)
+		return;
+
+	if (!is_power_of_2(iob)) {
+		if (nvme_first_scan(ns->disk))
+			pr_warn("%s: ignoring unaligned IO boundary:%u\n",
+				ns->disk->disk_name, iob);
+		return;
+	}
+
+	if (blk_queue_is_zoned(ns->disk->queue)) {
+		if (nvme_first_scan(ns->disk))
+			pr_warn("%s: ignoring zoned namespace IO boundary\n",
+				ns->disk->disk_name);
+		return;
+	}
+
+	blk_queue_chunk_sectors(ns->queue, iob);
+}
+
+static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+	int ret;
+
+	blk_mq_freeze_queue(ns->disk->queue);
+	ns->lba_shift = id->lbaf[lbaf].ds;
+	nvme_set_queue_limits(ns->ctrl, ns->queue);
+
+	if (ns->head->ids.csi == NVME_CSI_ZNS) {
+		ret = nvme_update_zone_info(ns, lbaf);
+		if (ret)
+			goto out_unfreeze;
+	}
+
+	ret = nvme_configure_metadata(ns, id);
+	if (ret)
+		goto out_unfreeze;
+	nvme_set_chunk_sectors(ns, id);
+	nvme_update_disk_info(ns->disk, ns, id);
+	blk_mq_unfreeze_queue(ns->disk->queue);
+
+	if (blk_queue_is_zoned(ns->queue)) {
+		ret = nvme_revalidate_zones(ns);
+		if (ret && !nvme_first_scan(ns->disk))
+			return ret;
+	}
+
+#ifdef CONFIG_NVME_MULTIPATH
+	if (ns->head->disk) {
+		blk_mq_freeze_queue(ns->head->disk->queue);
+		nvme_update_disk_info(ns->head->disk, ns, id);
+		blk_stack_limits(&ns->head->disk->queue->limits,
+				 &ns->queue->limits, 0);
+		blk_queue_update_readahead(ns->head->disk->queue);
+		nvme_update_bdev_size(ns->head->disk);
+		blk_mq_unfreeze_queue(ns->head->disk->queue);
+	}
+#endif
+	return 0;
+
+out_unfreeze:
+	blk_mq_unfreeze_queue(ns->disk->queue);
+	return ret;
+}
+
+static char nvme_pr_type(enum pr_type type)
+{
+	switch (type) {
+	case PR_WRITE_EXCLUSIVE:
+		return 1;
+	case PR_EXCLUSIVE_ACCESS:
+		return 2;
+	case PR_WRITE_EXCLUSIVE_REG_ONLY:
+		return 3;
+	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+		return 4;
+	case PR_WRITE_EXCLUSIVE_ALL_REGS:
+		return 5;
+	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+		return 6;
+	default:
+		return 0;
+	}
+};
+
+static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+				u64 key, u64 sa_key, u8 op)
+{
+	struct nvme_ns_head *head = NULL;
+	struct nvme_ns *ns;
+	struct nvme_command c;
+	int srcu_idx, ret;
+	u8 data[16] = { 0, };
+
+	ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+	if (unlikely(!ns))
+		return -EWOULDBLOCK;
+
+	put_unaligned_le64(key, &data[0]);
+	put_unaligned_le64(sa_key, &data[8]);
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = op;
+	c.common.nsid = cpu_to_le32(ns->head->ns_id);
+	c.common.cdw10 = cpu_to_le32(cdw10);
+
+	ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+	nvme_put_ns_from_disk(head, srcu_idx);
+	return ret;
+}
+
+static int nvme_pr_register(struct block_device *bdev, u64 old,
+		u64 new, unsigned flags)
+{
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	cdw10 = old ? 2 : 0;
+	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+		enum pr_type type, unsigned flags)
+{
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	cdw10 = nvme_pr_type(type) << 8;
+	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+		enum pr_type type, bool abort)
+{
+	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_clear(struct block_device *bdev, u64 key)
+{
+	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+}
+
+static const struct pr_ops nvme_pr_ops = {
+	.pr_register	= nvme_pr_register,
+	.pr_reserve	= nvme_pr_reserve,
+	.pr_release	= nvme_pr_release,
+	.pr_preempt	= nvme_pr_preempt,
+	.pr_clear	= nvme_pr_clear,
+};
+
+#ifdef CONFIG_BLK_SED_OPAL
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+		bool send)
+{
+	struct nvme_ctrl *ctrl = data;
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	if (send)
+		cmd.common.opcode = nvme_admin_security_send;
+	else
+		cmd.common.opcode = nvme_admin_security_recv;
+	cmd.common.nsid = 0;
+	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+	cmd.common.cdw11 = cpu_to_le32(len);
+
+	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
+				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
+}
+EXPORT_SYMBOL_GPL(nvme_sec_submit);
+#endif /* CONFIG_BLK_SED_OPAL */
+
+static const struct block_device_operations nvme_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= nvme_compat_ioctl,
+	.open		= nvme_open,
+	.release	= nvme_release,
+	.getgeo		= nvme_getgeo,
+	.report_zones	= nvme_report_zones,
+	.pr_ops		= &nvme_pr_ops,
+};
+
+#ifdef CONFIG_NVME_MULTIPATH
+static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nvme_ns_head *head = bdev->bd_disk->private_data;
+
+	if (!kref_get_unless_zero(&head->ref))
+		return -ENXIO;
+	return 0;
+}
+
+static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
+{
+	nvme_put_ns_head(disk->private_data);
+}
+
+const struct block_device_operations nvme_ns_head_ops = {
+	.owner		= THIS_MODULE,
+	.submit_bio	= nvme_ns_head_submit_bio,
+	.open		= nvme_ns_head_open,
+	.release	= nvme_ns_head_release,
+	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= nvme_compat_ioctl,
+	.getgeo		= nvme_getgeo,
+	.report_zones	= nvme_report_zones,
+	.pr_ops		= &nvme_pr_ops,
+};
+#endif /* CONFIG_NVME_MULTIPATH */
+
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
+{
+	unsigned long timeout =
+		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
+	int ret;
+
+	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+		if (csts == ~0)
+			return -ENODEV;
+		if ((csts & NVME_CSTS_RDY) == bit)
+			break;
+
+		usleep_range(1000, 2000);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, timeout)) {
+			dev_err(ctrl->device,
+				"Device not ready; aborting %s, CSTS=0x%x\n",
+				enabled ? "initialisation" : "reset", csts);
+			return -ENODEV;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * If the device has been passed off to us in an enabled state, just clear
+ * the enabled bit.  The spec says we should set the 'shutdown notification
+ * bits', but doing so may cause the device to complete commands to the
+ * admin queue ... and we don't know what memory that might be pointing at!
+ */
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
+{
+	int ret;
+
+	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
+	ctrl->ctrl_config &= ~NVME_CC_ENABLE;
+
+	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+	if (ret)
+		return ret;
+
+	if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
+		msleep(NVME_QUIRK_DELAY_AMOUNT);
+
+	return nvme_wait_ready(ctrl, ctrl->cap, false);
+}
+EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
+
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
+{
+	unsigned dev_page_min;
+	int ret;
+
+	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (ret) {
+		dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
+		return ret;
+	}
+	dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
+	if (NVME_CTRL_PAGE_SHIFT < dev_page_min) {
+		dev_err(ctrl->device,
+			"Minimum device page size %u too large for host (%u)\n",
+			1 << dev_page_min, 1 << NVME_CTRL_PAGE_SHIFT);
+		return -ENODEV;
+	}
+
+	if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+		ctrl->ctrl_config = NVME_CC_CSS_CSI;
+	else
+		ctrl->ctrl_config = NVME_CC_CSS_NVM;
+	ctrl->ctrl_config |= (NVME_CTRL_PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
+	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
+	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+	ctrl->ctrl_config |= NVME_CC_ENABLE;
+
+	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+	if (ret)
+		return ret;
+	return nvme_wait_ready(ctrl, ctrl->cap, true);
+}
+EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
+
+int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
+{
+	unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
+	u32 csts;
+	int ret;
+
+	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
+	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
+
+	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+	if (ret)
+		return ret;
+
+	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
+			break;
+
+		msleep(100);
+		if (fatal_signal_pending(current))
+			return -EINTR;
+		if (time_after(jiffies, timeout)) {
+			dev_err(ctrl->device,
+				"Device shutdown incomplete; abort shutdown\n");
+			return -ENODEV;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
+
+static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
+{
+	__le64 ts;
+	int ret;
+
+	if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
+		return 0;
+
+	ts = cpu_to_le64(ktime_to_ms(ktime_get_real()));
+	ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
+			NULL);
+	if (ret)
+		dev_warn_once(ctrl->device,
+			"could not set timestamp (%d)\n", ret);
+	return ret;
+}
+
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+	struct nvme_feat_host_behavior *host;
+	int ret;
+
+	/* Don't bother enabling the feature if retry delay is not reported */
+	if (!ctrl->crdt[0])
+		return 0;
+
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host)
+		return 0;
+
+	host->acre = NVME_ENABLE_ACRE;
+	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+				host, sizeof(*host), NULL);
+	kfree(host);
+	return ret;
+}
+
+static int nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * APST (Autonomous Power State Transition) lets us program a
+	 * table of power state transitions that the controller will
+	 * perform automatically.  We configure it with a simple
+	 * heuristic: we are willing to spend at most 2% of the time
+	 * transitioning between power states.  Therefore, when running
+	 * in any given state, we will enter the next lower-power
+	 * non-operational state after waiting 50 * (enlat + exlat)
+	 * microseconds, as long as that state's exit latency is under
+	 * the requested maximum latency.
+	 *
+	 * We will not autonomously enter any non-operational state for
+	 * which the total latency exceeds ps_max_latency_us.  Users
+	 * can set ps_max_latency_us to zero to turn off APST.
+	 */
+
+	unsigned apste;
+	struct nvme_feat_auto_pst *table;
+	u64 max_lat_us = 0;
+	int max_ps = -1;
+	int ret;
+
+	/*
+	 * If APST isn't supported or if we haven't been initialized yet,
+	 * then don't do anything.
+	 */
+	if (!ctrl->apsta)
+		return 0;
+
+	if (ctrl->npss > 31) {
+		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+		return 0;
+	}
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return 0;
+
+	if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) {
+		/* Turn off APST. */
+		apste = 0;
+		dev_dbg(ctrl->device, "APST disabled\n");
+	} else {
+		__le64 target = cpu_to_le64(0);
+		int state;
+
+		/*
+		 * Walk through all states from lowest- to highest-power.
+		 * According to the spec, lower-numbered states use more
+		 * power.  NPSS, despite the name, is the index of the
+		 * lowest-power state, not the number of states.
+		 */
+		for (state = (int)ctrl->npss; state >= 0; state--) {
+			u64 total_latency_us, exit_latency_us, transition_ms;
+
+			if (target)
+				table->entries[state] = target;
+
+			/*
+			 * Don't allow transitions to the deepest state
+			 * if it's quirked off.
+			 */
+			if (state == ctrl->npss &&
+			    (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS))
+				continue;
+
+			/*
+			 * Is this state a useful non-operational state for
+			 * higher-power states to autonomously transition to?
+			 */
+			if (!(ctrl->psd[state].flags &
+			      NVME_PS_FLAGS_NON_OP_STATE))
+				continue;
+
+			exit_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (exit_latency_us > ctrl->ps_max_latency_us)
+				continue;
+
+			total_latency_us =
+				exit_latency_us +
+				le32_to_cpu(ctrl->psd[state].entry_lat);
+
+			/*
+			 * This state is good.  Use it as the APST idle
+			 * target for higher power states.
+			 */
+			transition_ms = total_latency_us + 19;
+			do_div(transition_ms, 20);
+			if (transition_ms > (1 << 24) - 1)
+				transition_ms = (1 << 24) - 1;
+
+			target = cpu_to_le64((state << 3) |
+					     (transition_ms << 8));
+
+			if (max_ps == -1)
+				max_ps = state;
+
+			if (total_latency_us > max_lat_us)
+				max_lat_us = total_latency_us;
+		}
+
+		apste = 1;
+
+		if (max_ps == -1) {
+			dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n");
+		} else {
+			dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n",
+				max_ps, max_lat_us, (int)sizeof(*table), table);
+		}
+	}
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+				table, sizeof(*table), NULL);
+	if (ret)
+		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+	kfree(table);
+	return ret;
+}
+
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	u64 latency;
+
+	switch (val) {
+	case PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT:
+	case PM_QOS_LATENCY_ANY:
+		latency = U64_MAX;
+		break;
+
+	default:
+		latency = val;
+	}
+
+	if (ctrl->ps_max_latency_us != latency) {
+		ctrl->ps_max_latency_us = latency;
+		if (ctrl->state == NVME_CTRL_LIVE)
+			nvme_configure_apst(ctrl);
+	}
+}
+
+struct nvme_core_quirk_entry {
+	/*
+	 * NVMe model and firmware strings are padded with spaces.  For
+	 * simplicity, strings in the quirk table are padded with NULLs
+	 * instead.
+	 */
+	u16 vid;
+	const char *mn;
+	const char *fr;
+	unsigned long quirks;
+};
+
+static const struct nvme_core_quirk_entry core_quirks[] = {
+	{
+		/*
+		 * This Toshiba device seems to die using any APST states.  See:
+		 * https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1678184/comments/11
+		 */
+		.vid = 0x1179,
+		.mn = "THNSF5256GPUK TOSHIBA",
+		.quirks = NVME_QUIRK_NO_APST,
+	},
+	{
+		/*
+		 * This LiteON CL1-3D*-Q11 firmware version has a race
+		 * condition associated with actions related to suspend to idle
+		 * LiteON has resolved the problem in future firmware
+		 */
+		.vid = 0x14a4,
+		.fr = "22301111",
+		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
+	},
+	{
+		/*
+		 * This Kioxia CD6-V Series / HPE PE8030 device times out and
+		 * aborts I/O during any load, but more easily reproducible
+		 * with discards (fstrim).
+		 *
+		 * The device is left in a state where it is also not possible
+		 * to use "nvme set-feature" to disable APST, but booting with
+		 * nvme_core.default_ps_max_latency=0 works.
+		 */
+		.vid = 0x1e0f,
+		.mn = "KCD6XVUL6T40",
+		.quirks = NVME_QUIRK_NO_APST,
+	},
+	{
+		/*
+		 * The external Samsung X5 SSD fails initialization without a
+		 * delay before checking if it is ready and has a whole set of
+		 * other problems.  To make this even more interesting, it
+		 * shares the PCI ID with internal Samsung 970 Evo Plus that
+		 * does not need or want these quirks.
+		 */
+		.vid = 0x144d,
+		.mn = "Samsung Portable SSD X5",
+		.quirks = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+			  NVME_QUIRK_NO_DEEPEST_PS |
+			  NVME_QUIRK_IGNORE_DEV_SUBNQN,
+	}
+};
+
+/* match is null-terminated but idstr is space-padded. */
+static bool string_matches(const char *idstr, const char *match, size_t len)
+{
+	size_t matchlen;
+
+	if (!match)
+		return true;
+
+	matchlen = strlen(match);
+	WARN_ON_ONCE(matchlen > len);
+
+	if (memcmp(idstr, match, matchlen))
+		return false;
+
+	for (; matchlen < len; matchlen++)
+		if (idstr[matchlen] != ' ')
+			return false;
+
+	return true;
+}
+
+static bool quirk_matches(const struct nvme_id_ctrl *id,
+			  const struct nvme_core_quirk_entry *q)
+{
+	return q->vid == le16_to_cpu(id->vid) &&
+		string_matches(id->mn, q->mn, sizeof(id->mn)) &&
+		string_matches(id->fr, q->fr, sizeof(id->fr));
+}
+
+static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl,
+		struct nvme_id_ctrl *id)
+{
+	size_t nqnlen;
+	int off;
+
+	if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
+		nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
+		if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
+			strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
+			return;
+		}
+
+		if (ctrl->vs >= NVME_VS(1, 2, 1))
+			dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
+	}
+
+	/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
+	off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
+			"nqn.2014.08.org.nvmexpress:%04x%04x",
+			le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
+	memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
+	off += sizeof(id->sn);
+	memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn));
+	off += sizeof(id->mn);
+	memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
+}
+
+static void nvme_release_subsystem(struct device *dev)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	if (subsys->instance >= 0)
+		ida_simple_remove(&nvme_instance_ida, subsys->instance);
+	kfree(subsys);
+}
+
+static void nvme_destroy_subsystem(struct kref *ref)
+{
+	struct nvme_subsystem *subsys =
+			container_of(ref, struct nvme_subsystem, ref);
+
+	mutex_lock(&nvme_subsystems_lock);
+	list_del(&subsys->entry);
+	mutex_unlock(&nvme_subsystems_lock);
+
+	ida_destroy(&subsys->ns_ida);
+	device_del(&subsys->dev);
+	put_device(&subsys->dev);
+}
+
+static void nvme_put_subsystem(struct nvme_subsystem *subsys)
+{
+	kref_put(&subsys->ref, nvme_destroy_subsystem);
+}
+
+static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn)
+{
+	struct nvme_subsystem *subsys;
+
+	lockdep_assert_held(&nvme_subsystems_lock);
+
+	/*
+	 * Fail matches for discovery subsystems. This results
+	 * in each discovery controller bound to a unique subsystem.
+	 * This avoids issues with validating controller values
+	 * that can only be true when there is a single unique subsystem.
+	 * There may be multiple and completely independent entities
+	 * that provide discovery controllers.
+	 */
+	if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
+		return NULL;
+
+	list_for_each_entry(subsys, &nvme_subsystems, entry) {
+		if (strcmp(subsys->subnqn, subsysnqn))
+			continue;
+		if (!kref_get_unless_zero(&subsys->ref))
+			continue;
+		return subsys;
+	}
+
+	return NULL;
+}
+
+#define SUBSYS_ATTR_RO(_name, _mode, _show)			\
+	struct device_attribute subsys_attr_##_name = \
+		__ATTR(_name, _mode, _show, NULL)
+
+static ssize_t nvme_subsys_show_nqn(struct device *dev,
+				    struct device_attribute *attr,
+				    char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn);
+}
+static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn);
+
+#define nvme_subsys_show_str_function(field)				\
+static ssize_t subsys_##field##_show(struct device *dev,		\
+			    struct device_attribute *attr, char *buf)	\
+{									\
+	struct nvme_subsystem *subsys =					\
+		container_of(dev, struct nvme_subsystem, dev);		\
+	return sysfs_emit(buf, "%.*s\n",				\
+			   (int)sizeof(subsys->field), subsys->field);	\
+}									\
+static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show);
+
+nvme_subsys_show_str_function(model);
+nvme_subsys_show_str_function(serial);
+nvme_subsys_show_str_function(firmware_rev);
+
+static struct attribute *nvme_subsys_attrs[] = {
+	&subsys_attr_model.attr,
+	&subsys_attr_serial.attr,
+	&subsys_attr_firmware_rev.attr,
+	&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&subsys_attr_iopolicy.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group nvme_subsys_attrs_group = {
+	.attrs = nvme_subsys_attrs,
+};
+
+static const struct attribute_group *nvme_subsys_attrs_groups[] = {
+	&nvme_subsys_attrs_group,
+	NULL,
+};
+
+static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl)
+{
+	return ctrl->opts && ctrl->opts->discovery_nqn;
+}
+
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
+		struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	struct nvme_ctrl *tmp;
+
+	lockdep_assert_held(&nvme_subsystems_lock);
+
+	list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
+		if (nvme_state_terminal(tmp))
+			continue;
+
+		if (tmp->cntlid == ctrl->cntlid) {
+			dev_err(ctrl->device,
+				"Duplicate cntlid %u with %s, rejecting\n",
+				ctrl->cntlid, dev_name(tmp->device));
+			return false;
+		}
+
+		if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
+		    nvme_discovery_ctrl(ctrl))
+			continue;
+
+		dev_err(ctrl->device,
+			"Subsystem does not support multiple controllers\n");
+		return false;
+	}
+
+	return true;
+}
+
+static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	struct nvme_subsystem *subsys, *found;
+	int ret;
+
+	subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
+	if (!subsys)
+		return -ENOMEM;
+
+	subsys->instance = -1;
+	mutex_init(&subsys->lock);
+	kref_init(&subsys->ref);
+	INIT_LIST_HEAD(&subsys->ctrls);
+	INIT_LIST_HEAD(&subsys->nsheads);
+	nvme_init_subnqn(subsys, ctrl, id);
+	memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
+	memcpy(subsys->model, id->mn, sizeof(subsys->model));
+	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
+	subsys->vendor_id = le16_to_cpu(id->vid);
+	subsys->cmic = id->cmic;
+	subsys->awupf = le16_to_cpu(id->awupf);
+#ifdef CONFIG_NVME_MULTIPATH
+	subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
+
+	subsys->dev.class = nvme_subsys_class;
+	subsys->dev.release = nvme_release_subsystem;
+	subsys->dev.groups = nvme_subsys_attrs_groups;
+	dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
+	device_initialize(&subsys->dev);
+
+	mutex_lock(&nvme_subsystems_lock);
+	found = __nvme_find_get_subsystem(subsys->subnqn);
+	if (found) {
+		put_device(&subsys->dev);
+		subsys = found;
+
+		if (!nvme_validate_cntlid(subsys, ctrl, id)) {
+			ret = -EINVAL;
+			goto out_put_subsystem;
+		}
+	} else {
+		ret = device_add(&subsys->dev);
+		if (ret) {
+			dev_err(ctrl->device,
+				"failed to register subsystem device.\n");
+			put_device(&subsys->dev);
+			goto out_unlock;
+		}
+		ida_init(&subsys->ns_ida);
+		list_add_tail(&subsys->entry, &nvme_subsystems);
+	}
+
+	ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
+				dev_name(ctrl->device));
+	if (ret) {
+		dev_err(ctrl->device,
+			"failed to create sysfs link from subsystem.\n");
+		goto out_put_subsystem;
+	}
+
+	if (!found)
+		subsys->instance = ctrl->instance;
+	ctrl->subsys = subsys;
+	list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
+	mutex_unlock(&nvme_subsystems_lock);
+	return 0;
+
+out_put_subsystem:
+	nvme_put_subsystem(subsys);
+out_unlock:
+	mutex_unlock(&nvme_subsystems_lock);
+	return ret;
+}
+
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+		void *log, size_t size, u64 offset)
+{
+	struct nvme_command c = { };
+	u32 dwlen = nvme_bytes_to_numd(size);
+
+	c.get_log_page.opcode = nvme_admin_get_log_page;
+	c.get_log_page.nsid = cpu_to_le32(nsid);
+	c.get_log_page.lid = log_page;
+	c.get_log_page.lsp = lsp;
+	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
+	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
+	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
+	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
+	c.get_log_page.csi = csi;
+
+	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
+				struct nvme_effects_log **log)
+{
+	struct nvme_effects_log	*cel = xa_load(&ctrl->cels, csi);
+	int ret;
+
+	if (cel)
+		goto out;
+
+	cel = kzalloc(sizeof(*cel), GFP_KERNEL);
+	if (!cel)
+		return -ENOMEM;
+
+	ret = nvme_get_log(ctrl, 0x00, NVME_LOG_CMD_EFFECTS, 0, csi,
+			cel, sizeof(*cel), 0);
+	if (ret) {
+		kfree(cel);
+		return ret;
+	}
+
+	xa_store(&ctrl->cels, csi, cel, GFP_KERNEL);
+out:
+	*log = cel;
+	return 0;
+}
+
+/*
+ * Initialize the cached copies of the Identify data and various controller
+ * register in our nvme_ctrl structure.  This should be called as soon as
+ * the admin queue is fully up and running.
+ */
+int nvme_init_identify(struct nvme_ctrl *ctrl)
+{
+	struct nvme_id_ctrl *id;
+	int ret, page_shift;
+	u32 max_hw_sectors;
+	bool prev_apst_enabled;
+
+	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
+	if (ret) {
+		dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
+		return ret;
+	}
+	page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+	ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+
+	if (ctrl->vs >= NVME_VS(1, 1, 0))
+		ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
+
+	ret = nvme_identify_ctrl(ctrl, &id);
+	if (ret) {
+		dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
+		return -EIO;
+	}
+
+	if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
+		ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
+		if (ret < 0)
+			goto out_free;
+	}
+
+	if (!(ctrl->ops->flags & NVME_F_FABRICS))
+		ctrl->cntlid = le16_to_cpu(id->cntlid);
+
+	if (!ctrl->identified) {
+		int i;
+
+		ret = nvme_init_subsystem(ctrl, id);
+		if (ret)
+			goto out_free;
+
+		/*
+		 * Check for quirks.  Quirk can depend on firmware version,
+		 * so, in principle, the set of quirks present can change
+		 * across a reset.  As a possible future enhancement, we
+		 * could re-scan for quirks every time we reinitialize
+		 * the device, but we'd have to make sure that the driver
+		 * behaves intelligently if the quirks change.
+		 */
+		for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
+			if (quirk_matches(id, &core_quirks[i]))
+				ctrl->quirks |= core_quirks[i].quirks;
+		}
+	}
+
+	if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
+		dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
+		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
+	}
+
+	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+	ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
+	ctrl->oacs = le16_to_cpu(id->oacs);
+	ctrl->oncs = le16_to_cpu(id->oncs);
+	ctrl->mtfa = le16_to_cpu(id->mtfa);
+	ctrl->oaes = le32_to_cpu(id->oaes);
+	ctrl->wctemp = le16_to_cpu(id->wctemp);
+	ctrl->cctemp = le16_to_cpu(id->cctemp);
+
+	atomic_set(&ctrl->abort_limit, id->acl + 1);
+	ctrl->vwc = id->vwc;
+	if (id->mdts)
+		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
+	else
+		max_hw_sectors = UINT_MAX;
+	ctrl->max_hw_sectors =
+		min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
+
+	nvme_set_queue_limits(ctrl, ctrl->admin_q);
+	ctrl->sgls = le32_to_cpu(id->sgls);
+	ctrl->kas = le16_to_cpu(id->kas);
+	ctrl->max_namespaces = le32_to_cpu(id->mnan);
+	ctrl->ctratt = le32_to_cpu(id->ctratt);
+
+	if (id->rtd3e) {
+		/* us -> s */
+		u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
+
+		ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
+						 shutdown_timeout, 60);
+
+		if (ctrl->shutdown_timeout != shutdown_timeout)
+			dev_info(ctrl->device,
+				 "Shutdown timeout set to %u seconds\n",
+				 ctrl->shutdown_timeout);
+	} else
+		ctrl->shutdown_timeout = shutdown_timeout;
+
+	ctrl->npss = id->npss;
+	ctrl->apsta = id->apsta;
+	prev_apst_enabled = ctrl->apst_enabled;
+	if (ctrl->quirks & NVME_QUIRK_NO_APST) {
+		if (force_apst && id->apsta) {
+			dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
+			ctrl->apst_enabled = true;
+		} else {
+			ctrl->apst_enabled = false;
+		}
+	} else {
+		ctrl->apst_enabled = id->apsta;
+	}
+	memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
+
+	if (ctrl->ops->flags & NVME_F_FABRICS) {
+		ctrl->icdoff = le16_to_cpu(id->icdoff);
+		ctrl->ioccsz = le32_to_cpu(id->ioccsz);
+		ctrl->iorcsz = le32_to_cpu(id->iorcsz);
+		ctrl->maxcmd = le16_to_cpu(id->maxcmd);
+
+		/*
+		 * In fabrics we need to verify the cntlid matches the
+		 * admin connect
+		 */
+		if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
+			dev_err(ctrl->device,
+				"Mismatching cntlid: Connect %u vs Identify "
+				"%u, rejecting\n",
+				ctrl->cntlid, le16_to_cpu(id->cntlid));
+			ret = -EINVAL;
+			goto out_free;
+		}
+
+		if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
+			dev_err(ctrl->device,
+				"keep-alive support is mandatory for fabrics\n");
+			ret = -EINVAL;
+			goto out_free;
+		}
+	} else {
+		ctrl->hmpre = le32_to_cpu(id->hmpre);
+		ctrl->hmmin = le32_to_cpu(id->hmmin);
+		ctrl->hmminds = le32_to_cpu(id->hmminds);
+		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
+	}
+
+	ret = nvme_mpath_init_identify(ctrl, id);
+	kfree(id);
+
+	if (ret < 0)
+		return ret;
+
+	if (ctrl->apst_enabled && !prev_apst_enabled)
+		dev_pm_qos_expose_latency_tolerance(ctrl->device);
+	else if (!ctrl->apst_enabled && prev_apst_enabled)
+		dev_pm_qos_hide_latency_tolerance(ctrl->device);
+
+	ret = nvme_configure_apst(ctrl);
+	if (ret < 0)
+		return ret;
+	
+	ret = nvme_configure_timestamp(ctrl);
+	if (ret < 0)
+		return ret;
+
+	ret = nvme_configure_directives(ctrl);
+	if (ret < 0)
+		return ret;
+
+	ret = nvme_configure_acre(ctrl);
+	if (ret < 0)
+		return ret;
+
+	if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
+		ret = nvme_hwmon_init(ctrl);
+		if (ret < 0)
+			return ret;
+	}
+
+	ctrl->identified = true;
+
+	return 0;
+
+out_free:
+	kfree(id);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_init_identify);
+
+static int nvme_dev_open(struct inode *inode, struct file *file)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+	switch (ctrl->state) {
+	case NVME_CTRL_LIVE:
+		break;
+	default:
+		return -EWOULDBLOCK;
+	}
+
+	nvme_get_ctrl(ctrl);
+	if (!try_module_get(ctrl->ops->module)) {
+		nvme_put_ctrl(ctrl);
+		return -EINVAL;
+	}
+
+	file->private_data = ctrl;
+	return 0;
+}
+
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+	module_put(ctrl->ops->module);
+	nvme_put_ctrl(ctrl);
+	return 0;
+}
+
+static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
+{
+	struct nvme_ns *ns;
+	int ret;
+
+	down_read(&ctrl->namespaces_rwsem);
+	if (list_empty(&ctrl->namespaces)) {
+		ret = -ENOTTY;
+		goto out_unlock;
+	}
+
+	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+		dev_warn(ctrl->device,
+			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	dev_warn(ctrl->device,
+		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
+	kref_get(&ns->kref);
+	up_read(&ctrl->namespaces_rwsem);
+
+	ret = nvme_user_cmd(ctrl, ns, argp);
+	nvme_put_ns(ns);
+	return ret;
+
+out_unlock:
+	up_read(&ctrl->namespaces_rwsem);
+	return ret;
+}
+
+static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct nvme_ctrl *ctrl = file->private_data;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case NVME_IOCTL_ADMIN_CMD:
+		return nvme_user_cmd(ctrl, NULL, argp);
+	case NVME_IOCTL_ADMIN64_CMD:
+		return nvme_user_cmd64(ctrl, NULL, argp);
+	case NVME_IOCTL_IO_CMD:
+		return nvme_dev_user_cmd(ctrl, argp);
+	case NVME_IOCTL_RESET:
+		dev_warn(ctrl->device, "resetting controller\n");
+		return nvme_reset_ctrl_sync(ctrl);
+	case NVME_IOCTL_SUBSYS_RESET:
+		return nvme_reset_subsystem(ctrl);
+	case NVME_IOCTL_RESCAN:
+		nvme_queue_scan(ctrl);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct file_operations nvme_dev_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nvme_dev_open,
+	.release	= nvme_dev_release,
+	.unlocked_ioctl	= nvme_dev_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
+};
+
+static ssize_t nvme_sysfs_reset(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	int ret;
+
+	ret = nvme_reset_ctrl_sync(ctrl);
+	if (ret < 0)
+		return ret;
+	return count;
+}
+static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
+
+static ssize_t nvme_sysfs_rescan(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	nvme_queue_scan(ctrl);
+	return count;
+}
+static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan);
+
+static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	if (disk->fops == &nvme_fops)
+		return nvme_get_ns_from_dev(dev)->head;
+	else
+		return disk->private_data;
+}
+
+static ssize_t wwid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns_head *head = dev_to_ns_head(dev);
+	struct nvme_ns_ids *ids = &head->ids;
+	struct nvme_subsystem *subsys = head->subsys;
+	int serial_len = sizeof(subsys->serial);
+	int model_len = sizeof(subsys->model);
+
+	if (!uuid_is_null(&ids->uuid))
+		return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid);
+
+	if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+		return sysfs_emit(buf, "eui.%16phN\n", ids->nguid);
+
+	if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+		return sysfs_emit(buf, "eui.%8phN\n", ids->eui64);
+
+	while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' ||
+				  subsys->serial[serial_len - 1] == '\0'))
+		serial_len--;
+	while (model_len > 0 && (subsys->model[model_len - 1] == ' ' ||
+				 subsys->model[model_len - 1] == '\0'))
+		model_len--;
+
+	return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id,
+		serial_len, subsys->serial, model_len, subsys->model,
+		head->ns_id);
+}
+static DEVICE_ATTR_RO(wwid);
+
+static ssize_t nguid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid);
+}
+static DEVICE_ATTR_RO(nguid);
+
+static ssize_t uuid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
+
+	/* For backward compatibility expose the NGUID to userspace if
+	 * we have no UUID set
+	 */
+	if (uuid_is_null(&ids->uuid)) {
+		dev_warn_ratelimited(dev,
+			"No UUID available providing old NGUID\n");
+		return sysfs_emit(buf, "%pU\n", ids->nguid);
+	}
+	return sysfs_emit(buf, "%pU\n", &ids->uuid);
+}
+static DEVICE_ATTR_RO(uuid);
+
+static ssize_t eui_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64);
+}
+static DEVICE_ATTR_RO(eui);
+
+static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id);
+}
+static DEVICE_ATTR_RO(nsid);
+
+static struct attribute *nvme_ns_id_attrs[] = {
+	&dev_attr_wwid.attr,
+	&dev_attr_uuid.attr,
+	&dev_attr_nguid.attr,
+	&dev_attr_eui.attr,
+	&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&dev_attr_ana_grpid.attr,
+	&dev_attr_ana_state.attr,
+#endif
+	NULL,
+};
+
+static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids;
+
+	if (a == &dev_attr_uuid.attr) {
+		if (uuid_is_null(&ids->uuid) &&
+		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			return 0;
+	}
+	if (a == &dev_attr_nguid.attr) {
+		if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
+			return 0;
+	}
+	if (a == &dev_attr_eui.attr) {
+		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
+			return 0;
+	}
+#ifdef CONFIG_NVME_MULTIPATH
+	if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
+		if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
+			return 0;
+		if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
+			return 0;
+	}
+#endif
+	return a->mode;
+}
+
+static const struct attribute_group nvme_ns_id_attr_group = {
+	.attrs		= nvme_ns_id_attrs,
+	.is_visible	= nvme_ns_id_attrs_are_visible,
+};
+
+const struct attribute_group *nvme_ns_id_attr_groups[] = {
+	&nvme_ns_id_attr_group,
+#ifdef CONFIG_NVM
+	&nvme_nvm_attr_group,
+#endif
+	NULL,
+};
+
+#define nvme_show_str_function(field)						\
+static ssize_t  field##_show(struct device *dev,				\
+			    struct device_attribute *attr, char *buf)		\
+{										\
+        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
+        return sysfs_emit(buf, "%.*s\n",					\
+		(int)sizeof(ctrl->subsys->field), ctrl->subsys->field);		\
+}										\
+static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
+
+nvme_show_str_function(model);
+nvme_show_str_function(serial);
+nvme_show_str_function(firmware_rev);
+
+#define nvme_show_int_function(field)						\
+static ssize_t  field##_show(struct device *dev,				\
+			    struct device_attribute *attr, char *buf)		\
+{										\
+        struct nvme_ctrl *ctrl = dev_get_drvdata(dev);				\
+        return sysfs_emit(buf, "%d\n", ctrl->field);				\
+}										\
+static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
+
+nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
+nvme_show_int_function(queue_count);
+nvme_show_int_function(sqsize);
+
+static ssize_t nvme_sysfs_delete(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (device_remove_file_self(dev, attr))
+		nvme_delete_ctrl_sync(ctrl);
+	return count;
+}
+static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete);
+
+static ssize_t nvme_sysfs_show_transport(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->ops->name);
+}
+static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL);
+
+static ssize_t nvme_sysfs_show_state(struct device *dev,
+				     struct device_attribute *attr,
+				     char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	static const char *const state_name[] = {
+		[NVME_CTRL_NEW]		= "new",
+		[NVME_CTRL_LIVE]	= "live",
+		[NVME_CTRL_RESETTING]	= "resetting",
+		[NVME_CTRL_CONNECTING]	= "connecting",
+		[NVME_CTRL_DELETING]	= "deleting",
+		[NVME_CTRL_DELETING_NOIO]= "deleting (no IO)",
+		[NVME_CTRL_DEAD]	= "dead",
+	};
+
+	if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) &&
+	    state_name[ctrl->state])
+		return sysfs_emit(buf, "%s\n", state_name[ctrl->state]);
+
+	return sysfs_emit(buf, "unknown state\n");
+}
+
+static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL);
+
+static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn);
+}
+static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
+
+static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
+}
+static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
+
+static ssize_t nvme_sysfs_show_hostid(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
+}
+static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
+
+static ssize_t nvme_sysfs_show_address(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE);
+}
+static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
+
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	if (ctrl->opts->max_reconnects == -1)
+		return sysfs_emit(buf, "off\n");
+	return sysfs_emit(buf, "%d\n",
+			  opts->max_reconnects * opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ctrl_loss_tmo, err;
+
+	err = kstrtoint(buf, 10, &ctrl_loss_tmo);
+	if (err)
+		return -EINVAL;
+
+	else if (ctrl_loss_tmo < 0)
+		opts->max_reconnects = -1;
+	else
+		opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+						opts->reconnect_delay);
+	return count;
+}
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
+	nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
+
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (ctrl->opts->reconnect_delay == -1)
+		return sysfs_emit(buf, "off\n");
+	return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned int v;
+	int err;
+
+	err = kstrtou32(buf, 10, &v);
+	if (err)
+		return err;
+
+	ctrl->opts->reconnect_delay = v;
+	return count;
+}
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
+	nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
+
+static struct attribute *nvme_dev_attrs[] = {
+	&dev_attr_reset_controller.attr,
+	&dev_attr_rescan_controller.attr,
+	&dev_attr_model.attr,
+	&dev_attr_serial.attr,
+	&dev_attr_firmware_rev.attr,
+	&dev_attr_cntlid.attr,
+	&dev_attr_delete_controller.attr,
+	&dev_attr_transport.attr,
+	&dev_attr_subsysnqn.attr,
+	&dev_attr_address.attr,
+	&dev_attr_state.attr,
+	&dev_attr_numa_node.attr,
+	&dev_attr_queue_count.attr,
+	&dev_attr_sqsize.attr,
+	&dev_attr_hostnqn.attr,
+	&dev_attr_hostid.attr,
+	&dev_attr_ctrl_loss_tmo.attr,
+	&dev_attr_reconnect_delay.attr,
+	NULL
+};
+
+static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl)
+		return 0;
+	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
+		return 0;
+	if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_hostid.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
+		return 0;
+	if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
+		return 0;
+
+	return a->mode;
+}
+
+static struct attribute_group nvme_dev_attrs_group = {
+	.attrs		= nvme_dev_attrs,
+	.is_visible	= nvme_dev_attrs_are_visible,
+};
+
+static const struct attribute_group *nvme_dev_attr_groups[] = {
+	&nvme_dev_attrs_group,
+	NULL,
+};
+
+static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
+		unsigned nsid)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+
+	list_for_each_entry(h, &subsys->nsheads, entry) {
+		if (h->ns_id == nsid && kref_get_unless_zero(&h->ref))
+			return h;
+	}
+
+	return NULL;
+}
+
+static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
+		struct nvme_ns_ids *ids)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+
+	list_for_each_entry(h, &subsys->nsheads, entry) {
+		if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
+		unsigned nsid, struct nvme_ns_ids *ids)
+{
+	struct nvme_ns_head *head;
+	size_t size = sizeof(*head);
+	int ret = -ENOMEM;
+
+#ifdef CONFIG_NVME_MULTIPATH
+	size += num_possible_nodes() * sizeof(struct nvme_ns *);
+#endif
+
+	head = kzalloc(size, GFP_KERNEL);
+	if (!head)
+		goto out;
+	ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out_free_head;
+	head->instance = ret;
+	INIT_LIST_HEAD(&head->list);
+	ret = init_srcu_struct(&head->srcu);
+	if (ret)
+		goto out_ida_remove;
+	head->subsys = ctrl->subsys;
+	head->ns_id = nsid;
+	head->ids = *ids;
+	kref_init(&head->ref);
+
+	ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids);
+	if (ret) {
+		dev_err(ctrl->device,
+			"duplicate IDs for nsid %d\n", nsid);
+		goto out_cleanup_srcu;
+	}
+
+	if (head->ids.csi) {
+		ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
+		if (ret)
+			goto out_cleanup_srcu;
+	} else
+		head->effects = ctrl->effects;
+
+	ret = nvme_mpath_alloc_disk(ctrl, head);
+	if (ret)
+		goto out_cleanup_srcu;
+
+	list_add_tail(&head->entry, &ctrl->subsys->nsheads);
+
+	kref_get(&ctrl->subsys->ref);
+
+	return head;
+out_cleanup_srcu:
+	cleanup_srcu_struct(&head->srcu);
+out_ida_remove:
+	ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
+out_free_head:
+	kfree(head);
+out:
+	if (ret > 0)
+		ret = blk_status_to_errno(nvme_error_status(ret));
+	return ERR_PTR(ret);
+}
+
+static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
+		struct nvme_ns_ids *ids, bool is_shared)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+	struct nvme_ns_head *head = NULL;
+	int ret = 0;
+
+	mutex_lock(&ctrl->subsys->lock);
+	head = nvme_find_ns_head(ctrl->subsys, nsid);
+	if (!head) {
+		head = nvme_alloc_ns_head(ctrl, nsid, ids);
+		if (IS_ERR(head)) {
+			ret = PTR_ERR(head);
+			goto out_unlock;
+		}
+		head->shared = is_shared;
+	} else {
+		ret = -EINVAL;
+		if (!is_shared || !head->shared) {
+			dev_err(ctrl->device,
+				"Duplicate unshared namespace %d\n", nsid);
+			goto out_put_ns_head;
+		}
+		if (!nvme_ns_ids_equal(&head->ids, ids)) {
+			dev_err(ctrl->device,
+				"IDs don't match for shared namespace %d\n",
+					nsid);
+			goto out_put_ns_head;
+		}
+	}
+
+	list_add_tail(&ns->siblings, &head->list);
+	ns->head = head;
+	mutex_unlock(&ctrl->subsys->lock);
+	return 0;
+
+out_put_ns_head:
+	nvme_put_ns_head(head);
+out_unlock:
+	mutex_unlock(&ctrl->subsys->lock);
+	return ret;
+}
+
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns *ns, *ret = NULL;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (ns->head->ns_id == nsid) {
+			if (!kref_get_unless_zero(&ns->kref))
+				continue;
+			ret = ns;
+			break;
+		}
+		if (ns->head->ns_id > nsid)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
+
+/*
+ * Add the namespace to the controller list while keeping the list ordered.
+ */
+static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
+{
+	struct nvme_ns *tmp;
+
+	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
+		if (tmp->head->ns_id < ns->head->ns_id) {
+			list_add(&ns->list, &tmp->list);
+			return;
+		}
+	}
+	list_add(&ns->list, &ns->ctrl->namespaces);
+}
+
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid,
+		struct nvme_ns_ids *ids)
+{
+	struct nvme_ns *ns;
+	struct gendisk *disk;
+	struct nvme_id_ns *id;
+	char disk_name[DISK_NAME_LEN];
+	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
+
+	if (nvme_identify_ns(ctrl, nsid, ids, &id))
+		return;
+
+	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
+	if (!ns)
+		goto out_free_id;
+
+	ns->queue = blk_mq_init_queue(ctrl->tagset);
+	if (IS_ERR(ns->queue))
+		goto out_free_ns;
+
+	if (ctrl->opts && ctrl->opts->data_digest)
+		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
+
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
+	if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+		blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
+
+	ns->queue->queuedata = ns;
+	ns->ctrl = ctrl;
+	kref_init(&ns->kref);
+
+	ret = nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED);
+	if (ret)
+		goto out_free_queue;
+	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
+
+	disk = alloc_disk_node(0, node);
+	if (!disk)
+		goto out_unlink_ns;
+
+	disk->fops = &nvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->flags = flags;
+	memcpy(disk->disk_name, disk_name, DISK_NAME_LEN);
+	ns->disk = disk;
+
+	if (nvme_update_ns_info(ns, id))
+		goto out_put_disk;
+
+	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+		ret = nvme_nvm_register(ns, disk_name, node);
+		if (ret) {
+			dev_warn(ctrl->device, "LightNVM init failure\n");
+			goto out_put_disk;
+		}
+	}
+
+	down_write(&ctrl->namespaces_rwsem);
+	nvme_ns_add_to_ctrl_list(ns);
+	up_write(&ctrl->namespaces_rwsem);
+	nvme_get_ctrl(ctrl);
+
+	device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
+
+	nvme_mpath_add_disk(ns, id);
+	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
+	kfree(id);
+
+	return;
+ out_put_disk:
+	/* prevent double queue cleanup */
+	ns->disk->queue = NULL;
+	put_disk(ns->disk);
+ out_unlink_ns:
+	mutex_lock(&ctrl->subsys->lock);
+	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
+	mutex_unlock(&ctrl->subsys->lock);
+	nvme_put_ns_head(ns->head);
+ out_free_queue:
+	blk_cleanup_queue(ns->queue);
+ out_free_ns:
+	kfree(ns);
+ out_free_id:
+	kfree(id);
+}
+
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+	if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+		return;
+
+	set_capacity(ns->disk, 0);
+	nvme_fault_inject_fini(&ns->fault_inject);
+
+	mutex_lock(&ns->ctrl->subsys->lock);
+	list_del_rcu(&ns->siblings);
+	if (list_empty(&ns->head->list))
+		list_del_init(&ns->head->entry);
+	mutex_unlock(&ns->ctrl->subsys->lock);
+
+	synchronize_rcu(); /* guarantee not available in head->list */
+	nvme_mpath_clear_current_path(ns);
+	synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
+
+	if (ns->disk->flags & GENHD_FL_UP) {
+		del_gendisk(ns->disk);
+		blk_cleanup_queue(ns->queue);
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
+	}
+
+	down_write(&ns->ctrl->namespaces_rwsem);
+	list_del_init(&ns->list);
+	up_write(&ns->ctrl->namespaces_rwsem);
+
+	nvme_mpath_check_last_path(ns);
+	nvme_put_ns(ns);
+}
+
+static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
+{
+	struct nvme_ns *ns = nvme_find_get_ns(ctrl, nsid);
+
+	if (ns) {
+		nvme_ns_remove(ns);
+		nvme_put_ns(ns);
+	}
+}
+
+static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_ids *ids)
+{
+	struct nvme_id_ns *id;
+	int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+
+	if (test_bit(NVME_NS_DEAD, &ns->flags))
+		goto out;
+
+	ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, ids, &id);
+	if (ret)
+		goto out;
+
+	ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
+	if (!nvme_ns_ids_equal(&ns->head->ids, ids)) {
+		dev_err(ns->ctrl->device,
+			"identifiers changed for nsid %d\n", ns->head->ns_id);
+		goto out_free_id;
+	}
+
+	ret = nvme_update_ns_info(ns, id);
+
+out_free_id:
+	kfree(id);
+out:
+	/*
+	 * Only remove the namespace if we got a fatal error back from the
+	 * device, otherwise ignore the error and just move on.
+	 *
+	 * TODO: we should probably schedule a delayed retry here.
+	 */
+	if (ret > 0 && (ret & NVME_SC_DNR))
+		nvme_ns_remove(ns);
+	else
+		revalidate_disk_size(ns->disk, true);
+}
+
+static void nvme_validate_or_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns_ids ids = { };
+	struct nvme_ns *ns;
+
+	if (nvme_identify_ns_descs(ctrl, nsid, &ids))
+		return;
+
+	ns = nvme_find_get_ns(ctrl, nsid);
+	if (ns) {
+		nvme_validate_ns(ns, &ids);
+		nvme_put_ns(ns);
+		return;
+	}
+
+	switch (ids.csi) {
+	case NVME_CSI_NVM:
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	case NVME_CSI_ZNS:
+		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+			dev_warn(ctrl->device,
+				"nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
+				nsid);
+			break;
+		}
+		if (!nvme_multi_css(ctrl)) {
+			dev_warn(ctrl->device,
+				"command set not reported for nsid: %d\n",
+				nsid);
+			break;
+		}
+		nvme_alloc_ns(ctrl, nsid, &ids);
+		break;
+	default:
+		dev_warn(ctrl->device, "unknown csi %u for nsid %u\n",
+			ids.csi, nsid);
+		break;
+	}
+}
+
+static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
+					unsigned nsid)
+{
+	struct nvme_ns *ns, *next;
+	LIST_HEAD(rm_list);
+
+	down_write(&ctrl->namespaces_rwsem);
+	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
+		if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
+			list_move_tail(&ns->list, &rm_list);
+	}
+	up_write(&ctrl->namespaces_rwsem);
+
+	list_for_each_entry_safe(ns, next, &rm_list, list)
+		nvme_ns_remove(ns);
+
+}
+
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
+{
+	const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
+	__le32 *ns_list;
+	u32 prev = 0;
+	int ret = 0, i;
+
+	if (nvme_ctrl_limited_cns(ctrl))
+		return -EOPNOTSUPP;
+
+	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
+	if (!ns_list)
+		return -ENOMEM;
+
+	for (;;) {
+		struct nvme_command cmd = {
+			.identify.opcode	= nvme_admin_identify,
+			.identify.cns		= NVME_ID_CNS_NS_ACTIVE_LIST,
+			.identify.nsid		= cpu_to_le32(prev),
+		};
+
+		ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
+					    NVME_IDENTIFY_DATA_SIZE);
+		if (ret)
+			goto free;
+
+		for (i = 0; i < nr_entries; i++) {
+			u32 nsid = le32_to_cpu(ns_list[i]);
+
+			if (!nsid)	/* end of the list? */
+				goto out;
+			nvme_validate_or_alloc_ns(ctrl, nsid);
+			while (++prev < nsid)
+				nvme_ns_remove_by_nsid(ctrl, prev);
+		}
+	}
+ out:
+	nvme_remove_invalid_namespaces(ctrl, prev);
+ free:
+	kfree(ns_list);
+	return ret;
+}
+
+static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl)
+{
+	struct nvme_id_ctrl *id;
+	u32 nn, i;
+
+	if (nvme_identify_ctrl(ctrl, &id))
+		return;
+	nn = le32_to_cpu(id->nn);
+	kfree(id);
+
+	for (i = 1; i <= nn; i++)
+		nvme_validate_or_alloc_ns(ctrl, i);
+
+	nvme_remove_invalid_namespaces(ctrl, nn);
+}
+
+static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
+{
+	size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32);
+	__le32 *log;
+	int error;
+
+	log = kzalloc(log_size, GFP_KERNEL);
+	if (!log)
+		return;
+
+	/*
+	 * We need to read the log to clear the AEN, but we don't want to rely
+	 * on it for the changed namespace information as userspace could have
+	 * raced with us in reading the log page, which could cause us to miss
+	 * updates.
+	 */
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+			NVME_CSI_NVM, log, log_size, 0);
+	if (error)
+		dev_warn(ctrl->device,
+			"reading changed ns log failed: %d\n", error);
+
+	kfree(log);
+}
+
+static void nvme_scan_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, scan_work);
+
+	/* No tagset on a live ctrl means IO queues could not created */
+	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
+		return;
+
+	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
+		dev_info(ctrl->device, "rescanning namespaces.\n");
+		nvme_clear_changed_ns_log(ctrl);
+	}
+
+	mutex_lock(&ctrl->scan_lock);
+	if (nvme_scan_ns_list(ctrl) != 0)
+		nvme_scan_ns_sequential(ctrl);
+	mutex_unlock(&ctrl->scan_lock);
+}
+
+/*
+ * This function iterates the namespace list unlocked to allow recovery from
+ * controller failure. It is up to the caller to ensure the namespace list is
+ * not modified by scan work while this function is executing.
+ */
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns, *next;
+	LIST_HEAD(ns_list);
+
+	/*
+	 * make sure to requeue I/O to all namespaces as these
+	 * might result from the scan itself and must complete
+	 * for the scan_work to make progress
+	 */
+	nvme_mpath_clear_ctrl_paths(ctrl);
+
+	/* prevent racing with ns scanning */
+	flush_work(&ctrl->scan_work);
+
+	/*
+	 * The dead states indicates the controller was not gracefully
+	 * disconnected. In that case, we won't be able to flush any data while
+	 * removing the namespaces' disks; fail all the queues now to avoid
+	 * potentially having to clean up the failed sync later.
+	 */
+	if (ctrl->state == NVME_CTRL_DEAD)
+		nvme_kill_queues(ctrl);
+
+	/* this is a no-op when called from the controller reset handler */
+	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+
+	down_write(&ctrl->namespaces_rwsem);
+	list_splice_init(&ctrl->namespaces, &ns_list);
+	up_write(&ctrl->namespaces_rwsem);
+
+	list_for_each_entry_safe(ns, next, &ns_list, list)
+		nvme_ns_remove(ns);
+}
+EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(dev, struct nvme_ctrl, ctrl_device);
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ret;
+
+	ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
+	if (ret)
+		return ret;
+
+	if (opts) {
+		ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
+		if (ret)
+			return ret;
+
+		ret = add_uevent_var(env, "NVME_TRSVCID=%s",
+				opts->trsvcid ?: "none");
+		if (ret)
+			return ret;
+
+		ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
+				opts->host_traddr ?: "none");
+	}
+	return ret;
+}
+
+static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
+{
+	char *envp[2] = { NULL, NULL };
+	u32 aen_result = ctrl->aen_result;
+
+	ctrl->aen_result = 0;
+	if (!aen_result)
+		return;
+
+	envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result);
+	if (!envp[0])
+		return;
+	kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp);
+	kfree(envp[0]);
+}
+
+static void nvme_async_event_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, async_event_work);
+
+	nvme_aen_uevent(ctrl);
+
+	/*
+	 * The transport drivers must guarantee AER submission here is safe by
+	 * flushing ctrl async_event_work after changing the controller state
+	 * from LIVE and before freeing the admin queue.
+	*/
+	if (ctrl->state == NVME_CTRL_LIVE)
+		ctrl->ops->submit_async_event(ctrl);
+}
+
+static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
+{
+
+	u32 csts;
+
+	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
+		return false;
+
+	if (csts == ~0)
+		return false;
+
+	return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
+}
+
+static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
+{
+	struct nvme_fw_slot_info_log *log;
+
+	log = kmalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return;
+
+	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+			log, sizeof(*log), 0))
+		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
+	kfree(log);
+}
+
+static void nvme_fw_act_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(work,
+				struct nvme_ctrl, fw_act_work);
+	unsigned long fw_act_timeout;
+
+	if (ctrl->mtfa)
+		fw_act_timeout = jiffies +
+				msecs_to_jiffies(ctrl->mtfa * 100);
+	else
+		fw_act_timeout = jiffies +
+				msecs_to_jiffies(admin_timeout * 1000);
+
+	nvme_stop_queues(ctrl);
+	while (nvme_ctrl_pp_status(ctrl)) {
+		if (time_after(jiffies, fw_act_timeout)) {
+			dev_warn(ctrl->device,
+				"Fw activation timeout, reset controller\n");
+			nvme_try_sched_reset(ctrl);
+			return;
+		}
+		msleep(100);
+	}
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
+		return;
+
+	nvme_start_queues(ctrl);
+	/* read FW slot information to clear the AER */
+	nvme_get_fw_slot_info(ctrl);
+}
+
+static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
+{
+	u32 aer_notice_type = (result & 0xff00) >> 8;
+
+	trace_nvme_async_event(ctrl, aer_notice_type);
+
+	switch (aer_notice_type) {
+	case NVME_AER_NOTICE_NS_CHANGED:
+		set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
+		nvme_queue_scan(ctrl);
+		break;
+	case NVME_AER_NOTICE_FW_ACT_STARTING:
+		/*
+		 * We are (ab)using the RESETTING state to prevent subsequent
+		 * recovery actions from interfering with the controller's
+		 * firmware activation.
+		 */
+		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+			queue_work(nvme_wq, &ctrl->fw_act_work);
+		break;
+#ifdef CONFIG_NVME_MULTIPATH
+	case NVME_AER_NOTICE_ANA:
+		if (!ctrl->ana_log_buf)
+			break;
+		queue_work(nvme_wq, &ctrl->ana_work);
+		break;
+#endif
+	case NVME_AER_NOTICE_DISC_CHANGED:
+		ctrl->aen_result = result;
+		break;
+	default:
+		dev_warn(ctrl->device, "async event result %08x\n", result);
+	}
+}
+
+void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
+		volatile union nvme_result *res)
+{
+	u32 result = le32_to_cpu(res->u32);
+	u32 aer_type = result & 0x07;
+
+	if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
+		return;
+
+	switch (aer_type) {
+	case NVME_AER_NOTICE:
+		nvme_handle_aen_notice(ctrl, result);
+		break;
+	case NVME_AER_ERROR:
+	case NVME_AER_SMART:
+	case NVME_AER_CSS:
+	case NVME_AER_VS:
+		trace_nvme_async_event(ctrl, aer_type);
+		ctrl->aen_result = result;
+		break;
+	default:
+		break;
+	}
+	queue_work(nvme_wq, &ctrl->async_event_work);
+}
+EXPORT_SYMBOL_GPL(nvme_complete_async_event);
+
+void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_mpath_stop(ctrl);
+	nvme_stop_keep_alive(ctrl);
+	flush_work(&ctrl->async_event_work);
+	cancel_work_sync(&ctrl->fw_act_work);
+	if (ctrl->ops->stop_ctrl)
+		ctrl->ops->stop_ctrl(ctrl);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
+
+void nvme_start_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_start_keep_alive(ctrl);
+
+	nvme_enable_aen(ctrl);
+
+	if (ctrl->queue_count > 1) {
+		nvme_queue_scan(ctrl);
+		nvme_start_queues(ctrl);
+		nvme_mpath_update(ctrl);
+	}
+}
+EXPORT_SYMBOL_GPL(nvme_start_ctrl);
+
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_fault_inject_fini(&ctrl->fault_inject);
+	dev_pm_qos_hide_latency_tolerance(ctrl->device);
+	cdev_device_del(&ctrl->cdev, ctrl->device);
+	nvme_put_ctrl(ctrl);
+}
+EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
+
+static void nvme_free_cels(struct nvme_ctrl *ctrl)
+{
+	struct nvme_effects_log	*cel;
+	unsigned long i;
+
+	xa_for_each (&ctrl->cels, i, cel) {
+		xa_erase(&ctrl->cels, i);
+		kfree(cel);
+	}
+
+	xa_destroy(&ctrl->cels);
+}
+
+static void nvme_free_ctrl(struct device *dev)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(dev, struct nvme_ctrl, ctrl_device);
+	struct nvme_subsystem *subsys = ctrl->subsys;
+
+	if (!subsys || ctrl->instance != subsys->instance)
+		ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+
+	nvme_free_cels(ctrl);
+	nvme_mpath_uninit(ctrl);
+	__free_page(ctrl->discard_page);
+
+	if (subsys) {
+		mutex_lock(&nvme_subsystems_lock);
+		list_del(&ctrl->subsys_entry);
+		sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+		mutex_unlock(&nvme_subsystems_lock);
+	}
+
+	ctrl->ops->free_ctrl(ctrl);
+
+	if (subsys)
+		nvme_put_subsystem(subsys);
+}
+
+/*
+ * Initialize a NVMe controller structures.  This needs to be called during
+ * earliest initialization so that we have the initialized structured around
+ * during probing.
+ */
+int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
+		const struct nvme_ctrl_ops *ops, unsigned long quirks)
+{
+	int ret;
+
+	ctrl->state = NVME_CTRL_NEW;
+	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->scan_lock);
+	INIT_LIST_HEAD(&ctrl->namespaces);
+	xa_init(&ctrl->cels);
+	init_rwsem(&ctrl->namespaces_rwsem);
+	ctrl->dev = dev;
+	ctrl->ops = ops;
+	ctrl->quirks = quirks;
+	ctrl->numa_node = NUMA_NO_NODE;
+	INIT_WORK(&ctrl->scan_work, nvme_scan_work);
+	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
+	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
+	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+	init_waitqueue_head(&ctrl->state_wq);
+
+	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
+	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
+	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+
+	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+			PAGE_SIZE);
+	ctrl->discard_page = alloc_page(GFP_KERNEL);
+	if (!ctrl->discard_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out;
+	ctrl->instance = ret;
+
+	device_initialize(&ctrl->ctrl_device);
+	ctrl->device = &ctrl->ctrl_device;
+	ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance);
+	ctrl->device->class = nvme_class;
+	ctrl->device->parent = ctrl->dev;
+	ctrl->device->groups = nvme_dev_attr_groups;
+	ctrl->device->release = nvme_free_ctrl;
+	dev_set_drvdata(ctrl->device, ctrl);
+	ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
+	if (ret)
+		goto out_release_instance;
+
+	nvme_get_ctrl(ctrl);
+	cdev_init(&ctrl->cdev, &nvme_dev_fops);
+	ctrl->cdev.owner = ops->module;
+	ret = cdev_device_add(&ctrl->cdev, ctrl->device);
+	if (ret)
+		goto out_free_name;
+
+	/*
+	 * Initialize latency tolerance controls.  The sysfs files won't
+	 * be visible to userspace unless the device actually supports APST.
+	 */
+	ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
+	dev_pm_qos_update_user_latency_tolerance(ctrl->device,
+		min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+
+	nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
+	nvme_mpath_init_ctrl(ctrl);
+
+	return 0;
+out_free_name:
+	nvme_put_ctrl(ctrl);
+	kfree_const(ctrl->device->kobj.name);
+out_release_instance:
+	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+out:
+	if (ctrl->discard_page)
+		__free_page(ctrl->discard_page);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_init_ctrl);
+
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+
+	/* Forcibly unquiesce queues to avoid blocking dispatch */
+	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
+		blk_mq_unquiesce_queue(ctrl->admin_q);
+
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		nvme_set_queue_dying(ns);
+
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_kill_queues);
+
+void nvme_unfreeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_unfreeze_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unfreeze);
+
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
+		if (timeout <= 0)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	return timeout;
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
+
+void nvme_wait_freeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_freeze_queue_wait(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze);
+
+void nvme_start_freeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_freeze_queue_start(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_start_freeze);
+
+void nvme_stop_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_quiesce_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_stop_queues);
+
+void nvme_start_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_unquiesce_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_start_queues);
+
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_sync_queue(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
+
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+	nvme_sync_io_queues(ctrl);
+	if (ctrl->admin_q)
+		blk_sync_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file)
+{
+	if (file->f_op != &nvme_dev_fops)
+		return NULL;
+	return file->private_data;
+}
+EXPORT_SYMBOL_NS_GPL(nvme_ctrl_from_file, NVME_TARGET_PASSTHRU);
+
+/*
+ * Check we didn't inadvertently grow the command structure sizes:
+ */
+static inline void _nvme_check_size(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE);
+	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
+}
+
+
+static int __init nvme_core_init(void)
+{
+	int result = -ENOMEM;
+
+	_nvme_check_size();
+
+	nvme_wq = alloc_workqueue("nvme-wq",
+			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+	if (!nvme_wq)
+		goto out;
+
+	nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
+			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+	if (!nvme_reset_wq)
+		goto destroy_wq;
+
+	nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
+			WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+	if (!nvme_delete_wq)
+		goto destroy_reset_wq;
+
+	result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
+	if (result < 0)
+		goto destroy_delete_wq;
+
+	nvme_class = class_create(THIS_MODULE, "nvme");
+	if (IS_ERR(nvme_class)) {
+		result = PTR_ERR(nvme_class);
+		goto unregister_chrdev;
+	}
+	nvme_class->dev_uevent = nvme_class_uevent;
+
+	nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
+	if (IS_ERR(nvme_subsys_class)) {
+		result = PTR_ERR(nvme_subsys_class);
+		goto destroy_class;
+	}
+	return 0;
+
+destroy_class:
+	class_destroy(nvme_class);
+unregister_chrdev:
+	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+destroy_delete_wq:
+	destroy_workqueue(nvme_delete_wq);
+destroy_reset_wq:
+	destroy_workqueue(nvme_reset_wq);
+destroy_wq:
+	destroy_workqueue(nvme_wq);
+out:
+	return result;
+}
+
+static void __exit nvme_core_exit(void)
+{
+	class_destroy(nvme_subsys_class);
+	class_destroy(nvme_class);
+	unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+	destroy_workqueue(nvme_delete_wq);
+	destroy_workqueue(nvme_reset_wq);
+	destroy_workqueue(nvme_wq);
+	ida_destroy(&nvme_instance_ida);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+module_init(nvme_core_init);
+module_exit(nvme_core_exit);
--- a/feed/kmod-nvme/src/fabrics.h
+++ b/feed/kmod-nvme/src/fabrics.h
@ -0,0 +1,192 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NVMe over Fabrics common host code.
+ * Copyright (c) 2015-2016 HGST, a Western Digital Company.
+ */
+#ifndef _NVME_FABRICS_H
+#define _NVME_FABRICS_H 1
+
+#include <linux/in.h>
+#include <linux/inet.h>
+
+#define NVMF_MIN_QUEUE_SIZE	16
+#define NVMF_MAX_QUEUE_SIZE	1024
+#define NVMF_DEF_QUEUE_SIZE	128
+#define NVMF_DEF_RECONNECT_DELAY	10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO		600
+
+/*
+ * Define a host as seen by the target.  We allocate one at boot, but also
+ * allow the override it when creating controllers.  This is both to provide
+ * persistence of the Host NQN over multiple boots, and to allow using
+ * multiple ones, for example in a container scenario.  Because we must not
+ * use different Host NQNs with the same Host ID we generate a Host ID and
+ * use this structure to keep track of the relation between the two.
+ */
+struct nvmf_host {
+	struct kref		ref;
+	struct list_head	list;
+	char			nqn[NVMF_NQN_SIZE];
+	uuid_t			id;
+};
+
+/**
+ * enum nvmf_parsing_opts - used to define the sysfs parsing options used.
+ */
+enum {
+	NVMF_OPT_ERR		= 0,
+	NVMF_OPT_TRANSPORT	= 1 << 0,
+	NVMF_OPT_NQN		= 1 << 1,
+	NVMF_OPT_TRADDR		= 1 << 2,
+	NVMF_OPT_TRSVCID	= 1 << 3,
+	NVMF_OPT_QUEUE_SIZE	= 1 << 4,
+	NVMF_OPT_NR_IO_QUEUES	= 1 << 5,
+	NVMF_OPT_TL_RETRY_COUNT	= 1 << 6,
+	NVMF_OPT_KATO		= 1 << 7,
+	NVMF_OPT_HOSTNQN	= 1 << 8,
+	NVMF_OPT_RECONNECT_DELAY = 1 << 9,
+	NVMF_OPT_HOST_TRADDR	= 1 << 10,
+	NVMF_OPT_CTRL_LOSS_TMO	= 1 << 11,
+	NVMF_OPT_HOST_ID	= 1 << 12,
+	NVMF_OPT_DUP_CONNECT	= 1 << 13,
+	NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
+	NVMF_OPT_HDR_DIGEST	= 1 << 15,
+	NVMF_OPT_DATA_DIGEST	= 1 << 16,
+	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
+	NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
+	NVMF_OPT_TOS		= 1 << 19,
+};
+
+/**
+ * struct nvmf_ctrl_options - Used to hold the options specified
+ *			      with the parsing opts enum.
+ * @mask:	Used by the fabrics library to parse through sysfs options
+ *		on adding a NVMe controller.
+ * @transport:	Holds the fabric transport "technology name" (for a lack of
+ *		better description) that will be used by an NVMe controller
+ *		being added.
+ * @subsysnqn:	Hold the fully qualified NQN subystem name (format defined
+ *		in the NVMe specification, "NVMe Qualified Names").
+ * @traddr:	The transport-specific TRADDR field for a port on the
+ *              subsystem which is adding a controller.
+ * @trsvcid:	The transport-specific TRSVCID field for a port on the
+ *              subsystem which is adding a controller.
+ * @host_traddr: A transport-specific field identifying the NVME host port
+ *              to use for the connection to the controller.
+ * @queue_size: Number of IO queue elements.
+ * @nr_io_queues: Number of controller IO queues that will be established.
+ * @reconnect_delay: Time between two consecutive reconnect attempts.
+ * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
+ * @kato:	Keep-alive timeout.
+ * @host:	Virtual NVMe host, contains the NQN and Host ID.
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ *              the controller, (-1) means reconnect forever, zero means remove
+ *              immediately;
+ * @disable_sqflow: disable controller sq flow control
+ * @hdr_digest: generate/verify header digest (TCP)
+ * @data_digest: generate/verify data digest (TCP)
+ * @nr_write_queues: number of queues for write I/O
+ * @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
+ */
+struct nvmf_ctrl_options {
+	unsigned		mask;
+	char			*transport;
+	char			*subsysnqn;
+	char			*traddr;
+	char			*trsvcid;
+	char			*host_traddr;
+	size_t			queue_size;
+	unsigned int		nr_io_queues;
+	unsigned int		reconnect_delay;
+	bool			discovery_nqn;
+	bool			duplicate_connect;
+	unsigned int		kato;
+	struct nvmf_host	*host;
+	int			max_reconnects;
+	bool			disable_sqflow;
+	bool			hdr_digest;
+	bool			data_digest;
+	unsigned int		nr_write_queues;
+	unsigned int		nr_poll_queues;
+	int			tos;
+};
+
+/*
+ * struct nvmf_transport_ops - used to register a specific
+ *			       fabric implementation of NVMe fabrics.
+ * @entry:		Used by the fabrics library to add the new
+ *			registration entry to its linked-list internal tree.
+ * @module:             Transport module reference
+ * @name:		Name of the NVMe fabric driver implementation.
+ * @required_opts:	sysfs command-line options that must be specified
+ *			when adding a new NVMe controller.
+ * @allowed_opts:	sysfs command-line options that can be specified
+ *			when adding a new NVMe controller.
+ * @create_ctrl():	function pointer that points to a non-NVMe
+ *			implementation-specific fabric technology
+ *			that would go into starting up that fabric
+ *			for the purpose of conneciton to an NVMe controller
+ *			using that fabric technology.
+ *
+ * Notes:
+ *	1. At minimum, 'required_opts' and 'allowed_opts' should
+ *	   be set to the same enum parsing options defined earlier.
+ *	2. create_ctrl() must be defined (even if it does nothing)
+ *	3. struct nvmf_transport_ops must be statically allocated in the
+ *	   modules .bss section so that a pure module_get on @module
+ *	   prevents the memory from beeing freed.
+ */
+struct nvmf_transport_ops {
+	struct list_head	entry;
+	struct module		*module;
+	const char		*name;
+	int			required_opts;
+	int			allowed_opts;
+	struct nvme_ctrl	*(*create_ctrl)(struct device *dev,
+					struct nvmf_ctrl_options *opts);
+};
+
+static inline bool
+nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
+			struct nvmf_ctrl_options *opts)
+{
+	if (ctrl->state == NVME_CTRL_DELETING ||
+	    ctrl->state == NVME_CTRL_DELETING_NOIO ||
+	    ctrl->state == NVME_CTRL_DEAD ||
+	    strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) ||
+	    strcmp(opts->host->nqn, ctrl->opts->host->nqn) ||
+	    memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t)))
+		return false;
+
+	return true;
+}
+
+int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
+int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
+int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
+int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll);
+int nvmf_register_transport(struct nvmf_transport_ops *ops);
+void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
+void nvmf_free_options(struct nvmf_ctrl_options *opts);
+int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
+blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
+		struct request *rq);
+bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+		bool queue_live);
+bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
+		struct nvmf_ctrl_options *opts);
+
+static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
+		bool queue_live)
+{
+	if (likely(ctrl->state == NVME_CTRL_LIVE ||
+		   ctrl->state == NVME_CTRL_DELETING))
+		return true;
+	return __nvmf_check_ready(ctrl, rq, queue_live);
+}
+
+#endif /* _NVME_FABRICS_H */
--- a/feed/kmod-nvme/src/nvme.h
+++ b/feed/kmod-nvme/src/nvme.h
@ -0,0 +1,893 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2011-2014, Intel Corporation.
+ */
+
+#ifndef _NVME_H
+#define _NVME_H
+
+#include <linux/nvme.h>
+#include <linux/cdev.h>
+#include <linux/pci.h>
+#include <linux/kref.h>
+#include <linux/blk-mq.h>
+#include <linux/lightnvm.h>
+#include <linux/sed-opal.h>
+#include <linux/fault-inject.h>
+#include <linux/rcupdate.h>
+#include <linux/wait.h>
+#include <linux/t10-pi.h>
+
+#include <trace/events/block.h>
+
+extern unsigned int nvme_io_timeout;
+#define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
+
+extern unsigned int admin_timeout;
+#define ADMIN_TIMEOUT	(admin_timeout * HZ)
+
+#define NVME_DEFAULT_KATO	5
+#define NVME_KATO_GRACE		10
+
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define  NVME_INLINE_SG_CNT  0
+#define  NVME_INLINE_METADATA_SG_CNT  0
+#else
+#define  NVME_INLINE_SG_CNT  2
+#define  NVME_INLINE_METADATA_SG_CNT  1
+#endif
+
+/*
+ * Default to a 4K page size, with the intention to update this
+ * path in the future to accommodate architectures with differing
+ * kernel and IO page sizes.
+ */
+#define NVME_CTRL_PAGE_SHIFT	12
+#define NVME_CTRL_PAGE_SIZE	(1 << NVME_CTRL_PAGE_SHIFT)
+
+extern struct workqueue_struct *nvme_wq;
+extern struct workqueue_struct *nvme_reset_wq;
+extern struct workqueue_struct *nvme_delete_wq;
+
+enum {
+	NVME_NS_LBA		= 0,
+	NVME_NS_LIGHTNVM	= 1,
+};
+
+/*
+ * List of workarounds for devices that required behavior not specified in
+ * the standard.
+ */
+enum nvme_quirks {
+	/*
+	 * Prefers I/O aligned to a stripe size specified in a vendor
+	 * specific Identify field.
+	 */
+	NVME_QUIRK_STRIPE_SIZE			= (1 << 0),
+
+	/*
+	 * The controller doesn't handle Identify value others than 0 or 1
+	 * correctly.
+	 */
+	NVME_QUIRK_IDENTIFY_CNS			= (1 << 1),
+
+	/*
+	 * The controller deterministically returns O's on reads to
+	 * logical blocks that deallocate was called on.
+	 */
+	NVME_QUIRK_DEALLOCATE_ZEROES		= (1 << 2),
+
+	/*
+	 * The controller needs a delay before starts checking the device
+	 * readiness, which is done by reading the NVME_CSTS_RDY bit.
+	 */
+	NVME_QUIRK_DELAY_BEFORE_CHK_RDY		= (1 << 3),
+
+	/*
+	 * APST should not be used.
+	 */
+	NVME_QUIRK_NO_APST			= (1 << 4),
+
+	/*
+	 * The deepest sleep state should not be used.
+	 */
+	NVME_QUIRK_NO_DEEPEST_PS		= (1 << 5),
+
+	/*
+	 * Supports the LighNVM command set if indicated in vs[1].
+	 */
+	NVME_QUIRK_LIGHTNVM			= (1 << 6),
+
+	/*
+	 * Set MEDIUM priority on SQ creation
+	 */
+	NVME_QUIRK_MEDIUM_PRIO_SQ		= (1 << 7),
+
+	/*
+	 * Ignore device provided subnqn.
+	 */
+	NVME_QUIRK_IGNORE_DEV_SUBNQN		= (1 << 8),
+
+	/*
+	 * Broken Write Zeroes.
+	 */
+	NVME_QUIRK_DISABLE_WRITE_ZEROES		= (1 << 9),
+
+	/*
+	 * Force simple suspend/resume path.
+	 */
+	NVME_QUIRK_SIMPLE_SUSPEND		= (1 << 10),
+
+	/*
+	 * Use only one interrupt vector for all queues
+	 */
+	NVME_QUIRK_SINGLE_VECTOR		= (1 << 11),
+
+	/*
+	 * Use non-standard 128 bytes SQEs.
+	 */
+	NVME_QUIRK_128_BYTES_SQES		= (1 << 12),
+
+	/*
+	 * Prevent tag overlap between queues
+	 */
+	NVME_QUIRK_SHARED_TAGS                  = (1 << 13),
+
+	/*
+	 * Don't change the value of the temperature threshold feature
+	 */
+	NVME_QUIRK_NO_TEMP_THRESH_CHANGE	= (1 << 14),
+
+	/*
+	 * The controller doesn't handle the Identify Namespace
+	 * Identification Descriptor list subcommand despite claiming
+	 * NVMe 1.3 compliance.
+	 */
+	NVME_QUIRK_NO_NS_DESC_LIST		= (1 << 15),
+
+	/*
+	 * The controller requires the command_id value be be limited, so skip
+	 * encoding the generation sequence number.
+	 */
+	NVME_QUIRK_SKIP_CID_GEN			= (1 << 17),
+
+	/*
+	 * Reports garbage in the namespace identifiers (eui64, nguid, uuid).
+	 */
+	NVME_QUIRK_BOGUS_NID			= (1 << 18),
+};
+
+/*
+ * Common request structure for NVMe passthrough.  All drivers must have
+ * this structure as the first member of their request-private data.
+ */
+struct nvme_request {
+	struct nvme_command	*cmd;
+	union nvme_result	result;
+	u8			genctr;
+	u8			retries;
+	u8			flags;
+	u16			status;
+	struct nvme_ctrl	*ctrl;
+};
+
+/*
+ * Mark a bio as coming in through the mpath node.
+ */
+#define REQ_NVME_MPATH		REQ_DRV
+
+enum {
+	NVME_REQ_CANCELLED		= (1 << 0),
+	NVME_REQ_USERCMD		= (1 << 1),
+};
+
+static inline struct nvme_request *nvme_req(struct request *req)
+{
+	return blk_mq_rq_to_pdu(req);
+}
+
+static inline u16 nvme_req_qid(struct request *req)
+{
+	if (!req->q->queuedata)
+		return 0;
+	return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1;
+}
+
+/* The below value is the specific amount of delay needed before checking
+ * readiness in case of the PCI_DEVICE(0x1c58, 0x0003), which needs the
+ * NVME_QUIRK_DELAY_BEFORE_CHK_RDY quirk enabled. The value (in ms) was
+ * found empirically.
+ */
+#define NVME_QUIRK_DELAY_AMOUNT		2300
+
+/*
+ * enum nvme_ctrl_state: Controller state
+ *
+ * @NVME_CTRL_NEW:		New controller just allocated, initial state
+ * @NVME_CTRL_LIVE:		Controller is connected and I/O capable
+ * @NVME_CTRL_RESETTING:	Controller is resetting (or scheduled reset)
+ * @NVME_CTRL_CONNECTING:	Controller is disconnected, now connecting the
+ *				transport
+ * @NVME_CTRL_DELETING:		Controller is deleting (or scheduled deletion)
+ * @NVME_CTRL_DELETING_NOIO:	Controller is deleting and I/O is not
+ *				disabled/failed immediately. This state comes
+ * 				after all async event processing took place and
+ * 				before ns removal and the controller deletion
+ * 				progress
+ * @NVME_CTRL_DEAD:		Controller is non-present/unresponsive during
+ *				shutdown or removal. In this case we forcibly
+ *				kill all inflight I/O as they have no chance to
+ *				complete
+ */
+enum nvme_ctrl_state {
+	NVME_CTRL_NEW,
+	NVME_CTRL_LIVE,
+	NVME_CTRL_RESETTING,
+	NVME_CTRL_CONNECTING,
+	NVME_CTRL_DELETING,
+	NVME_CTRL_DELETING_NOIO,
+	NVME_CTRL_DEAD,
+};
+
+struct nvme_fault_inject {
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct fault_attr attr;
+	struct dentry *parent;
+	bool dont_retry;	/* DNR, do not retry */
+	u16 status;		/* status code */
+#endif
+};
+
+struct nvme_ctrl {
+	bool comp_seen;
+	enum nvme_ctrl_state state;
+	bool identified;
+	spinlock_t lock;
+	struct mutex scan_lock;
+	const struct nvme_ctrl_ops *ops;
+	struct request_queue *admin_q;
+	struct request_queue *connect_q;
+	struct request_queue *fabrics_q;
+	struct device *dev;
+	int instance;
+	int numa_node;
+	struct blk_mq_tag_set *tagset;
+	struct blk_mq_tag_set *admin_tagset;
+	struct list_head namespaces;
+	struct rw_semaphore namespaces_rwsem;
+	struct device ctrl_device;
+	struct device *device;	/* char device */
+	struct cdev cdev;
+	struct work_struct reset_work;
+	struct work_struct delete_work;
+	wait_queue_head_t state_wq;
+
+	struct nvme_subsystem *subsys;
+	struct list_head subsys_entry;
+
+	struct opal_dev *opal_dev;
+
+	char name[12];
+	u16 cntlid;
+
+	u32 ctrl_config;
+	u16 mtfa;
+	u32 queue_count;
+
+	u64 cap;
+	u32 max_hw_sectors;
+	u32 max_segments;
+	u32 max_integrity_segments;
+#ifdef CONFIG_BLK_DEV_ZONED
+	u32 max_zone_append;
+#endif
+	u16 crdt[3];
+	u16 oncs;
+	u16 oacs;
+	u16 nssa;
+	u16 nr_streams;
+	u16 sqsize;
+	u32 max_namespaces;
+	atomic_t abort_limit;
+	u8 vwc;
+	u32 vs;
+	u32 sgls;
+	u16 kas;
+	u8 npss;
+	u8 apsta;
+	u16 wctemp;
+	u16 cctemp;
+	u32 oaes;
+	u32 aen_result;
+	u32 ctratt;
+	unsigned int shutdown_timeout;
+	unsigned int kato;
+	bool subsystem;
+	unsigned long quirks;
+	struct nvme_id_power_state psd[32];
+	struct nvme_effects_log *effects;
+	struct xarray cels;
+	struct work_struct scan_work;
+	struct work_struct async_event_work;
+	struct delayed_work ka_work;
+	struct nvme_command ka_cmd;
+	struct work_struct fw_act_work;
+	unsigned long events;
+
+#ifdef CONFIG_NVME_MULTIPATH
+	/* asymmetric namespace access: */
+	u8 anacap;
+	u8 anatt;
+	u32 anagrpmax;
+	u32 nanagrpid;
+	struct mutex ana_lock;
+	struct nvme_ana_rsp_hdr *ana_log_buf;
+	size_t ana_log_size;
+	struct timer_list anatt_timer;
+	struct work_struct ana_work;
+#endif
+
+	/* Power saving configuration */
+	u64 ps_max_latency_us;
+	bool apst_enabled;
+
+	/* PCIe only: */
+	u32 hmpre;
+	u32 hmmin;
+	u32 hmminds;
+	u16 hmmaxd;
+
+	/* Fabrics only */
+	u32 ioccsz;
+	u32 iorcsz;
+	u16 icdoff;
+	u16 maxcmd;
+	int nr_reconnects;
+	struct nvmf_ctrl_options *opts;
+
+	struct page *discard_page;
+	unsigned long discard_page_busy;
+
+	struct nvme_fault_inject fault_inject;
+};
+
+enum nvme_iopolicy {
+	NVME_IOPOLICY_NUMA,
+	NVME_IOPOLICY_RR,
+};
+
+struct nvme_subsystem {
+	int			instance;
+	struct device		dev;
+	/*
+	 * Because we unregister the device on the last put we need
+	 * a separate refcount.
+	 */
+	struct kref		ref;
+	struct list_head	entry;
+	struct mutex		lock;
+	struct list_head	ctrls;
+	struct list_head	nsheads;
+	char			subnqn[NVMF_NQN_SIZE];
+	char			serial[20];
+	char			model[40];
+	char			firmware_rev[8];
+	u8			cmic;
+	u16			vendor_id;
+	u16			awupf;	/* 0's based awupf value. */
+	struct ida		ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_iopolicy	iopolicy;
+#endif
+};
+
+/*
+ * Container structure for uniqueue namespace identifiers.
+ */
+struct nvme_ns_ids {
+	u8	eui64[8];
+	u8	nguid[16];
+	uuid_t	uuid;
+	u8	csi;
+};
+
+/*
+ * Anchor structure for namespaces.  There is one for each namespace in a
+ * NVMe subsystem that any of our controllers can see, and the namespace
+ * structure for each controller is chained of it.  For private namespaces
+ * there is a 1:1 relation to our namespace structures, that is ->list
+ * only ever has a single entry for private namespaces.
+ */
+struct nvme_ns_head {
+	struct list_head	list;
+	struct srcu_struct      srcu;
+	struct nvme_subsystem	*subsys;
+	unsigned		ns_id;
+	struct nvme_ns_ids	ids;
+	struct list_head	entry;
+	struct kref		ref;
+	bool			shared;
+	int			instance;
+	struct nvme_effects_log *effects;
+#ifdef CONFIG_NVME_MULTIPATH
+	struct gendisk		*disk;
+	struct bio_list		requeue_list;
+	spinlock_t		requeue_lock;
+	struct work_struct	requeue_work;
+	struct mutex		lock;
+	unsigned long		flags;
+#define NVME_NSHEAD_DISK_LIVE	0
+	struct nvme_ns __rcu	*current_path[];
+#endif
+};
+
+enum nvme_ns_features {
+	NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
+	NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
+};
+
+struct nvme_ns {
+	struct list_head list;
+
+	struct nvme_ctrl *ctrl;
+	struct request_queue *queue;
+	struct gendisk *disk;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_ana_state ana_state;
+	u32 ana_grpid;
+#endif
+	struct list_head siblings;
+	struct nvm_dev *ndev;
+	struct kref kref;
+	struct nvme_ns_head *head;
+
+	int lba_shift;
+	u16 ms;
+	u16 sgs;
+	u32 sws;
+	u8 pi_type;
+#ifdef CONFIG_BLK_DEV_ZONED
+	u64 zsze;
+#endif
+	unsigned long features;
+	unsigned long flags;
+#define NVME_NS_REMOVING	0
+#define NVME_NS_DEAD     	1
+#define NVME_NS_ANA_PENDING	2
+
+	struct nvme_fault_inject fault_inject;
+
+};
+
+/* NVMe ns supports metadata actions by the controller (generate/strip) */
+static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+{
+	return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
+}
+
+struct nvme_ctrl_ops {
+	const char *name;
+	struct module *module;
+	unsigned int flags;
+#define NVME_F_FABRICS			(1 << 0)
+#define NVME_F_METADATA_SUPPORTED	(1 << 1)
+#define NVME_F_PCI_P2PDMA		(1 << 2)
+	int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
+	int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
+	int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+	void (*free_ctrl)(struct nvme_ctrl *ctrl);
+	void (*submit_async_event)(struct nvme_ctrl *ctrl);
+	void (*delete_ctrl)(struct nvme_ctrl *ctrl);
+	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+};
+
+/*
+ * nvme command_id is constructed as such:
+ * | xxxx | xxxxxxxxxxxx |
+ *   gen    request tag
+ */
+#define nvme_genctr_mask(gen)			(gen & 0xf)
+#define nvme_cid_install_genctr(gen)		(nvme_genctr_mask(gen) << 12)
+#define nvme_genctr_from_cid(cid)		((cid & 0xf000) >> 12)
+#define nvme_tag_from_cid(cid)			(cid & 0xfff)
+
+static inline u16 nvme_cid(struct request *rq)
+{
+	return nvme_cid_install_genctr(nvme_req(rq)->genctr) | rq->tag;
+}
+
+static inline struct request *nvme_find_rq(struct blk_mq_tags *tags,
+		u16 command_id)
+{
+	u8 genctr = nvme_genctr_from_cid(command_id);
+	u16 tag = nvme_tag_from_cid(command_id);
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(tags, tag);
+	if (unlikely(!rq)) {
+		pr_err("could not locate request for tag %#x\n",
+			tag);
+		return NULL;
+	}
+	if (unlikely(nvme_genctr_mask(nvme_req(rq)->genctr) != genctr)) {
+		dev_err(nvme_req(rq)->ctrl->device,
+			"request %#x genctr mismatch (got %#x expected %#x)\n",
+			tag, genctr, nvme_genctr_mask(nvme_req(rq)->genctr));
+		return NULL;
+	}
+	return rq;
+}
+
+static inline struct request *nvme_cid_to_rq(struct blk_mq_tags *tags,
+                u16 command_id)
+{
+	return blk_mq_tag_to_rq(tags, nvme_tag_from_cid(command_id));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
+			    const char *dev_name);
+void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject);
+void nvme_should_fail(struct request *req);
+#else
+static inline void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
+					  const char *dev_name)
+{
+}
+static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj)
+{
+}
+static inline void nvme_should_fail(struct request *req) {}
+#endif
+
+static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
+{
+	if (!ctrl->subsystem)
+		return -ENOTTY;
+	return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
+}
+
+/*
+ * Convert a 512B sector number to a device logical block number.
+ */
+static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
+{
+	return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
+/*
+ * Convert a device logical block number to a 512B sector number.
+ */
+static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
+{
+	return lba << (ns->lba_shift - SECTOR_SHIFT);
+}
+
+/*
+ * Convert byte length to nvme's 0-based num dwords
+ */
+static inline u32 nvme_bytes_to_numd(size_t len)
+{
+	return (len >> 2) - 1;
+}
+
+static inline bool nvme_is_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static inline bool nvme_is_path_error(u16 status)
+{
+	/* check for a status code type of 'path related status' */
+	return (status & 0x700) == 0x300;
+}
+
+/*
+ * Fill in the status and result information from the CQE, and then figure out
+ * if blk-mq will need to use IPI magic to complete the request, and if yes do
+ * so.  If not let the caller complete the request without an indirect function
+ * call.
+ */
+static inline bool nvme_try_complete_req(struct request *req, __le16 status,
+		union nvme_result result)
+{
+	struct nvme_request *rq = nvme_req(req);
+
+	rq->status = le16_to_cpu(status) >> 1;
+	rq->result = result;
+	/* inject error when permitted by fault injection framework */
+	nvme_should_fail(req);
+	if (unlikely(blk_should_fake_timeout(req->q)))
+		return true;
+	return blk_mq_complete_request_remote(req);
+}
+
+static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl)
+{
+	get_device(ctrl->device);
+}
+
+static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
+{
+	put_device(ctrl->device);
+}
+
+static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
+{
+	return !qid &&
+		nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
+}
+
+void nvme_complete_rq(struct request *req);
+bool nvme_cancel_request(struct request *req, void *data, bool reserved);
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
+bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
+		enum nvme_ctrl_state new_state);
+bool nvme_wait_reset(struct nvme_ctrl *ctrl);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
+int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
+		const struct nvme_ctrl_ops *ops, unsigned long quirks);
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
+void nvme_start_ctrl(struct nvme_ctrl *ctrl);
+void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
+int nvme_init_identify(struct nvme_ctrl *ctrl);
+
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
+
+int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+		bool send);
+
+void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
+		volatile union nvme_result *res);
+
+void nvme_stop_queues(struct nvme_ctrl *ctrl);
+void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
+void nvme_unfreeze(struct nvme_ctrl *ctrl);
+void nvme_wait_freeze(struct nvme_ctrl *ctrl);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+void nvme_start_freeze(struct nvme_ctrl *ctrl);
+
+#define NVME_QID_ANY -1
+struct request *nvme_alloc_request(struct request_queue *q,
+		struct nvme_command *cmd, blk_mq_req_flags_t flags);
+struct request *nvme_alloc_request_qid(struct request_queue *q,
+		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid);
+void nvme_cleanup_cmd(struct request *req);
+blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
+		struct nvme_command *cmd);
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buf, unsigned bufflen);
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		union nvme_result *result, void *buffer, unsigned bufflen,
+		unsigned timeout, int qid, int at_head,
+		blk_mq_req_flags_t flags, bool poll);
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result);
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+		      unsigned int dword11, void *buffer, size_t buflen,
+		      u32 *result);
+int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
+void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
+int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
+
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+		void *log, size_t size, u64 offset);
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+		struct nvme_ns_head **head, int *srcu_idx);
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx);
+
+extern const struct attribute_group *nvme_ns_id_attr_groups[];
+extern const struct block_device_operations nvme_ns_head_ops;
+
+#ifdef CONFIG_NVME_MULTIPATH
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+	return ctrl->ana_log_buf != NULL;
+}
+
+void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
+void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
+void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
+void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
+			struct nvme_ctrl *ctrl, int *flags);
+void nvme_failover_req(struct request *req);
+void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
+int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
+void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
+void nvme_mpath_update(struct nvme_ctrl *ctrl);
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
+void nvme_mpath_stop(struct nvme_ctrl *ctrl);
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
+struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+blk_qc_t nvme_ns_head_submit_bio(struct bio *bio);
+
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+	struct nvme_ns_head *head = ns->head;
+
+	if (head->disk && list_empty(&head->list))
+		kblockd_schedule_work(&head->requeue_work);
+}
+
+static inline void nvme_trace_bio_complete(struct request *req,
+        blk_status_t status)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+
+	if (req->cmd_flags & REQ_NVME_MPATH)
+		trace_block_bio_complete(ns->head->disk->queue, req->bio);
+}
+
+extern struct device_attribute dev_attr_ana_grpid;
+extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
+
+#else
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+	return false;
+}
+/*
+ * Without the multipath code enabled, multiple controller per subsystems are
+ * visible as devices and thus we cannot use the subsystem instance.
+ */
+static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
+				      struct nvme_ctrl *ctrl, int *flags)
+{
+	sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
+}
+
+static inline void nvme_failover_req(struct request *req)
+{
+}
+static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
+{
+}
+static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
+		struct nvme_ns_head *head)
+{
+	return 0;
+}
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
+		struct nvme_id_ns *id)
+{
+}
+static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+{
+}
+static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+	return false;
+}
+static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+}
+static inline void nvme_trace_bio_complete(struct request *req,
+        blk_status_t status)
+{
+}
+static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
+{
+}
+static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
+		struct nvme_id_ctrl *id)
+{
+	if (ctrl->subsys->cmic & (1 << 3))
+		dev_warn(ctrl->device,
+"Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n");
+	return 0;
+}
+static inline void nvme_mpath_update(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
+{
+}
+#endif /* CONFIG_NVME_MULTIPATH */
+
+int nvme_revalidate_zones(struct nvme_ns *ns);
+#ifdef CONFIG_BLK_DEV_ZONED
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
+int nvme_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data);
+
+blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
+				       struct nvme_command *cmnd,
+				       enum nvme_zone_mgmt_action action);
+#else
+#define nvme_report_zones NULL
+
+static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
+		struct request *req, struct nvme_command *cmnd,
+		enum nvme_zone_mgmt_action action)
+{
+	return BLK_STS_NOTSUPP;
+}
+
+static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
+{
+	dev_warn(ns->ctrl->device,
+		 "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
+	return -EPROTONOSUPPORT;
+}
+#endif
+
+#ifdef CONFIG_NVM
+int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
+void nvme_nvm_unregister(struct nvme_ns *ns);
+extern const struct attribute_group nvme_nvm_attr_group;
+int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
+#else
+static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
+				    int node)
+{
+	return 0;
+}
+
+static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
+static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
+							unsigned long arg)
+{
+	return -ENOTTY;
+}
+#endif /* CONFIG_NVM */
+
+static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
+{
+	return dev_to_disk(dev)->private_data;
+}
+
+#ifdef CONFIG_NVME_HWMON
+int nvme_hwmon_init(struct nvme_ctrl *ctrl);
+#else
+static inline int nvme_hwmon_init(struct nvme_ctrl *ctrl)
+{
+	return 0;
+}
+#endif
+
+u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			 u8 opcode);
+void nvme_execute_passthru_rq(struct request *rq);
+struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
+struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
+void nvme_put_ns(struct nvme_ns *ns);
+
+#endif /* _NVME_H */
--- a/feed/kmod-nvme/src/pci.c
+++ b/feed/kmod-nvme/src/pci.c
@ -0,0 +1,3310 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ */
+
+#include <linux/acpi.h>
+#include <linux/aer.h>
+#include <linux/async.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-pci.h>
+#include <linux/dmi.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/once.h>
+#include <linux/pci.h>
+#include <linux/suspend.h>
+#include <linux/t10-pi.h>
+#include <linux/types.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/sed-opal.h>
+#include <linux/pci-p2pdma.h>
+
+#include "trace.h"
+#include "nvme.h"
+
+#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
+#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
+
+#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+
+/*
+ * These can be higher, but we need to ensure that any command doesn't
+ * require an sg allocation that needs more than a page of data.
+ */
+#define NVME_MAX_KB_SZ	4096
+#define NVME_MAX_SEGS	127
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static bool use_cmb_sqes = true;
+module_param(use_cmb_sqes, bool, 0444);
+MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
+
+static unsigned int max_host_mem_size_mb = 128;
+module_param(max_host_mem_size_mb, uint, 0444);
+MODULE_PARM_DESC(max_host_mem_size_mb,
+	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
+
+static unsigned int sgl_threshold = SZ_32K;
+module_param(sgl_threshold, uint, 0644);
+MODULE_PARM_DESC(sgl_threshold,
+		"Use SGLs when average request segment size is larger or equal to "
+		"this size. Use 0 to disable SGLs.");
+
+static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops io_queue_depth_ops = {
+	.set = io_queue_depth_set,
+	.get = param_get_uint,
+};
+
+static unsigned int io_queue_depth = 1024;
+module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
+MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+
+static int io_queue_count_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned int n;
+	int ret;
+
+	ret = kstrtouint(val, 10, &n);
+	if (ret != 0 || n > num_possible_cpus())
+		return -EINVAL;
+	return param_set_uint(val, kp);
+}
+
+static const struct kernel_param_ops io_queue_count_ops = {
+	.set = io_queue_count_set,
+	.get = param_get_uint,
+};
+
+static unsigned int write_queues;
+module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
+MODULE_PARM_DESC(write_queues,
+	"Number of queues to use for writes. If not set, reads and writes "
+	"will share a queue set.");
+
+static unsigned int poll_queues;
+module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
+MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
+
+static bool noacpi;
+module_param(noacpi, bool, 0444);
+MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
+
+struct nvme_dev;
+struct nvme_queue;
+
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+
+/*
+ * Represents an NVM Express device.  Each nvme_dev is a PCI function.
+ */
+struct nvme_dev {
+	struct nvme_queue *queues;
+	struct blk_mq_tag_set tagset;
+	struct blk_mq_tag_set admin_tagset;
+	u32 __iomem *dbs;
+	struct device *dev;
+	struct dma_pool *prp_page_pool;
+	struct dma_pool *prp_small_pool;
+	unsigned online_queues;
+	unsigned max_qid;
+	unsigned io_queues[HCTX_MAX_TYPES];
+	unsigned int num_vecs;
+	u32 q_depth;
+	int io_sqes;
+	u32 db_stride;
+	void __iomem *bar;
+	unsigned long bar_mapped_size;
+	struct work_struct remove_work;
+	struct mutex shutdown_lock;
+	bool subsystem;
+	u64 cmb_size;
+	bool cmb_use_sqes;
+	u32 cmbsz;
+	u32 cmbloc;
+	struct nvme_ctrl ctrl;
+	u32 last_ps;
+
+	mempool_t *iod_mempool;
+
+	/* shadow doorbell buffer support: */
+	u32 *dbbuf_dbs;
+	dma_addr_t dbbuf_dbs_dma_addr;
+	u32 *dbbuf_eis;
+	dma_addr_t dbbuf_eis_dma_addr;
+
+	/* host memory buffer support: */
+	u64 host_mem_size;
+	u32 nr_host_mem_descs;
+	dma_addr_t host_mem_descs_dma;
+	struct nvme_host_mem_buf_desc *host_mem_descs;
+	void **host_mem_desc_bufs;
+	unsigned int nr_allocated_queues;
+	unsigned int nr_write_queues;
+	unsigned int nr_poll_queues;
+};
+
+static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	u32 n;
+
+	ret = kstrtou32(val, 10, &n);
+	if (ret != 0 || n < 2)
+		return -EINVAL;
+
+	return param_set_uint(val, kp);
+}
+
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+	return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+	return (qid * 2 + 1) * stride;
+}
+
+static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_dev, ctrl);
+}
+
+/*
+ * An NVM Express queue.  Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+	struct nvme_dev *dev;
+	spinlock_t sq_lock;
+	void *sq_cmds;
+	 /* only used for poll queues: */
+	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
+	struct nvme_completion *cqes;
+	dma_addr_t sq_dma_addr;
+	dma_addr_t cq_dma_addr;
+	u32 __iomem *q_db;
+	u32 q_depth;
+	u16 cq_vector;
+	u16 sq_tail;
+	u16 last_sq_tail;
+	u16 cq_head;
+	u16 qid;
+	u8 cq_phase;
+	u8 sqes;
+	unsigned long flags;
+#define NVMEQ_ENABLED		0
+#define NVMEQ_SQ_CMB		1
+#define NVMEQ_DELETE_ERROR	2
+#define NVMEQ_POLLED		3
+	u32 *dbbuf_sq_db;
+	u32 *dbbuf_cq_db;
+	u32 *dbbuf_sq_ei;
+	u32 *dbbuf_cq_ei;
+	struct completion delete_done;
+};
+
+/*
+ * The nvme_iod describes the data in an I/O.
+ *
+ * The sg pointer contains the list of PRP/SGL chunk allocations in addition
+ * to the actual struct scatterlist.
+ */
+struct nvme_iod {
+	struct nvme_request req;
+	struct nvme_command cmd;
+	struct nvme_queue *nvmeq;
+	bool use_sgl;
+	int aborted;
+	int npages;		/* In the PRP list. 0 means small pool in use */
+	int nents;		/* Used in scatterlist */
+	dma_addr_t first_dma;
+	unsigned int dma_len;	/* length of single DMA segment mapping */
+	dma_addr_t meta_dma;
+	struct scatterlist *sg;
+};
+
+static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
+{
+	return dev->nr_allocated_queues * 8 * dev->db_stride;
+}
+
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+	unsigned int mem_size = nvme_dbbuf_size(dev);
+
+	if (dev->dbbuf_dbs)
+		return 0;
+
+	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_dbs_dma_addr,
+					    GFP_KERNEL);
+	if (!dev->dbbuf_dbs)
+		return -ENOMEM;
+	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_eis_dma_addr,
+					    GFP_KERNEL);
+	if (!dev->dbbuf_eis) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+		dev->dbbuf_dbs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+	unsigned int mem_size = nvme_dbbuf_size(dev);
+
+	if (dev->dbbuf_dbs) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+		dev->dbbuf_dbs = NULL;
+	}
+	if (dev->dbbuf_eis) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+		dev->dbbuf_eis = NULL;
+	}
+}
+
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+			    struct nvme_queue *nvmeq, int qid)
+{
+	if (!dev->dbbuf_dbs || !qid)
+		return;
+
+	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+}
+
+static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
+{
+	if (!nvmeq->qid)
+		return;
+
+	nvmeq->dbbuf_sq_db = NULL;
+	nvmeq->dbbuf_cq_db = NULL;
+	nvmeq->dbbuf_sq_ei = NULL;
+	nvmeq->dbbuf_cq_ei = NULL;
+}
+
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+	struct nvme_command c;
+	unsigned int i;
+
+	if (!dev->dbbuf_dbs)
+		return;
+
+	memset(&c, 0, sizeof(c));
+	c.dbbuf.opcode = nvme_admin_dbbuf;
+	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
+		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
+		/* Free memory and continue on */
+		nvme_dbbuf_dma_free(dev);
+
+		for (i = 1; i <= dev->online_queues; i++)
+			nvme_dbbuf_free(&dev->queues[i]);
+	}
+}
+
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+/* Update dbbuf and return true if an MMIO is required */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+					      volatile u32 *dbbuf_ei)
+{
+	if (dbbuf_db) {
+		u16 old_value;
+
+		/*
+		 * Ensure that the queue is written before updating
+		 * the doorbell in memory
+		 */
+		wmb();
+
+		old_value = *dbbuf_db;
+		*dbbuf_db = value;
+
+		/*
+		 * Ensure that the doorbell is updated before reading the event
+		 * index from memory.  The controller needs to provide similar
+		 * ordering to ensure the envent index is updated before reading
+		 * the doorbell.
+		 */
+		mb();
+
+		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Will slightly overestimate the number of pages needed.  This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_pci_npages_prp(void)
+{
+	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
+				      NVME_CTRL_PAGE_SIZE);
+	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+/*
+ * Calculates the number of pages needed for the SGL segments. For example a 4k
+ * page can accommodate 256 SGL descriptors.
+ */
+static int nvme_pci_npages_sgl(void)
+{
+	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
+			PAGE_SIZE);
+}
+
+static size_t nvme_pci_iod_alloc_size(void)
+{
+	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
+
+	return sizeof(__le64 *) * npages +
+		sizeof(struct scatterlist) * NVME_MAX_SEGS;
+}
+
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+				unsigned int hctx_idx)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = &dev->queues[0];
+
+	WARN_ON(hctx_idx != 0);
+	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+
+	hctx->driver_data = nvmeq;
+	return 0;
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			  unsigned int hctx_idx)
+{
+	struct nvme_dev *dev = data;
+	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+
+	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+	hctx->driver_data = nvmeq;
+	return 0;
+}
+
+static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
+		unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct nvme_dev *dev = set->driver_data;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
+	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
+
+	BUG_ON(!nvmeq);
+	iod->nvmeq = nvmeq;
+
+	nvme_req(req)->ctrl = &dev->ctrl;
+	return 0;
+}
+
+static int queue_irq_offset(struct nvme_dev *dev)
+{
+	/* if we have more than 1 vec, admin queue offsets us by 1 */
+	if (dev->num_vecs > 1)
+		return 1;
+
+	return 0;
+}
+
+static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nvme_dev *dev = set->driver_data;
+	int i, qoff, offset;
+
+	offset = queue_irq_offset(dev);
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = dev->io_queues[i];
+		if (!map->nr_queues) {
+			BUG_ON(i == HCTX_TYPE_DEFAULT);
+			continue;
+		}
+
+		/*
+		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
+		 * affinity), so use the regular blk-mq cpu mapping
+		 */
+		map->queue_offset = qoff;
+		if (i != HCTX_TYPE_POLL && offset)
+			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+		else
+			blk_mq_map_queues(map);
+		qoff += map->nr_queues;
+		offset += map->nr_queues;
+	}
+
+	return 0;
+}
+
+/*
+ * Write sq tail if we are asked to, or if the next command would wrap.
+ */
+static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
+{
+	if (!write_sq) {
+		u16 next_tail = nvmeq->sq_tail + 1;
+
+		if (next_tail == nvmeq->q_depth)
+			next_tail = 0;
+		if (next_tail != nvmeq->last_sq_tail)
+			return;
+	}
+
+	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+		writel(nvmeq->sq_tail, nvmeq->q_db);
+	nvmeq->last_sq_tail = nvmeq->sq_tail;
+}
+
+/**
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ * @write_sq: whether to write to the SQ doorbell
+ */
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+			    bool write_sq)
+{
+	spin_lock(&nvmeq->sq_lock);
+	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+	       cmd, sizeof(*cmd));
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	nvme_write_sq_db(nvmeq, write_sq);
+	spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+
+	spin_lock(&nvmeq->sq_lock);
+	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
+		nvme_write_sq_db(nvmeq, true);
+	spin_unlock(&nvmeq->sq_lock);
+}
+
+static void **nvme_pci_iod_list(struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
+}
+
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	int nseg = blk_rq_nr_phys_segments(req);
+	unsigned int avg_seg_size;
+
+	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+
+	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+		return false;
+	if (!iod->nvmeq->qid)
+		return false;
+	if (!sgl_threshold || avg_seg_size < sgl_threshold)
+		return false;
+	return true;
+}
+
+static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+{
+	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	dma_addr_t dma_addr = iod->first_dma;
+	int i;
+
+	for (i = 0; i < iod->npages; i++) {
+		__le64 *prp_list = nvme_pci_iod_list(req)[i];
+		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
+
+		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+		dma_addr = next_dma_addr;
+	}
+
+}
+
+static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
+{
+	const int last_sg = SGES_PER_PAGE - 1;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	dma_addr_t dma_addr = iod->first_dma;
+	int i;
+
+	for (i = 0; i < iod->npages; i++) {
+		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
+		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
+
+		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
+		dma_addr = next_dma_addr;
+	}
+
+}
+
+static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	if (is_pci_p2pdma_page(sg_page(iod->sg)))
+		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
+				    rq_dma_dir(req));
+	else
+		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
+}
+
+static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	if (iod->dma_len) {
+		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
+			       rq_dma_dir(req));
+		return;
+	}
+
+	WARN_ON_ONCE(!iod->nents);
+
+	nvme_unmap_sg(dev, req);
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+			      iod->first_dma);
+	else if (iod->use_sgl)
+		nvme_free_sgls(dev, req);
+	else
+		nvme_free_prps(dev, req);
+	mempool_free(iod->sg, dev->iod_mempool);
+}
+
+static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sgl, sg, nents, i) {
+		dma_addr_t phys = sg_phys(sg);
+		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
+			"dma_address:%pad dma_length:%d\n",
+			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
+			sg_dma_len(sg));
+	}
+}
+
+static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmnd)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct dma_pool *pool;
+	int length = blk_rq_payload_bytes(req);
+	struct scatterlist *sg = iod->sg;
+	int dma_len = sg_dma_len(sg);
+	u64 dma_addr = sg_dma_address(sg);
+	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+	__le64 *prp_list;
+	void **list = nvme_pci_iod_list(req);
+	dma_addr_t prp_dma;
+	int nprps, i;
+
+	length -= (NVME_CTRL_PAGE_SIZE - offset);
+	if (length <= 0) {
+		iod->first_dma = 0;
+		goto done;
+	}
+
+	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
+	if (dma_len) {
+		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
+	} else {
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		iod->first_dma = dma_addr;
+		goto done;
+	}
+
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+	if (!prp_list) {
+		iod->first_dma = dma_addr;
+		iod->npages = -1;
+		return BLK_STS_RESOURCE;
+	}
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+	i = 0;
+	for (;;) {
+		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+			__le64 *old_prp_list = prp_list;
+			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+			if (!prp_list)
+				goto free_prps;
+			list[iod->npages++] = prp_list;
+			prp_list[0] = old_prp_list[i - 1];
+			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+			i = 1;
+		}
+		prp_list[i++] = cpu_to_le64(dma_addr);
+		dma_len -= NVME_CTRL_PAGE_SIZE;
+		dma_addr += NVME_CTRL_PAGE_SIZE;
+		length -= NVME_CTRL_PAGE_SIZE;
+		if (length <= 0)
+			break;
+		if (dma_len > 0)
+			continue;
+		if (unlikely(dma_len < 0))
+			goto bad_sgl;
+		sg = sg_next(sg);
+		dma_addr = sg_dma_address(sg);
+		dma_len = sg_dma_len(sg);
+	}
+done:
+	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+	return BLK_STS_OK;
+free_prps:
+	nvme_free_prps(dev, req);
+	return BLK_STS_RESOURCE;
+bad_sgl:
+	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
+			"Invalid SGL for payload:%d nents:%d\n",
+			blk_rq_payload_bytes(req), iod->nents);
+	return BLK_STS_IOERR;
+}
+
+static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
+		struct scatterlist *sg)
+{
+	sge->addr = cpu_to_le64(sg_dma_address(sg));
+	sge->length = cpu_to_le32(sg_dma_len(sg));
+	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
+}
+
+static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
+		dma_addr_t dma_addr, int entries)
+{
+	sge->addr = cpu_to_le64(dma_addr);
+	if (entries < SGES_PER_PAGE) {
+		sge->length = cpu_to_le32(entries * sizeof(*sge));
+		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
+	} else {
+		sge->length = cpu_to_le32(PAGE_SIZE);
+		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
+	}
+}
+
+static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmd, int entries)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct dma_pool *pool;
+	struct nvme_sgl_desc *sg_list;
+	struct scatterlist *sg = iod->sg;
+	dma_addr_t sgl_dma;
+	int i = 0;
+
+	/* setting the transfer type as SGL */
+	cmd->flags = NVME_CMD_SGL_METABUF;
+
+	if (entries == 1) {
+		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+		return BLK_STS_OK;
+	}
+
+	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+	if (!sg_list) {
+		iod->npages = -1;
+		return BLK_STS_RESOURCE;
+	}
+
+	nvme_pci_iod_list(req)[0] = sg_list;
+	iod->first_dma = sgl_dma;
+
+	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
+
+	do {
+		if (i == SGES_PER_PAGE) {
+			struct nvme_sgl_desc *old_sg_desc = sg_list;
+			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];
+
+			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
+			if (!sg_list)
+				goto free_sgls;
+
+			i = 0;
+			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
+			sg_list[i++] = *link;
+			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
+		}
+
+		nvme_pci_sgl_set_data(&sg_list[i++], sg);
+		sg = sg_next(sg);
+	} while (--entries > 0);
+
+	return BLK_STS_OK;
+free_sgls:
+	nvme_free_sgls(dev, req);
+	return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmnd,
+		struct bio_vec *bv)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
+
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	if (dma_mapping_error(dev->dev, iod->first_dma))
+		return BLK_STS_RESOURCE;
+	iod->dma_len = bv->bv_len;
+
+	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
+	if (bv->bv_len > first_prp_len)
+		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
+	return BLK_STS_OK;
+}
+
+static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
+		struct request *req, struct nvme_rw_command *cmnd,
+		struct bio_vec *bv)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+	if (dma_mapping_error(dev->dev, iod->first_dma))
+		return BLK_STS_RESOURCE;
+	iod->dma_len = bv->bv_len;
+
+	cmnd->flags = NVME_CMD_SGL_METABUF;
+	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
+	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
+	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+	return BLK_STS_OK;
+}
+
+static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
+		struct nvme_command *cmnd)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	blk_status_t ret = BLK_STS_RESOURCE;
+	int nr_mapped;
+
+	if (blk_rq_nr_phys_segments(req) == 1) {
+		struct bio_vec bv = req_bvec(req);
+
+		if (!is_pci_p2pdma_page(bv.bv_page)) {
+			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+				return nvme_setup_prp_simple(dev, req,
+							     &cmnd->rw, &bv);
+
+			if (iod->nvmeq->qid && sgl_threshold &&
+			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
+				return nvme_setup_sgl_simple(dev, req,
+							     &cmnd->rw, &bv);
+		}
+	}
+
+	iod->dma_len = 0;
+	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+	if (!iod->sg)
+		return BLK_STS_RESOURCE;
+	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
+	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+	if (!iod->nents)
+		goto out_free_sg;
+
+	if (is_pci_p2pdma_page(sg_page(iod->sg)))
+		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
+				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
+	else
+		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
+					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
+	if (!nr_mapped)
+		goto out_free_sg;
+
+	iod->use_sgl = nvme_pci_use_sgls(dev, req);
+	if (iod->use_sgl)
+		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
+	else
+		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+	if (ret != BLK_STS_OK)
+		goto out_unmap_sg;
+	return BLK_STS_OK;
+
+out_unmap_sg:
+	nvme_unmap_sg(dev, req);
+out_free_sg:
+	mempool_free(iod->sg, dev->iod_mempool);
+	return ret;
+}
+
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
+		struct nvme_command *cmnd)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
+			rq_dma_dir(req), 0);
+	if (dma_mapping_error(dev->dev, iod->meta_dma))
+		return BLK_STS_IOERR;
+	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+	return BLK_STS_OK;
+}
+
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
+static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *req = bd->rq;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_command *cmnd = &iod->cmd;
+	blk_status_t ret;
+
+	iod->aborted = 0;
+	iod->npages = -1;
+	iod->nents = 0;
+
+	/*
+	 * We should not need to do this, but we're still using this to
+	 * ensure we can drain requests on a dying queue.
+	 */
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
+		return BLK_STS_IOERR;
+
+	ret = nvme_setup_cmd(ns, req, cmnd);
+	if (ret)
+		return ret;
+
+	if (blk_rq_nr_phys_segments(req)) {
+		ret = nvme_map_data(dev, req, cmnd);
+		if (ret)
+			goto out_free_cmd;
+	}
+
+	if (blk_integrity_rq(req)) {
+		ret = nvme_map_metadata(dev, req, cmnd);
+		if (ret)
+			goto out_unmap_data;
+	}
+
+	blk_mq_start_request(req);
+	nvme_submit_cmd(nvmeq, cmnd, bd->last);
+	return BLK_STS_OK;
+out_unmap_data:
+	nvme_unmap_data(dev, req);
+out_free_cmd:
+	nvme_cleanup_cmd(req);
+	return ret;
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_dev *dev = iod->nvmeq->dev;
+
+	if (blk_integrity_rq(req))
+		dma_unmap_page(dev->dev, iod->meta_dma,
+			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+	if (blk_rq_nr_phys_segments(req))
+		nvme_unmap_data(dev, req);
+	nvme_complete_rq(req);
+}
+
+/* We read the CQE phase first to check if the rest of the entry is valid */
+static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
+{
+	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];
+
+	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
+}
+
+static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
+{
+	u16 head = nvmeq->cq_head;
+
+	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+					      nvmeq->dbbuf_cq_ei))
+		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+}
+
+static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
+{
+	if (!nvmeq->qid)
+		return nvmeq->dev->admin_tagset.tags[0];
+	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
+}
+
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+{
+	struct nvme_completion *cqe = &nvmeq->cqes[idx];
+	__u16 command_id = READ_ONCE(cqe->command_id);
+	struct request *req;
+
+	/*
+	 * AEN requests are special as they don't time out and can
+	 * survive any kind of queue freeze and often don't respond to
+	 * aborts.  We don't even bother to allocate a struct request
+	 * for them but rather special case them here.
+	 */
+	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
+		nvme_complete_async_event(&nvmeq->dev->ctrl,
+				cqe->status, &cqe->result);
+		return;
+	}
+
+	req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
+	if (unlikely(!req)) {
+		dev_warn(nvmeq->dev->ctrl.device,
+			"invalid id %d completed on queue %d\n",
+			command_id, le16_to_cpu(cqe->sq_id));
+		return;
+	}
+
+	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
+		nvme_pci_complete_rq(req);
+}
+
+static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
+{
+	u32 tmp = nvmeq->cq_head + 1;
+
+	if (tmp == nvmeq->q_depth) {
+		nvmeq->cq_head = 0;
+		nvmeq->cq_phase ^= 1;
+	} else {
+		nvmeq->cq_head = tmp;
+	}
+}
+
+static inline int nvme_process_cq(struct nvme_queue *nvmeq)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+	irqreturn_t ret = IRQ_NONE;
+
+	/*
+	 * The rmb/wmb pair ensures we see all updates from a previous run of
+	 * the irq handler, even if that was on another CPU.
+	 */
+	rmb();
+	if (nvme_process_cq(nvmeq))
+		ret = IRQ_HANDLED;
+	wmb();
+
+	return ret;
+}
+
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+	struct nvme_queue *nvmeq = data;
+
+	if (nvme_cqe_pending(nvmeq))
+		return IRQ_WAKE_THREAD;
+	return IRQ_NONE;
+}
+
+/*
+ * Poll for completions for any interrupt driven queue
+ * Can be called from any context.
+ */
+static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
+{
+	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+
+	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
+
+	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+	nvme_process_cq(nvmeq);
+	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+}
+
+static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	bool found;
+
+	if (!nvme_cqe_pending(nvmeq))
+		return 0;
+
+	spin_lock(&nvmeq->cq_poll_lock);
+	found = nvme_process_cq(nvmeq);
+	spin_unlock(&nvmeq->cq_poll_lock);
+
+	return found;
+}
+
+static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	struct nvme_queue *nvmeq = &dev->queues[0];
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = nvme_admin_async_event;
+	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
+	nvme_submit_cmd(nvmeq, &c, true);
+}
+
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.delete_queue.opcode = opcode;
+	c.delete_queue.qid = cpu_to_le16(id);
+
+	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+}
+
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+		struct nvme_queue *nvmeq, s16 vector)
+{
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG;
+
+	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
+		flags |= NVME_CQ_IRQ_ENABLED;
+
+	/*
+	 * Note: we (ab)use the fact that the prp fields survive if no data
+	 * is attached to the request.
+	 */
+	memset(&c, 0, sizeof(c));
+	c.create_cq.opcode = nvme_admin_create_cq;
+	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+	c.create_cq.cqid = cpu_to_le16(qid);
+	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_cq.cq_flags = cpu_to_le16(flags);
+	c.create_cq.irq_vector = cpu_to_le16(vector);
+
+	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+}
+
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+						struct nvme_queue *nvmeq)
+{
+	struct nvme_ctrl *ctrl = &dev->ctrl;
+	struct nvme_command c;
+	int flags = NVME_QUEUE_PHYS_CONTIG;
+
+	/*
+	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+	 * set. Since URGENT priority is zeroes, it makes all queues
+	 * URGENT.
+	 */
+	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+		flags |= NVME_SQ_PRIO_MEDIUM;
+
+	/*
+	 * Note: we (ab)use the fact that the prp fields survive if no data
+	 * is attached to the request.
+	 */
+	memset(&c, 0, sizeof(c));
+	c.create_sq.opcode = nvme_admin_create_sq;
+	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+	c.create_sq.sqid = cpu_to_le16(qid);
+	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+	c.create_sq.sq_flags = cpu_to_le16(flags);
+	c.create_sq.cqid = cpu_to_le16(qid);
+
+	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+}
+
+static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
+}
+
+static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
+{
+	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
+}
+
+static void abort_endio(struct request *req, blk_status_t error)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = iod->nvmeq;
+
+	dev_warn(nvmeq->dev->ctrl.device,
+		 "Abort status: 0x%x", nvme_req(req)->status);
+	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
+	blk_mq_free_request(req);
+}
+
+static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* If true, indicates loss of adapter communication, possibly by a
+	 * NVMe Subsystem reset.
+	 */
+	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
+
+	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
+	switch (dev->ctrl.state) {
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	default:
+		break;
+	}
+
+	/* We shouldn't reset unless the controller is on fatal error state
+	 * _or_ if we lost the communication with it.
+	 */
+	if (!(csts & NVME_CSTS_CFS) && !nssro)
+		return false;
+
+	return true;
+}
+
+static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
+{
+	/* Read a config register to help see what died. */
+	u16 pci_status;
+	int result;
+
+	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
+				      &pci_status);
+	if (result == PCIBIOS_SUCCESSFUL)
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
+			 csts, pci_status);
+	else
+		dev_warn(dev->ctrl.device,
+			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
+			 csts, result);
+}
+
+static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct nvme_queue *nvmeq = iod->nvmeq;
+	struct nvme_dev *dev = nvmeq->dev;
+	struct request *abort_req;
+	struct nvme_command cmd;
+	u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+	/* If PCI error recovery process is happening, we cannot reset or
+	 * the recovery mechanism will surely fail.
+	 */
+	mb();
+	if (pci_channel_offline(to_pci_dev(dev->dev)))
+		return BLK_EH_RESET_TIMER;
+
+	/*
+	 * Reset immediately if the controller is failed
+	 */
+	if (nvme_should_reset(dev, csts)) {
+		nvme_warn_reset(dev, csts);
+		nvme_dev_disable(dev, false);
+		nvme_reset_ctrl(&dev->ctrl);
+		return BLK_EH_DONE;
+	}
+
+	/*
+	 * Did we miss an interrupt?
+	 */
+	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
+		nvme_poll(req->mq_hctx);
+	else
+		nvme_poll_irqdisable(nvmeq);
+
+	if (blk_mq_request_completed(req)) {
+		dev_warn(dev->ctrl.device,
+			 "I/O %d QID %d timeout, completion polled\n",
+			 req->tag, nvmeq->qid);
+		return BLK_EH_DONE;
+	}
+
+	/*
+	 * Shutdown immediately if controller times out while starting. The
+	 * reset work will see the pci device disabled when it gets the forced
+	 * cancellation error. All outstanding requests are completed on
+	 * shutdown, so we return BLK_EH_DONE.
+	 */
+	switch (dev->ctrl.state) {
+	case NVME_CTRL_CONNECTING:
+		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+		fallthrough;
+	case NVME_CTRL_DELETING:
+		dev_warn_ratelimited(dev->ctrl.device,
+			 "I/O %d QID %d timeout, disable controller\n",
+			 req->tag, nvmeq->qid);
+		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_dev_disable(dev, true);
+		return BLK_EH_DONE;
+	case NVME_CTRL_RESETTING:
+		return BLK_EH_RESET_TIMER;
+	default:
+		break;
+	}
+
+	/*
+	 * Shutdown the controller immediately and schedule a reset if the
+	 * command was already aborted once before and still hasn't been
+	 * returned to the driver, or if this is the admin queue.
+	 */
+	if (!nvmeq->qid || iod->aborted) {
+		dev_warn(dev->ctrl.device,
+			 "I/O %d QID %d timeout, reset controller\n",
+			 req->tag, nvmeq->qid);
+		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_dev_disable(dev, false);
+		nvme_reset_ctrl(&dev->ctrl);
+
+		return BLK_EH_DONE;
+	}
+
+	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
+		atomic_inc(&dev->ctrl.abort_limit);
+		return BLK_EH_RESET_TIMER;
+	}
+	iod->aborted = 1;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.abort.opcode = nvme_admin_abort_cmd;
+	cmd.abort.cid = nvme_cid(req);
+	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+
+	dev_warn(nvmeq->dev->ctrl.device,
+		"I/O %d QID %d timeout, aborting\n",
+		 req->tag, nvmeq->qid);
+
+	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
+			BLK_MQ_REQ_NOWAIT);
+	if (IS_ERR(abort_req)) {
+		atomic_inc(&dev->ctrl.abort_limit);
+		return BLK_EH_RESET_TIMER;
+	}
+
+	abort_req->end_io_data = NULL;
+	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
+
+	/*
+	 * The aborted req will be completed on receiving the abort req.
+	 * We enable the timer again. If hit twice, it'll cause a device reset,
+	 * as the device then is in a faulty state.
+	 */
+	return BLK_EH_RESET_TIMER;
+}
+
+static void nvme_free_queue(struct nvme_queue *nvmeq)
+{
+	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
+				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	if (!nvmeq->sq_cmds)
+		return;
+
+	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
+		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
+				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
+	} else {
+		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
+				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+	}
+}
+
+static void nvme_free_queues(struct nvme_dev *dev, int lowest)
+{
+	int i;
+
+	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
+		dev->ctrl.queue_count--;
+		nvme_free_queue(&dev->queues[i]);
+	}
+}
+
+/**
+ * nvme_suspend_queue - put queue into suspended state
+ * @nvmeq: queue to suspend
+ */
+static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+{
+	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
+		return 1;
+
+	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
+	mb();
+
+	nvmeq->dev->online_queues--;
+	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
+		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
+	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
+		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
+	return 0;
+}
+
+static void nvme_suspend_io_queues(struct nvme_dev *dev)
+{
+	int i;
+
+	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+		nvme_suspend_queue(&dev->queues[i]);
+}
+
+static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
+{
+	struct nvme_queue *nvmeq = &dev->queues[0];
+
+	if (shutdown)
+		nvme_shutdown_ctrl(&dev->ctrl);
+	else
+		nvme_disable_ctrl(&dev->ctrl);
+
+	nvme_poll_irqdisable(nvmeq);
+}
+
+/*
+ * Called only on a device that has been disabled and after all other threads
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
+ */
+static void nvme_reap_pending_cqes(struct nvme_dev *dev)
+{
+	int i;
+
+	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+		spin_lock(&dev->queues[i].cq_poll_lock);
+		nvme_process_cq(&dev->queues[i]);
+		spin_unlock(&dev->queues[i].cq_poll_lock);
+	}
+}
+
+static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
+				int entry_size)
+{
+	int q_depth = dev->q_depth;
+	unsigned q_size_aligned = roundup(q_depth * entry_size,
+					  NVME_CTRL_PAGE_SIZE);
+
+	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
+		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
+
+		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
+		q_depth = div_u64(mem_per_q, entry_size);
+
+		/*
+		 * Ensure the reduced q_depth is above some threshold where it
+		 * would be better to map queues in system memory with the
+		 * original depth
+		 */
+		if (q_depth < 64)
+			return -ENOMEM;
+	}
+
+	return q_depth;
+}
+
+static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+				int qid)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
+		if (nvmeq->sq_cmds) {
+			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
+							nvmeq->sq_cmds);
+			if (nvmeq->sq_dma_addr) {
+				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
+				return 0;
+			}
+
+			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
+		}
+	}
+
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
+				&nvmeq->sq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->sq_cmds)
+		return -ENOMEM;
+	return 0;
+}
+
+static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
+{
+	struct nvme_queue *nvmeq = &dev->queues[qid];
+
+	if (dev->ctrl.queue_count > qid)
+		return 0;
+
+	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
+	nvmeq->q_depth = depth;
+	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
+					 &nvmeq->cq_dma_addr, GFP_KERNEL);
+	if (!nvmeq->cqes)
+		goto free_nvmeq;
+
+	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
+		goto free_cqdma;
+
+	nvmeq->dev = dev;
+	spin_lock_init(&nvmeq->sq_lock);
+	spin_lock_init(&nvmeq->cq_poll_lock);
+	nvmeq->cq_head = 0;
+	nvmeq->cq_phase = 1;
+	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+	nvmeq->qid = qid;
+	dev->ctrl.queue_count++;
+
+	return 0;
+
+ free_cqdma:
+	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
+			  nvmeq->cq_dma_addr);
+ free_nvmeq:
+	return -ENOMEM;
+}
+
+static int queue_request_irq(struct nvme_queue *nvmeq)
+{
+	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+	int nr = nvmeq->dev->ctrl.instance;
+
+	if (use_threaded_interrupts) {
+		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
+				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+	} else {
+		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
+				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+	}
+}
+
+static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+
+	nvmeq->sq_tail = 0;
+	nvmeq->last_sq_tail = 0;
+	nvmeq->cq_head = 0;
+	nvmeq->cq_phase = 1;
+	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
+	nvme_dbbuf_init(dev, nvmeq, qid);
+	dev->online_queues++;
+	wmb(); /* ensure the first interrupt sees the initialization */
+}
+
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
+{
+	struct nvme_dev *dev = nvmeq->dev;
+	int result;
+	u16 vector = 0;
+
+	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
+
+	/*
+	 * A queue's vector matches the queue identifier unless the controller
+	 * has only one vector available.
+	 */
+	if (!polled)
+		vector = dev->num_vecs == 1 ? 0 : qid;
+	else
+		set_bit(NVMEQ_POLLED, &nvmeq->flags);
+
+	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
+	if (result)
+		return result;
+
+	result = adapter_alloc_sq(dev, qid, nvmeq);
+	if (result < 0)
+		return result;
+	if (result)
+		goto release_cq;
+
+	nvmeq->cq_vector = vector;
+	nvme_init_queue(nvmeq, qid);
+
+	if (!polled) {
+		result = queue_request_irq(nvmeq);
+		if (result < 0)
+			goto release_sq;
+	}
+
+	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
+	return result;
+
+release_sq:
+	dev->online_queues--;
+	adapter_delete_sq(dev, qid);
+release_cq:
+	adapter_delete_cq(dev, qid);
+	return result;
+}
+
+static const struct blk_mq_ops nvme_mq_admin_ops = {
+	.queue_rq	= nvme_queue_rq,
+	.complete	= nvme_pci_complete_rq,
+	.init_hctx	= nvme_admin_init_hctx,
+	.init_request	= nvme_init_request,
+	.timeout	= nvme_timeout,
+};
+
+static const struct blk_mq_ops nvme_mq_ops = {
+	.queue_rq	= nvme_queue_rq,
+	.complete	= nvme_pci_complete_rq,
+	.commit_rqs	= nvme_commit_rqs,
+	.init_hctx	= nvme_init_hctx,
+	.init_request	= nvme_init_request,
+	.map_queues	= nvme_pci_map_queues,
+	.timeout	= nvme_timeout,
+	.poll		= nvme_poll,
+};
+
+static void nvme_dev_remove_admin(struct nvme_dev *dev)
+{
+	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+		/*
+		 * If the controller was reset during removal, it's possible
+		 * user requests may be waiting on a stopped queue. Start the
+		 * queue to flush these to completion.
+		 */
+		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+		blk_cleanup_queue(dev->ctrl.admin_q);
+		blk_mq_free_tag_set(&dev->admin_tagset);
+	}
+}
+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+	if (!dev->ctrl.admin_q) {
+		dev->admin_tagset.ops = &nvme_mq_admin_ops;
+		dev->admin_tagset.nr_hw_queues = 1;
+
+		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
+		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
+		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
+		dev->admin_tagset.driver_data = dev;
+
+		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+			return -ENOMEM;
+		dev->ctrl.admin_tagset = &dev->admin_tagset;
+
+		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
+		if (IS_ERR(dev->ctrl.admin_q)) {
+			blk_mq_free_tag_set(&dev->admin_tagset);
+			dev->ctrl.admin_q = NULL;
+			return -ENOMEM;
+		}
+		if (!blk_get_queue(dev->ctrl.admin_q)) {
+			nvme_dev_remove_admin(dev);
+			dev->ctrl.admin_q = NULL;
+			return -ENODEV;
+		}
+	} else
+		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+
+	return 0;
+}
+
+static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (size <= dev->bar_mapped_size)
+		return 0;
+	if (size > pci_resource_len(pdev, 0))
+		return -ENOMEM;
+	if (dev->bar)
+		iounmap(dev->bar);
+	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+	if (!dev->bar) {
+		dev->bar_mapped_size = 0;
+		return -ENOMEM;
+	}
+	dev->bar_mapped_size = size;
+	dev->dbs = dev->bar + NVME_REG_DBS;
+
+	return 0;
+}
+
+static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
+{
+	int result;
+	u32 aqa;
+	struct nvme_queue *nvmeq;
+
+	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
+	if (result < 0)
+		return result;
+
+	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
+				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
+
+	if (dev->subsystem &&
+	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
+		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
+
+	result = nvme_disable_ctrl(&dev->ctrl);
+	if (result < 0)
+		return result;
+
+	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
+	if (result)
+		return result;
+
+	dev->ctrl.numa_node = dev_to_node(dev->dev);
+
+	nvmeq = &dev->queues[0];
+	aqa = nvmeq->q_depth - 1;
+	aqa |= aqa << 16;
+
+	writel(aqa, dev->bar + NVME_REG_AQA);
+	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
+	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
+
+	result = nvme_enable_ctrl(&dev->ctrl);
+	if (result)
+		return result;
+
+	nvmeq->cq_vector = 0;
+	nvme_init_queue(nvmeq, 0);
+	result = queue_request_irq(nvmeq);
+	if (result) {
+		dev->online_queues--;
+		return result;
+	}
+
+	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
+	return result;
+}
+
+static int nvme_create_io_queues(struct nvme_dev *dev)
+{
+	unsigned i, max, rw_queues;
+	int ret = 0;
+
+	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
+		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
+			ret = -ENOMEM;
+			break;
+		}
+	}
+
+	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
+		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
+				dev->io_queues[HCTX_TYPE_READ];
+	} else {
+		rw_queues = max;
+	}
+
+	for (i = dev->online_queues; i <= max; i++) {
+		bool polled = i > rw_queues;
+
+		ret = nvme_create_queue(&dev->queues[i], i, polled);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * Ignore failing Create SQ/CQ commands, we can continue with less
+	 * than the desired amount of queues, and even a controller without
+	 * I/O queues can still be used to issue admin commands.  This might
+	 * be useful to upgrade a buggy firmware for example.
+	 */
+	return ret >= 0 ? 0 : ret;
+}
+
+static ssize_t nvme_cmb_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
+
+	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
+		       ndev->cmbloc, ndev->cmbsz);
+}
+static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
+
+static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
+{
+	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
+
+	return 1ULL << (12 + 4 * szu);
+}
+
+static u32 nvme_cmb_size(struct nvme_dev *dev)
+{
+	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
+}
+
+static void nvme_map_cmb(struct nvme_dev *dev)
+{
+	u64 size, offset;
+	resource_size_t bar_size;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	int bar;
+
+	if (dev->cmb_size)
+		return;
+
+	if (NVME_CAP_CMBS(dev->ctrl.cap))
+		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);
+
+	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
+	if (!dev->cmbsz)
+		return;
+	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
+
+	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
+	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
+	bar = NVME_CMB_BIR(dev->cmbloc);
+	bar_size = pci_resource_len(pdev, bar);
+
+	if (offset > bar_size)
+		return;
+
+	/*
+	 * Tell the controller about the host side address mapping the CMB,
+	 * and enable CMB decoding for the NVMe 1.4+ scheme:
+	 */
+	if (NVME_CAP_CMBS(dev->ctrl.cap)) {
+		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
+			     (pci_bus_address(pdev, bar) + offset),
+			     dev->bar + NVME_REG_CMBMSC);
+	}
+
+	/*
+	 * Controllers may support a CMB size larger than their BAR,
+	 * for example, due to being behind a bridge. Reduce the CMB to
+	 * the reported size of the BAR
+	 */
+	if (size > bar_size - offset)
+		size = bar_size - offset;
+
+	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
+		dev_warn(dev->ctrl.device,
+			 "failed to register the CMB\n");
+		return;
+	}
+
+	dev->cmb_size = size;
+	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
+
+	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
+			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
+		pci_p2pmem_publish(pdev, true);
+
+	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
+				    &dev_attr_cmb.attr, NULL))
+		dev_warn(dev->ctrl.device,
+			 "failed to add sysfs attribute for CMB\n");
+}
+
+static inline void nvme_release_cmb(struct nvme_dev *dev)
+{
+	if (dev->cmb_size) {
+		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
+					     &dev_attr_cmb.attr, NULL);
+		dev->cmb_size = 0;
+	}
+}
+
+static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
+{
+	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
+	u64 dma_addr = dev->host_mem_descs_dma;
+	struct nvme_command c;
+	int ret;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode	= nvme_admin_set_features;
+	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
+	c.features.dword11	= cpu_to_le32(bits);
+	c.features.dword12	= cpu_to_le32(host_mem_size);
+	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
+	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
+	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);
+
+	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+	if (ret) {
+		dev_warn(dev->ctrl.device,
+			 "failed to set host mem (err %d, flags %#x).\n",
+			 ret, bits);
+	}
+	return ret;
+}
+
+static void nvme_free_host_mem(struct nvme_dev *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->nr_host_mem_descs; i++) {
+		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
+		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
+
+		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
+			       le64_to_cpu(desc->addr),
+			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
+	}
+
+	kfree(dev->host_mem_desc_bufs);
+	dev->host_mem_desc_bufs = NULL;
+	dma_free_coherent(dev->dev,
+			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
+			dev->host_mem_descs, dev->host_mem_descs_dma);
+	dev->host_mem_descs = NULL;
+	dev->nr_host_mem_descs = 0;
+}
+
+static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+		u32 chunk_size)
+{
+	struct nvme_host_mem_buf_desc *descs;
+	u32 max_entries, len;
+	dma_addr_t descs_dma;
+	int i = 0;
+	void **bufs;
+	u64 size, tmp;
+
+	tmp = (preferred + chunk_size - 1);
+	do_div(tmp, chunk_size);
+	max_entries = tmp;
+
+	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
+		max_entries = dev->ctrl.hmmaxd;
+
+	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
+				   &descs_dma, GFP_KERNEL);
+	if (!descs)
+		goto out;
+
+	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
+	if (!bufs)
+		goto out_free_descs;
+
+	for (size = 0; size < preferred && i < max_entries; size += len) {
+		dma_addr_t dma_addr;
+
+		len = min_t(u64, chunk_size, preferred - size);
+		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
+				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
+		if (!bufs[i])
+			break;
+
+		descs[i].addr = cpu_to_le64(dma_addr);
+		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
+		i++;
+	}
+
+	if (!size)
+		goto out_free_bufs;
+
+	dev->nr_host_mem_descs = i;
+	dev->host_mem_size = size;
+	dev->host_mem_descs = descs;
+	dev->host_mem_descs_dma = descs_dma;
+	dev->host_mem_desc_bufs = bufs;
+	return 0;
+
+out_free_bufs:
+	while (--i >= 0) {
+		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
+
+		dma_free_attrs(dev->dev, size, bufs[i],
+			       le64_to_cpu(descs[i].addr),
+			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
+	}
+
+	kfree(bufs);
+out_free_descs:
+	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
+			descs_dma);
+out:
+	dev->host_mem_descs = NULL;
+	return -ENOMEM;
+}
+
+static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+{
+	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
+	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
+	u64 chunk_size;
+
+	/* start big and work our way down */
+	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
+		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+			if (!min || dev->host_mem_size >= min)
+				return 0;
+			nvme_free_host_mem(dev);
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int nvme_setup_host_mem(struct nvme_dev *dev)
+{
+	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
+	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
+	u64 min = (u64)dev->ctrl.hmmin * 4096;
+	u32 enable_bits = NVME_HOST_MEM_ENABLE;
+	int ret;
+
+	preferred = min(preferred, max);
+	if (min > max) {
+		dev_warn(dev->ctrl.device,
+			"min host memory (%lld MiB) above limit (%d MiB).\n",
+			min >> ilog2(SZ_1M), max_host_mem_size_mb);
+		nvme_free_host_mem(dev);
+		return 0;
+	}
+
+	/*
+	 * If we already have a buffer allocated check if we can reuse it.
+	 */
+	if (dev->host_mem_descs) {
+		if (dev->host_mem_size >= min)
+			enable_bits |= NVME_HOST_MEM_RETURN;
+		else
+			nvme_free_host_mem(dev);
+	}
+
+	if (!dev->host_mem_descs) {
+		if (nvme_alloc_host_mem(dev, min, preferred)) {
+			dev_warn(dev->ctrl.device,
+				"failed to allocate host memory buffer.\n");
+			return 0; /* controller must work without HMB */
+		}
+
+		dev_info(dev->ctrl.device,
+			"allocated %lld MiB host memory buffer.\n",
+			dev->host_mem_size >> ilog2(SZ_1M));
+	}
+
+	ret = nvme_set_host_mem(dev, enable_bits);
+	if (ret)
+		nvme_free_host_mem(dev);
+	return ret;
+}
+
+/*
+ * nirqs is the number of interrupts available for write and read
+ * queues. The core already reserved an interrupt for the admin queue.
+ */
+static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
+{
+	struct nvme_dev *dev = affd->priv;
+	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
+
+	/*
+	 * If there is no interrupt available for queues, ensure that
+	 * the default queue is set to 1. The affinity set size is
+	 * also set to one, but the irq core ignores it for this case.
+	 *
+	 * If only one interrupt is available or 'write_queue' == 0, combine
+	 * write and read queues.
+	 *
+	 * If 'write_queues' > 0, ensure it leaves room for at least one read
+	 * queue.
+	 */
+	if (!nrirqs) {
+		nrirqs = 1;
+		nr_read_queues = 0;
+	} else if (nrirqs == 1 || !nr_write_queues) {
+		nr_read_queues = 0;
+	} else if (nr_write_queues >= nrirqs) {
+		nr_read_queues = 1;
+	} else {
+		nr_read_queues = nrirqs - nr_write_queues;
+	}
+
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
+	affd->nr_sets = nr_read_queues ? 2 : 1;
+}
+
+static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	struct irq_affinity affd = {
+		.pre_vectors	= 1,
+		.calc_sets	= nvme_calc_irq_sets,
+		.priv		= dev,
+	};
+	unsigned int irq_queues, poll_queues;
+
+	/*
+	 * Poll queues don't need interrupts, but we need at least one I/O queue
+	 * left over for non-polled I/O.
+	 */
+	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
+	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
+
+	/*
+	 * Initialize for the single interrupt case, will be updated in
+	 * nvme_calc_irq_sets().
+	 */
+	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+	dev->io_queues[HCTX_TYPE_READ] = 0;
+
+	/*
+	 * We need interrupts for the admin queue and each non-polled I/O queue,
+	 * but some Apple controllers require all queues to use the first
+	 * vector.
+	 */
+	irq_queues = 1;
+	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
+		irq_queues += (nr_io_queues - poll_queues);
+	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+}
+
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
+}
+
+static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
+{
+	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
+static int nvme_setup_io_queues(struct nvme_dev *dev)
+{
+	struct nvme_queue *adminq = &dev->queues[0];
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	unsigned int nr_io_queues;
+	unsigned long size;
+	int result;
+
+	/*
+	 * Sample the module parameters once at reset time so that we have
+	 * stable values to work with.
+	 */
+	dev->nr_write_queues = write_queues;
+	dev->nr_poll_queues = poll_queues;
+
+	/*
+	 * If tags are shared with admin queue (Apple bug), then
+	 * make sure we only use one IO queue.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+		nr_io_queues = 1;
+	else
+		nr_io_queues = min(nvme_max_io_queues(dev),
+				   dev->nr_allocated_queues - 1);
+
+	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
+	if (result < 0)
+		return result;
+
+	if (nr_io_queues == 0)
+		return 0;
+	
+	clear_bit(NVMEQ_ENABLED, &adminq->flags);
+
+	if (dev->cmb_use_sqes) {
+		result = nvme_cmb_qdepth(dev, nr_io_queues,
+				sizeof(struct nvme_command));
+		if (result > 0)
+			dev->q_depth = result;
+		else
+			dev->cmb_use_sqes = false;
+	}
+
+	do {
+		size = db_bar_size(dev, nr_io_queues);
+		result = nvme_remap_bar(dev, size);
+		if (!result)
+			break;
+		if (!--nr_io_queues)
+			return -ENOMEM;
+	} while (1);
+	adminq->q_db = dev->dbs;
+
+ retry:
+	/* Deregister the admin queue's interrupt */
+	pci_free_irq(pdev, 0, adminq);
+
+	/*
+	 * If we enable msix early due to not intx, disable it again before
+	 * setting up the full range we need.
+	 */
+	pci_free_irq_vectors(pdev);
+
+	result = nvme_setup_irqs(dev, nr_io_queues);
+	if (result <= 0)
+		return -EIO;
+
+	dev->num_vecs = result;
+	result = max(result - 1, 1);
+	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
+
+	/*
+	 * Should investigate if there's a performance win from allocating
+	 * more queues than interrupt vectors; it might allow the submission
+	 * path to scale better, even if the receive path is limited by the
+	 * number of interrupts.
+	 */
+	result = queue_request_irq(adminq);
+	if (result)
+		return result;
+	set_bit(NVMEQ_ENABLED, &adminq->flags);
+
+	result = nvme_create_io_queues(dev);
+	if (result || dev->online_queues < 2)
+		return result;
+
+	if (dev->online_queues - 1 < dev->max_qid) {
+		nr_io_queues = dev->online_queues - 1;
+		nvme_disable_io_queues(dev);
+		nvme_suspend_io_queues(dev);
+		goto retry;
+	}
+	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+					dev->io_queues[HCTX_TYPE_DEFAULT],
+					dev->io_queues[HCTX_TYPE_READ],
+					dev->io_queues[HCTX_TYPE_POLL]);
+	return 0;
+}
+
+static void nvme_del_queue_end(struct request *req, blk_status_t error)
+{
+	struct nvme_queue *nvmeq = req->end_io_data;
+
+	blk_mq_free_request(req);
+	complete(&nvmeq->delete_done);
+}
+
+static void nvme_del_cq_end(struct request *req, blk_status_t error)
+{
+	struct nvme_queue *nvmeq = req->end_io_data;
+
+	if (error)
+		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
+
+	nvme_del_queue_end(req, error);
+}
+
+static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
+{
+	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
+	struct request *req;
+	struct nvme_command cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.delete_queue.opcode = opcode;
+	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+
+	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->end_io_data = nvmeq;
+
+	init_completion(&nvmeq->delete_done);
+	blk_execute_rq_nowait(q, NULL, req, false,
+			opcode == nvme_admin_delete_cq ?
+				nvme_del_cq_end : nvme_del_queue_end);
+	return 0;
+}
+
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
+{
+	int nr_queues = dev->online_queues - 1, sent = 0;
+	unsigned long timeout;
+
+ retry:
+	timeout = ADMIN_TIMEOUT;
+	while (nr_queues > 0) {
+		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+			break;
+		nr_queues--;
+		sent++;
+	}
+	while (sent) {
+		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+
+		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
+				timeout);
+		if (timeout == 0)
+			return false;
+
+		sent--;
+		if (nr_queues)
+			goto retry;
+	}
+	return true;
+}
+
+static void nvme_dev_add(struct nvme_dev *dev)
+{
+	int ret;
+
+	if (!dev->ctrl.tagset) {
+		dev->tagset.ops = &nvme_mq_ops;
+		dev->tagset.nr_hw_queues = dev->online_queues - 1;
+		dev->tagset.nr_maps = 2; /* default + read */
+		if (dev->io_queues[HCTX_TYPE_POLL])
+			dev->tagset.nr_maps++;
+		dev->tagset.timeout = NVME_IO_TIMEOUT;
+		dev->tagset.numa_node = dev->ctrl.numa_node;
+		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
+						BLK_MQ_MAX_DEPTH) - 1;
+		dev->tagset.cmd_size = sizeof(struct nvme_iod);
+		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+		dev->tagset.driver_data = dev;
+
+		/*
+		 * Some Apple controllers requires tags to be unique
+		 * across admin and IO queue, so reserve the first 32
+		 * tags of the IO queue.
+		 */
+		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+			dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+
+		ret = blk_mq_alloc_tag_set(&dev->tagset);
+		if (ret) {
+			dev_warn(dev->ctrl.device,
+				"IO queues tagset allocation failed %d\n", ret);
+			return;
+		}
+		dev->ctrl.tagset = &dev->tagset;
+	} else {
+		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
+
+		/* Free previously allocated queues that are no longer usable */
+		nvme_free_queues(dev, dev->online_queues);
+	}
+
+	nvme_dbbuf_set(dev);
+}
+
+static int nvme_pci_enable(struct nvme_dev *dev)
+{
+	int result = -ENOMEM;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (pci_enable_device_mem(pdev))
+		return result;
+
+	pci_set_master(pdev);
+
+	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
+		goto disable;
+
+	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
+		result = -ENODEV;
+		goto disable;
+	}
+
+	/*
+	 * Some devices and/or platforms don't advertise or work with INTx
+	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
+	 * adjust this later.
+	 */
+	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+	if (result < 0)
+		return result;
+
+	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
+
+	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
+				io_queue_depth);
+	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
+	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
+	dev->dbs = dev->bar + 4096;
+
+	/*
+	 * Some Apple controllers require a non-standard SQE size.
+	 * Interestingly they also seem to ignore the CC:IOSQES register
+	 * so we don't bother updating it here.
+	 */
+	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
+		dev->io_sqes = 7;
+	else
+		dev->io_sqes = NVME_NVM_IOSQES;
+
+	/*
+	 * Temporary fix for the Apple controller found in the MacBook8,1 and
+	 * some MacBook7,1 to avoid controller resets and data loss.
+	 */
+	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
+		dev->q_depth = 2;
+		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
+			"set queue depth=%u to work around controller resets\n",
+			dev->q_depth);
+	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
+		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
+		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
+		dev->q_depth = 64;
+		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
+                        "set queue depth=%u\n", dev->q_depth);
+	}
+
+	/*
+	 * Controllers with the shared tags quirk need the IO queue to be
+	 * big enough so that we get 32 tags for the admin queue
+	 */
+	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
+	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
+		dev->q_depth = NVME_AQ_DEPTH + 2;
+		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
+			 dev->q_depth);
+	}
+
+
+	nvme_map_cmb(dev);
+
+	pci_enable_pcie_error_reporting(pdev);
+	pci_save_state(pdev);
+	return 0;
+
+ disable:
+	pci_disable_device(pdev);
+	return result;
+}
+
+static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+	if (dev->bar)
+		iounmap(dev->bar);
+	pci_release_mem_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	pci_free_irq_vectors(pdev);
+
+	if (pci_is_enabled(pdev)) {
+		pci_disable_pcie_error_reporting(pdev);
+		pci_disable_device(pdev);
+	}
+}
+
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
+{
+	bool dead = true, freeze = false;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	mutex_lock(&dev->shutdown_lock);
+	if (pci_is_enabled(pdev)) {
+		u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+		if (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+			freeze = true;
+			nvme_start_freeze(&dev->ctrl);
+		}
+		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
+			pdev->error_state  != pci_channel_io_normal);
+	}
+
+	/*
+	 * Give the controller a chance to complete all entered requests if
+	 * doing a safe shutdown.
+	 */
+	if (!dead && shutdown && freeze)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+
+	nvme_stop_queues(&dev->ctrl);
+
+	if (!dead && dev->ctrl.queue_count > 0) {
+		nvme_disable_io_queues(dev);
+		nvme_disable_admin_queue(dev, shutdown);
+	}
+	nvme_suspend_io_queues(dev);
+	nvme_suspend_queue(&dev->queues[0]);
+	nvme_pci_disable(dev);
+	nvme_reap_pending_cqes(dev);
+
+	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
+	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+	blk_mq_tagset_wait_completed_request(&dev->tagset);
+	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
+
+	/*
+	 * The driver will not be starting up queues again if shutting down so
+	 * must flush all entered requests to their failed completion to avoid
+	 * deadlocking blk-mq hot-cpu notifier.
+	 */
+	if (shutdown) {
+		nvme_start_queues(&dev->ctrl);
+		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
+			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+	}
+	mutex_unlock(&dev->shutdown_lock);
+}
+
+static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
+{
+	if (!nvme_wait_reset(&dev->ctrl))
+		return -EBUSY;
+	nvme_dev_disable(dev, shutdown);
+	return 0;
+}
+
+static int nvme_setup_prp_pools(struct nvme_dev *dev)
+{
+	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
+						NVME_CTRL_PAGE_SIZE,
+						NVME_CTRL_PAGE_SIZE, 0);
+	if (!dev->prp_page_pool)
+		return -ENOMEM;
+
+	/* Optimisation for I/Os between 4k and 128k */
+	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
+						256, 256, 0);
+	if (!dev->prp_small_pool) {
+		dma_pool_destroy(dev->prp_page_pool);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+	dma_pool_destroy(dev->prp_page_pool);
+	dma_pool_destroy(dev->prp_small_pool);
+}
+
+static void nvme_free_tagset(struct nvme_dev *dev)
+{
+	if (dev->tagset.tags)
+		blk_mq_free_tag_set(&dev->tagset);
+	dev->ctrl.tagset = NULL;
+}
+
+static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+	nvme_dbbuf_dma_free(dev);
+	nvme_free_tagset(dev);
+	if (dev->ctrl.admin_q)
+		blk_put_queue(dev->ctrl.admin_q);
+	free_opal_dev(dev->ctrl.opal_dev);
+	mempool_destroy(dev->iod_mempool);
+	put_device(dev->dev);
+	kfree(dev->queues);
+	kfree(dev);
+}
+
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
+{
+	/*
+	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+	 * may be holding this pci_dev's device lock.
+	 */
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+	nvme_get_ctrl(&dev->ctrl);
+	nvme_dev_disable(dev, false);
+	nvme_kill_queues(&dev->ctrl);
+	if (!queue_work(nvme_wq, &dev->remove_work))
+		nvme_put_ctrl(&dev->ctrl);
+}
+
+static void nvme_reset_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, ctrl.reset_work);
+	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
+	int result;
+
+	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
+			 dev->ctrl.state);
+		result = -ENODEV;
+		goto out;
+	}
+
+	/*
+	 * If we're called to reset a live controller first shut it down before
+	 * moving on.
+	 */
+	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
+		nvme_dev_disable(dev, false);
+	nvme_sync_queues(&dev->ctrl);
+
+	mutex_lock(&dev->shutdown_lock);
+	result = nvme_pci_enable(dev);
+	if (result)
+		goto out_unlock;
+
+	result = nvme_pci_configure_admin_queue(dev);
+	if (result)
+		goto out_unlock;
+
+	result = nvme_alloc_admin_tags(dev);
+	if (result)
+		goto out_unlock;
+
+	/*
+	 * Limit the max command size to prevent iod->sg allocations going
+	 * over a single page.
+	 */
+	dev->ctrl.max_hw_sectors = min_t(u32,
+		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
+	dev->ctrl.max_segments = NVME_MAX_SEGS;
+
+	/*
+	 * Don't limit the IOMMU merged segment size.
+	 */
+	dma_set_max_seg_size(dev->dev, 0xffffffff);
+	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
+
+	mutex_unlock(&dev->shutdown_lock);
+
+	/*
+	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
+	 * initializing procedure here.
+	 */
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+		dev_warn(dev->ctrl.device,
+			"failed to mark controller CONNECTING\n");
+		result = -EBUSY;
+		goto out;
+	}
+
+	/*
+	 * We do not support an SGL for metadata (yet), so we are limited to a
+	 * single integrity segment for the separate metadata pointer.
+	 */
+	dev->ctrl.max_integrity_segments = 1;
+
+	result = nvme_init_identify(&dev->ctrl);
+	if (result)
+		goto out;
+
+	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
+		if (!dev->ctrl.opal_dev)
+			dev->ctrl.opal_dev =
+				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
+		else if (was_suspend)
+			opal_unlock_from_suspend(dev->ctrl.opal_dev);
+	} else {
+		free_opal_dev(dev->ctrl.opal_dev);
+		dev->ctrl.opal_dev = NULL;
+	}
+
+	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+		result = nvme_dbbuf_dma_alloc(dev);
+		if (result)
+			dev_warn(dev->dev,
+				 "unable to allocate dma for dbbuf\n");
+	}
+
+	if (dev->ctrl.hmpre) {
+		result = nvme_setup_host_mem(dev);
+		if (result < 0)
+			goto out;
+	}
+
+	result = nvme_setup_io_queues(dev);
+	if (result)
+		goto out;
+
+	/*
+	 * Keep the controller around but remove all namespaces if we don't have
+	 * any working I/O queue.
+	 */
+	if (dev->online_queues < 2) {
+		dev_warn(dev->ctrl.device, "IO queues not created\n");
+		nvme_kill_queues(&dev->ctrl);
+		nvme_remove_namespaces(&dev->ctrl);
+		nvme_free_tagset(dev);
+	} else {
+		nvme_start_queues(&dev->ctrl);
+		nvme_wait_freeze(&dev->ctrl);
+		nvme_dev_add(dev);
+		nvme_unfreeze(&dev->ctrl);
+	}
+
+	/*
+	 * If only admin queue live, keep it to do further investigation or
+	 * recovery.
+	 */
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+		dev_warn(dev->ctrl.device,
+			"failed to mark controller live state\n");
+		result = -ENODEV;
+		goto out;
+	}
+
+	nvme_start_ctrl(&dev->ctrl);
+	return;
+
+ out_unlock:
+	mutex_unlock(&dev->shutdown_lock);
+ out:
+	if (result)
+		dev_warn(dev->ctrl.device,
+			 "Removing after probe failure status: %d\n", result);
+	nvme_remove_dead_ctrl(dev);
+}
+
+static void nvme_remove_dead_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (pci_get_drvdata(pdev))
+		device_release_driver(&pdev->dev);
+	nvme_put_ctrl(&dev->ctrl);
+}
+
+static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
+{
+	*val = readl(to_nvme_dev(ctrl)->bar + off);
+	return 0;
+}
+
+static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
+{
+	writel(val, to_nvme_dev(ctrl)->bar + off);
+	return 0;
+}
+
+static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
+{
+	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
+	return 0;
+}
+
+static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
+{
+	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
+
+	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
+}
+
+static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
+	.name			= "pcie",
+	.module			= THIS_MODULE,
+	.flags			= NVME_F_METADATA_SUPPORTED |
+				  NVME_F_PCI_P2PDMA,
+	.reg_read32		= nvme_pci_reg_read32,
+	.reg_write32		= nvme_pci_reg_write32,
+	.reg_read64		= nvme_pci_reg_read64,
+	.free_ctrl		= nvme_pci_free_ctrl,
+	.submit_async_event	= nvme_pci_submit_async_event,
+	.get_address		= nvme_pci_get_address,
+};
+
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+	if (pci_request_mem_regions(pdev, "nvme"))
+		return -ENODEV;
+
+	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
+		goto release;
+
+	return 0;
+  release:
+	pci_release_mem_regions(pdev);
+	return -ENODEV;
+}
+
+static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
+{
+	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
+		/*
+		 * Several Samsung devices seem to drop off the PCIe bus
+		 * randomly when APST is on and uses the deepest sleep state.
+		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
+		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
+		 * 950 PRO 256GB", but it seems to be restricted to two Dell
+		 * laptops.
+		 */
+		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
+		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
+		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
+			return NVME_QUIRK_NO_DEEPEST_PS;
+	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
+		/*
+		 * Samsung SSD 960 EVO drops off the PCIe bus after system
+		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
+		 * within few minutes after bootup on a Coffee Lake board -
+		 * ASUS PRIME Z370-A
+		 */
+		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
+		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
+		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
+			return NVME_QUIRK_NO_APST;
+	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
+		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
+		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
+		/*
+		 * Forcing to use host managed nvme power settings for
+		 * lowest idle power with quick resume latency on
+		 * Samsung and Toshiba SSDs based on suspend behavior
+		 * on Coffee Lake board for LENOVO C640
+		 */
+		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
+		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
+			return NVME_QUIRK_SIMPLE_SUSPEND;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_ACPI
+static bool nvme_acpi_storage_d3(struct pci_dev *dev)
+{
+	struct acpi_device *adev = ACPI_COMPANION(&dev->dev);
+	u8 val;
+
+	/*
+	 * Look for _DSD property specifying that the storage device on the port
+	 * must use D3 to support deep platform power savings during
+	 * suspend-to-idle.
+	 */
+
+	if (!adev)
+		return false;
+	if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
+			&val))
+		return false;
+	return val == 1;
+}
+#else
+static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
+{
+	return false;
+}
+#endif /* CONFIG_ACPI */
+
+static void nvme_async_probe(void *data, async_cookie_t cookie)
+{
+	struct nvme_dev *dev = data;
+
+	flush_work(&dev->ctrl.reset_work);
+	flush_work(&dev->ctrl.scan_work);
+	nvme_put_ctrl(&dev->ctrl);
+}
+
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	int node, result = -ENOMEM;
+	struct nvme_dev *dev;
+	unsigned long quirks = id->driver_data;
+	size_t alloc_size;
+
+	node = dev_to_node(&pdev->dev);
+	if (node == NUMA_NO_NODE)
+		set_dev_node(&pdev->dev, first_memory_node);
+
+	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->nr_write_queues = write_queues;
+	dev->nr_poll_queues = poll_queues;
+	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
+	dev->queues = kcalloc_node(dev->nr_allocated_queues,
+			sizeof(struct nvme_queue), GFP_KERNEL, node);
+	if (!dev->queues)
+		goto free;
+
+	dev->dev = get_device(&pdev->dev);
+	pci_set_drvdata(pdev, dev);
+
+	result = nvme_dev_map(dev);
+	if (result)
+		goto put_pci;
+
+	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
+	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+	mutex_init(&dev->shutdown_lock);
+
+	result = nvme_setup_prp_pools(dev);
+	if (result)
+		goto unmap;
+
+	quirks |= check_vendor_combination_bug(pdev);
+
+	if (!noacpi && nvme_acpi_storage_d3(pdev)) {
+		/*
+		 * Some systems use a bios work around to ask for D3 on
+		 * platforms that support kernel managed suspend.
+		 */
+		dev_info(&pdev->dev,
+			 "platform quirk: setting simple suspend\n");
+		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
+	}
+
+	/*
+	 * Double check that our mempool alloc size will cover the biggest
+	 * command we support.
+	 */
+	alloc_size = nvme_pci_iod_alloc_size();
+	WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+
+	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
+						mempool_kfree,
+						(void *) alloc_size,
+						GFP_KERNEL, node);
+	if (!dev->iod_mempool) {
+		result = -ENOMEM;
+		goto release_pools;
+	}
+
+	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
+			quirks);
+	if (result)
+		goto release_mempool;
+
+	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+
+	nvme_reset_ctrl(&dev->ctrl);
+	async_schedule(nvme_async_probe, dev);
+
+	return 0;
+
+ release_mempool:
+	mempool_destroy(dev->iod_mempool);
+ release_pools:
+	nvme_release_prp_pools(dev);
+ unmap:
+	nvme_dev_unmap(dev);
+ put_pci:
+	put_device(dev->dev);
+ free:
+	kfree(dev->queues);
+	kfree(dev);
+	return result;
+}
+
+static void nvme_reset_prepare(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	/*
+	 * We don't need to check the return value from waiting for the reset
+	 * state as pci_dev device lock is held, making it impossible to race
+	 * with ->remove().
+	 */
+	nvme_disable_prepare_reset(dev, false);
+	nvme_sync_queues(&dev->ctrl);
+}
+
+static void nvme_reset_done(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	if (!nvme_try_sched_reset(&dev->ctrl))
+		flush_work(&dev->ctrl.reset_work);
+}
+
+static void nvme_shutdown(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	nvme_disable_prepare_reset(dev, true);
+}
+
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
+static void nvme_remove(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+	pci_set_drvdata(pdev, NULL);
+
+	if (!pci_device_is_present(pdev)) {
+		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
+		nvme_dev_disable(dev, true);
+	}
+
+	flush_work(&dev->ctrl.reset_work);
+	nvme_stop_ctrl(&dev->ctrl);
+	nvme_remove_namespaces(&dev->ctrl);
+	nvme_dev_disable(dev, true);
+	nvme_release_cmb(dev);
+	nvme_free_host_mem(dev);
+	nvme_dev_remove_admin(dev);
+	nvme_free_queues(dev, 0);
+	nvme_release_prp_pools(dev);
+	nvme_dev_unmap(dev);
+	nvme_uninit_ctrl(&dev->ctrl);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
+{
+	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
+}
+
+static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
+{
+	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
+}
+
+static int nvme_resume(struct device *dev)
+{
+	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+	struct nvme_ctrl *ctrl = &ndev->ctrl;
+
+	if (ndev->last_ps == U32_MAX ||
+	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
+		return nvme_try_sched_reset(&ndev->ctrl);
+	return 0;
+}
+
+static int nvme_suspend(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+	struct nvme_ctrl *ctrl = &ndev->ctrl;
+	int ret = -EBUSY;
+
+	ndev->last_ps = U32_MAX;
+
+	/*
+	 * The platform does not remove power for a kernel managed suspend so
+	 * use host managed nvme power settings for lowest idle power if
+	 * possible. This should have quicker resume latency than a full device
+	 * shutdown.  But if the firmware is involved after the suspend or the
+	 * device does not support any non-default power states, shut down the
+	 * device fully.
+	 *
+	 * If ASPM is not enabled for the device, shut down the device and allow
+	 * the PCI bus layer to put it into D3 in order to take the PCIe link
+	 * down, so as to allow the platform to achieve its minimum low-power
+	 * state (which may not be possible if the link is up).
+	 *
+	 * If a host memory buffer is enabled, shut down the device as the NVMe
+	 * specification allows the device to access the host memory buffer in
+	 * host DRAM from all power states, but hosts will fail access to DRAM
+	 * during S3.
+	 */
+	if (pm_suspend_via_firmware() || !ctrl->npss ||
+	    !pcie_aspm_enabled(pdev) ||
+	    ndev->nr_host_mem_descs ||
+	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
+		return nvme_disable_prepare_reset(ndev, true);
+
+	nvme_start_freeze(ctrl);
+	nvme_wait_freeze(ctrl);
+	nvme_sync_queues(ctrl);
+
+	if (ctrl->state != NVME_CTRL_LIVE)
+		goto unfreeze;
+
+	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
+	if (ret < 0)
+		goto unfreeze;
+
+	/*
+	 * A saved state prevents pci pm from generically controlling the
+	 * device's power. If we're using protocol specific settings, we don't
+	 * want pci interfering.
+	 */
+	pci_save_state(pdev);
+
+	ret = nvme_set_power_state(ctrl, ctrl->npss);
+	if (ret < 0)
+		goto unfreeze;
+
+	if (ret) {
+		/* discard the saved state */
+		pci_load_saved_state(pdev, NULL);
+
+		/*
+		 * Clearing npss forces a controller reset on resume. The
+		 * correct value will be rediscovered then.
+		 */
+		ret = nvme_disable_prepare_reset(ndev, true);
+		ctrl->npss = 0;
+	}
+unfreeze:
+	nvme_unfreeze(ctrl);
+	return ret;
+}
+
+static int nvme_simple_suspend(struct device *dev)
+{
+	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+
+	return nvme_disable_prepare_reset(ndev, true);
+}
+
+static int nvme_simple_resume(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct nvme_dev *ndev = pci_get_drvdata(pdev);
+
+	return nvme_try_sched_reset(&ndev->ctrl);
+}
+
+static const struct dev_pm_ops nvme_dev_pm_ops = {
+	.suspend	= nvme_suspend,
+	.resume		= nvme_resume,
+	.freeze		= nvme_simple_suspend,
+	.thaw		= nvme_simple_resume,
+	.poweroff	= nvme_simple_suspend,
+	.restore	= nvme_simple_resume,
+};
+#endif /* CONFIG_PM_SLEEP */
+
+static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
+						pci_channel_state_t state)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	/*
+	 * A frozen channel requires a reset. When detected, this method will
+	 * shutdown the controller to quiesce. The controller will be restarted
+	 * after the slot reset through driver's slot_reset callback.
+	 */
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+	case pci_channel_io_frozen:
+		dev_warn(dev->ctrl.device,
+			"frozen state error detected, reset controller\n");
+		nvme_dev_disable(dev, false);
+		return PCI_ERS_RESULT_NEED_RESET;
+	case pci_channel_io_perm_failure:
+		dev_warn(dev->ctrl.device,
+			"failure state error detected, request disconnect\n");
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	dev_info(dev->ctrl.device, "restart after slot reset\n");
+	pci_restore_state(pdev);
+	nvme_reset_ctrl(&dev->ctrl);
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static void nvme_error_resume(struct pci_dev *pdev)
+{
+	struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+	flush_work(&dev->ctrl.reset_work);
+}
+
+static const struct pci_error_handlers nvme_err_handler = {
+	.error_detected	= nvme_error_detected,
+	.slot_reset	= nvme_slot_reset,
+	.resume		= nvme_error_resume,
+	.reset_prepare	= nvme_reset_prepare,
+	.reset_done	= nvme_reset_done,
+};
+
+static const struct pci_device_id nvme_id_table[] = {
+	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
+		.driver_data = NVME_QUIRK_STRIPE_SIZE |
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
+	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
+		.driver_data = NVME_QUIRK_STRIPE_SIZE |
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
+	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
+		.driver_data = NVME_QUIRK_STRIPE_SIZE |
+				NVME_QUIRK_DEALLOCATE_ZEROES |
+				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
+		.driver_data = NVME_QUIRK_STRIPE_SIZE |
+				NVME_QUIRK_DEALLOCATE_ZEROES, },
+	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_MEDIUM_PRIO_SQ |
+				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
+				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
+		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
+		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
+				NVME_QUIRK_DISABLE_WRITE_ZEROES |
+				NVME_QUIRK_BOGUS_NID, },
+	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
+		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
+		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
+	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+				NVME_QUIRK_NO_NS_DESC_LIST, },
+	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
+		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+				NVME_QUIRK_DISABLE_WRITE_ZEROES|
+				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
+		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
+				NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
+		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
+				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
+	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
+	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
+	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
+		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
+				NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
+		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
+		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
+				NVME_QUIRK_128_BYTES_SQES |
+				NVME_QUIRK_SHARED_TAGS |
+				NVME_QUIRK_SKIP_CID_GEN },
+	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, nvme_id_table);
+
+static struct pci_driver nvme_driver = {
+	.name		= "nvme",
+	.id_table	= nvme_id_table,
+	.probe		= nvme_probe,
+	.remove		= nvme_remove,
+	.shutdown	= nvme_shutdown,
+#ifdef CONFIG_PM_SLEEP
+	.driver		= {
+		.pm	= &nvme_dev_pm_ops,
+	},
+#endif
+	.sriov_configure = pci_sriov_configure_simple,
+	.err_handler	= &nvme_err_handler,
+};
+
+static int __init nvme_init(void)
+{
+	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
+
+	return pci_register_driver(&nvme_driver);
+}
+
+static void __exit nvme_exit(void)
+{
+	pci_unregister_driver(&nvme_driver);
+	flush_workqueue(nvme_wq);
+}
+
+MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+module_init(nvme_init);
+module_exit(nvme_exit);
--- a/feed/kmod-nvme/src/trace.h
+++ b/feed/kmod-nvme/src/trace.h
@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NVM Express device driver tracepoints
+ * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nvme
+
+#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NVME_H
+
+#include <linux/nvme.h>
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "nvme.h"
+
+const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
+		u8 *cdw10);
+const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
+		u8 *cdw10);
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype,
+		u8 *spc);
+
+#define parse_nvme_cmd(qid, opcode, fctype, cdw10)			\
+	((opcode) == nvme_fabrics_command ?				\
+	 nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) :		\
+	((qid) ?							\
+	 nvme_trace_parse_nvm_cmd(p, opcode, cdw10) :			\
+	 nvme_trace_parse_admin_cmd(p, opcode, cdw10)))
+
+const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
+#define __print_disk_name(name)				\
+	nvme_trace_disk_name(p, name)
+
+#ifndef TRACE_HEADER_MULTI_READ
+static inline void __assign_disk_name(char *name, struct gendisk *disk)
+{
+	if (disk)
+		memcpy(name, disk->disk_name, DISK_NAME_LEN);
+	else
+		memset(name, 0, DISK_NAME_LEN);
+}
+#endif
+
+TRACE_EVENT(nvme_setup_cmd,
+	    TP_PROTO(struct request *req, struct nvme_command *cmd),
+	    TP_ARGS(req, cmd),
+	    TP_STRUCT__entry(
+		__array(char, disk, DISK_NAME_LEN)
+		__field(int, ctrl_id)
+		__field(int, qid)
+		__field(u8, opcode)
+		__field(u8, flags)
+		__field(u8, fctype)
+		__field(u16, cid)
+		__field(u32, nsid)
+		__field(bool, metadata)
+		__array(u8, cdw10, 24)
+	    ),
+	    TP_fast_assign(
+		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
+		__entry->qid = nvme_req_qid(req);
+		__entry->opcode = cmd->common.opcode;
+		__entry->flags = cmd->common.flags;
+		__entry->cid = cmd->common.command_id;
+		__entry->nsid = le32_to_cpu(cmd->common.nsid);
+		__entry->metadata = !!blk_integrity_rq(req);
+		__entry->fctype = cmd->fabrics.fctype;
+		__assign_disk_name(__entry->disk, req->rq_disk);
+		memcpy(__entry->cdw10, &cmd->common.cdw10,
+			sizeof(__entry->cdw10));
+	    ),
+	    TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)",
+		      __entry->ctrl_id, __print_disk_name(__entry->disk),
+		      __entry->qid, __entry->cid, __entry->nsid,
+		      __entry->flags, __entry->metadata,
+		      show_opcode_name(__entry->qid, __entry->opcode,
+				__entry->fctype),
+		      parse_nvme_cmd(__entry->qid, __entry->opcode,
+				__entry->fctype, __entry->cdw10))
+);
+
+TRACE_EVENT(nvme_complete_rq,
+	    TP_PROTO(struct request *req),
+	    TP_ARGS(req),
+	    TP_STRUCT__entry(
+		__array(char, disk, DISK_NAME_LEN)
+		__field(int, ctrl_id)
+		__field(int, qid)
+		__field(int, cid)
+		__field(u64, result)
+		__field(u8, retries)
+		__field(u8, flags)
+		__field(u16, status)
+	    ),
+	    TP_fast_assign(
+		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
+		__entry->qid = nvme_req_qid(req);
+		__entry->cid = nvme_req(req)->cmd->common.command_id;
+		__entry->result = le64_to_cpu(nvme_req(req)->result.u64);
+		__entry->retries = nvme_req(req)->retries;
+		__entry->flags = nvme_req(req)->flags;
+		__entry->status = nvme_req(req)->status;
+		__assign_disk_name(__entry->disk, req->rq_disk);
+	    ),
+	    TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x",
+		      __entry->ctrl_id, __print_disk_name(__entry->disk),
+		      __entry->qid, __entry->cid, __entry->result,
+		      __entry->retries, __entry->flags, __entry->status)
+
+);
+
+#define aer_name(aer) { aer, #aer }
+
+TRACE_EVENT(nvme_async_event,
+	TP_PROTO(struct nvme_ctrl *ctrl, u32 result),
+	TP_ARGS(ctrl, result),
+	TP_STRUCT__entry(
+		__field(int, ctrl_id)
+		__field(u32, result)
+	),
+	TP_fast_assign(
+		__entry->ctrl_id = ctrl->instance;
+		__entry->result = result;
+	),
+	TP_printk("nvme%d: NVME_AEN=%#08x [%s]",
+		__entry->ctrl_id, __entry->result,
+		__print_symbolic(__entry->result,
+		aer_name(NVME_AER_NOTICE_NS_CHANGED),
+		aer_name(NVME_AER_NOTICE_ANA),
+		aer_name(NVME_AER_NOTICE_FW_ACT_STARTING),
+		aer_name(NVME_AER_NOTICE_DISC_CHANGED),
+		aer_name(NVME_AER_ERROR),
+		aer_name(NVME_AER_SMART),
+		aer_name(NVME_AER_CSS),
+		aer_name(NVME_AER_VS))
+	)
+);
+
+#undef aer_name
+
+TRACE_EVENT(nvme_sq,
+	TP_PROTO(struct request *req, __le16 sq_head, int sq_tail),
+	TP_ARGS(req, sq_head, sq_tail),
+	TP_STRUCT__entry(
+		__field(int, ctrl_id)
+		__array(char, disk, DISK_NAME_LEN)
+		__field(int, qid)
+		__field(u16, sq_head)
+		__field(u16, sq_tail)
+	),
+	TP_fast_assign(
+		__entry->ctrl_id = nvme_req(req)->ctrl->instance;
+		__assign_disk_name(__entry->disk, req->rq_disk);
+		__entry->qid = nvme_req_qid(req);
+		__entry->sq_head = le16_to_cpu(sq_head);
+		__entry->sq_tail = sq_tail;
+	),
+	TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u",
+		__entry->ctrl_id, __print_disk_name(__entry->disk),
+		__entry->qid, __entry->sq_head, __entry->sq_tail
+	)
+);
+
+#endif /* _TRACE_NVME_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- a/feed/kmod-rtc-pcf85063/Makefile
+++ b/feed/kmod-rtc-pcf85063/Makefile
@ -0,0 +1,27 @@
+include $(TOPDIR)/rules.mk
+include $(INCLUDE_DIR)/kernel.mk
+
+PKG_NAME:=rtc-pcf85063
+
+include $(INCLUDE_DIR)/package.mk
+
+define KernelPackage/$(PKG_NAME)
+  SUBMENU:=Other modules
+  TITLE:=Philips PCF85063 RTC support
+  DEPENDS:=+kmod-i2c-core +kmod-regmap-i2c
+  AUTOLOAD:=$(call AutoProbe,rtc-pcf85063)
+  FILES:=$(PKG_BUILD_DIR)/rtc-pcf85063.ko
+  KCONFIG:=
+endef
+
+define KernelPackage/$(PKG_NAME)/description
+ Kernel module for Philips PCF85063 RTC chip.
+endef
+
+EXTRA_KCONFIG:= \
+        CONFIG_RTC_DRV_PCF85063=m \
+        CONFIG_RTC_CLASS=y
+
+include ../kmod.mk
+
+$(eval $(call KernelPackage,$(PKG_NAME)))
--- a/feed/kmod-rtc-pcf85063/src/Makefile
+++ b/feed/kmod-rtc-pcf85063/src/Makefile
@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for RTC class/drivers.
+#
+
+ccflags-$(CONFIG_RTC_DEBUG)	:= -DDEBUG
+
+obj-$(CONFIG_RTC_DRV_PCF85063)	+= rtc-pcf85063.o
--- a/feed/kmod-rtc-pcf85063/src/rtc-pcf85063.c
+++ b/feed/kmod-rtc-pcf85063/src/rtc-pcf85063.c
@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * An I2C driver for the PCF85063 RTC
+ * Copyright 2014 Rose Technology
+ *
+ * Author: Søren Andersen <san@rosetechnology.dk>
+ * Maintainers: http://www.nslu2-linux.org/
+ *
+ * Copyright (C) 2019 Micro Crystal AG
+ * Author: Alexandre Belloni <alexandre.belloni@bootlin.com>
+ */
+#include <linux/clk-provider.h>
+#include <linux/i2c.h>
+#include <linux/bcd.h>
+#include <linux/rtc.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/pm_wakeirq.h>
+#include <linux/regmap.h>
+
+/*
+ * Information for this driver was pulled from the following datasheets.
+ *
+ *  https://www.nxp.com/documents/data_sheet/PCF85063A.pdf
+ *  https://www.nxp.com/documents/data_sheet/PCF85063TP.pdf
+ *
+ *  PCF85063A -- Rev. 6 — 18 November 2015
+ *  PCF85063TP -- Rev. 4 — 6 May 2015
+ *
+ *  https://www.microcrystal.com/fileadmin/Media/Products/RTC/App.Manual/RV-8263-C7_App-Manual.pdf
+ *  RV8263 -- Rev. 1.0 — January 2019
+ */
+
+#define PCF85063_REG_CTRL1		0x00 /* status */
+#define PCF85063_REG_CTRL1_CAP_SEL	BIT(0)
+#define PCF85063_REG_CTRL1_STOP		BIT(5)
+
+#define PCF85063_REG_CTRL2		0x01
+#define PCF85063_CTRL2_AF		BIT(6)
+#define PCF85063_CTRL2_AIE		BIT(7)
+
+#define PCF85063_REG_OFFSET		0x02
+#define PCF85063_OFFSET_SIGN_BIT	6	/* 2's complement sign bit */
+#define PCF85063_OFFSET_MODE		BIT(7)
+#define PCF85063_OFFSET_STEP0		4340
+#define PCF85063_OFFSET_STEP1		4069
+
+#define PCF85063_REG_CLKO_F_MASK	0x07 /* frequency mask */
+#define PCF85063_REG_CLKO_F_32768HZ	0x00
+#define PCF85063_REG_CLKO_F_OFF		0x07
+
+#define PCF85063_REG_RAM		0x03
+
+#define PCF85063_REG_SC			0x04 /* datetime */
+#define PCF85063_REG_SC_OS		0x80
+
+#define PCF85063_REG_ALM_S		0x0b
+#define PCF85063_AEN			BIT(7)
+
+struct pcf85063_config {
+	struct regmap_config regmap;
+	unsigned has_alarms:1;
+	unsigned force_cap_7000:1;
+};
+
+struct pcf85063 {
+	struct rtc_device	*rtc;
+	struct regmap		*regmap;
+#ifdef CONFIG_COMMON_CLK
+	struct clk_hw		clkout_hw;
+#endif
+};
+
+static int pcf85063_rtc_read_time(struct device *dev, struct rtc_time *tm)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	int rc;
+	u8 regs[7];
+
+	/*
+	 * while reading, the time/date registers are blocked and not updated
+	 * anymore until the access is finished. To not lose a second
+	 * event, the access must be finished within one second. So, read all
+	 * time/date registers in one turn.
+	 */
+	rc = regmap_bulk_read(pcf85063->regmap, PCF85063_REG_SC, regs,
+			      sizeof(regs));
+	if (rc)
+		return rc;
+
+	/* if the clock has lost its power it makes no sense to use its time */
+	if (regs[0] & PCF85063_REG_SC_OS) {
+		dev_warn(&pcf85063->rtc->dev, "Power loss detected, invalid time\n");
+		return -EINVAL;
+	}
+
+	tm->tm_sec = bcd2bin(regs[0] & 0x7F);
+	tm->tm_min = bcd2bin(regs[1] & 0x7F);
+	tm->tm_hour = bcd2bin(regs[2] & 0x3F); /* rtc hr 0-23 */
+	tm->tm_mday = bcd2bin(regs[3] & 0x3F);
+	tm->tm_wday = regs[4] & 0x07;
+	tm->tm_mon = bcd2bin(regs[5] & 0x1F) - 1; /* rtc mn 1-12 */
+	tm->tm_year = bcd2bin(regs[6]);
+	tm->tm_year += 100;
+
+	return 0;
+}
+
+static int pcf85063_rtc_set_time(struct device *dev, struct rtc_time *tm)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	int rc;
+	u8 regs[7];
+
+	/*
+	 * to accurately set the time, reset the divider chain and keep it in
+	 * reset state until all time/date registers are written
+	 */
+	rc = regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL1,
+				PCF85063_REG_CTRL1_STOP,
+				PCF85063_REG_CTRL1_STOP);
+	if (rc)
+		return rc;
+
+	/* hours, minutes and seconds */
+	regs[0] = bin2bcd(tm->tm_sec) & 0x7F; /* clear OS flag */
+
+	regs[1] = bin2bcd(tm->tm_min);
+	regs[2] = bin2bcd(tm->tm_hour);
+
+	/* Day of month, 1 - 31 */
+	regs[3] = bin2bcd(tm->tm_mday);
+
+	/* Day, 0 - 6 */
+	regs[4] = tm->tm_wday & 0x07;
+
+	/* month, 1 - 12 */
+	regs[5] = bin2bcd(tm->tm_mon + 1);
+
+	/* year and century */
+	regs[6] = bin2bcd(tm->tm_year - 100);
+
+	/* write all registers at once */
+	rc = regmap_bulk_write(pcf85063->regmap, PCF85063_REG_SC,
+			       regs, sizeof(regs));
+	if (rc)
+		return rc;
+
+	/*
+	 * Write the control register as a separate action since the size of
+	 * the register space is different between the PCF85063TP and
+	 * PCF85063A devices.  The rollover point can not be used.
+	 */
+	return regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL1,
+				  PCF85063_REG_CTRL1_STOP, 0);
+}
+
+static int pcf85063_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	u8 buf[4];
+	unsigned int val;
+	int ret;
+
+	ret = regmap_bulk_read(pcf85063->regmap, PCF85063_REG_ALM_S,
+			       buf, sizeof(buf));
+	if (ret)
+		return ret;
+
+	alrm->time.tm_sec = bcd2bin(buf[0]);
+	alrm->time.tm_min = bcd2bin(buf[1]);
+	alrm->time.tm_hour = bcd2bin(buf[2]);
+	alrm->time.tm_mday = bcd2bin(buf[3]);
+
+	ret = regmap_read(pcf85063->regmap, PCF85063_REG_CTRL2, &val);
+	if (ret)
+		return ret;
+
+	alrm->enabled =  !!(val & PCF85063_CTRL2_AIE);
+
+	return 0;
+}
+
+static int pcf85063_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	u8 buf[5];
+	int ret;
+
+	buf[0] = bin2bcd(alrm->time.tm_sec);
+	buf[1] = bin2bcd(alrm->time.tm_min);
+	buf[2] = bin2bcd(alrm->time.tm_hour);
+	buf[3] = bin2bcd(alrm->time.tm_mday);
+	buf[4] = PCF85063_AEN; /* Do not match on week day */
+
+	ret = regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL2,
+				 PCF85063_CTRL2_AIE | PCF85063_CTRL2_AF, 0);
+	if (ret)
+		return ret;
+
+	ret = regmap_bulk_write(pcf85063->regmap, PCF85063_REG_ALM_S,
+				buf, sizeof(buf));
+	if (ret)
+		return ret;
+
+	return regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL2,
+				  PCF85063_CTRL2_AIE | PCF85063_CTRL2_AF,
+				  alrm->enabled ? PCF85063_CTRL2_AIE | PCF85063_CTRL2_AF : PCF85063_CTRL2_AF);
+}
+
+static int pcf85063_rtc_alarm_irq_enable(struct device *dev,
+					 unsigned int enabled)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+
+	return regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL2,
+				  PCF85063_CTRL2_AIE,
+				  enabled ? PCF85063_CTRL2_AIE : 0);
+}
+
+static irqreturn_t pcf85063_rtc_handle_irq(int irq, void *dev_id)
+{
+	struct pcf85063 *pcf85063 = dev_id;
+	unsigned int val;
+	int err;
+
+	err = regmap_read(pcf85063->regmap, PCF85063_REG_CTRL2, &val);
+	if (err)
+		return IRQ_NONE;
+
+	if (val & PCF85063_CTRL2_AF) {
+		rtc_update_irq(pcf85063->rtc, 1, RTC_IRQF | RTC_AF);
+		regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL2,
+				   PCF85063_CTRL2_AIE | PCF85063_CTRL2_AF,
+				   0);
+		return IRQ_HANDLED;
+	}
+
+	return IRQ_NONE;
+}
+
+static int pcf85063_read_offset(struct device *dev, long *offset)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	long val;
+	u32 reg;
+	int ret;
+
+	ret = regmap_read(pcf85063->regmap, PCF85063_REG_OFFSET, &reg);
+	if (ret < 0)
+		return ret;
+
+	val = sign_extend32(reg & ~PCF85063_OFFSET_MODE,
+			    PCF85063_OFFSET_SIGN_BIT);
+
+	if (reg & PCF85063_OFFSET_MODE)
+		*offset = val * PCF85063_OFFSET_STEP1;
+	else
+		*offset = val * PCF85063_OFFSET_STEP0;
+
+	return 0;
+}
+
+static int pcf85063_set_offset(struct device *dev, long offset)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	s8 mode0, mode1, reg;
+	unsigned int error0, error1;
+
+	if (offset > PCF85063_OFFSET_STEP0 * 63)
+		return -ERANGE;
+	if (offset < PCF85063_OFFSET_STEP0 * -64)
+		return -ERANGE;
+
+	mode0 = DIV_ROUND_CLOSEST(offset, PCF85063_OFFSET_STEP0);
+	mode1 = DIV_ROUND_CLOSEST(offset, PCF85063_OFFSET_STEP1);
+
+	error0 = abs(offset - (mode0 * PCF85063_OFFSET_STEP0));
+	error1 = abs(offset - (mode1 * PCF85063_OFFSET_STEP1));
+	if (mode1 > 63 || mode1 < -64 || error0 < error1)
+		reg = mode0 & ~PCF85063_OFFSET_MODE;
+	else
+		reg = mode1 | PCF85063_OFFSET_MODE;
+
+	return regmap_write(pcf85063->regmap, PCF85063_REG_OFFSET, reg);
+}
+
+static int pcf85063_ioctl(struct device *dev, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct pcf85063 *pcf85063 = dev_get_drvdata(dev);
+	int status, ret = 0;
+
+	switch (cmd) {
+	case RTC_VL_READ:
+		ret = regmap_read(pcf85063->regmap, PCF85063_REG_SC, &status);
+		if (ret < 0)
+			return ret;
+
+		status = status & PCF85063_REG_SC_OS ? RTC_VL_DATA_INVALID : 0;
+
+		return put_user(status, (unsigned int __user *)arg);
+
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+static const struct rtc_class_ops pcf85063_rtc_ops = {
+	.read_time	= pcf85063_rtc_read_time,
+	.set_time	= pcf85063_rtc_set_time,
+	.read_offset	= pcf85063_read_offset,
+	.set_offset	= pcf85063_set_offset,
+	.ioctl		= pcf85063_ioctl,
+};
+
+static const struct rtc_class_ops pcf85063_rtc_ops_alarm = {
+	.read_time	= pcf85063_rtc_read_time,
+	.set_time	= pcf85063_rtc_set_time,
+	.read_offset	= pcf85063_read_offset,
+	.set_offset	= pcf85063_set_offset,
+	.read_alarm	= pcf85063_rtc_read_alarm,
+	.set_alarm	= pcf85063_rtc_set_alarm,
+	.alarm_irq_enable = pcf85063_rtc_alarm_irq_enable,
+	.ioctl		= pcf85063_ioctl,
+};
+
+static int pcf85063_nvmem_read(void *priv, unsigned int offset,
+			       void *val, size_t bytes)
+{
+	return regmap_read(priv, PCF85063_REG_RAM, val);
+}
+
+static int pcf85063_nvmem_write(void *priv, unsigned int offset,
+				void *val, size_t bytes)
+{
+	return regmap_write(priv, PCF85063_REG_RAM, *(u8 *)val);
+}
+
+static int pcf85063_load_capacitance(struct pcf85063 *pcf85063,
+				     const struct device_node *np,
+				     unsigned int force_cap)
+{
+	u32 load = 7000;
+	u8 reg = 0;
+
+	if (force_cap)
+		load = force_cap;
+	else
+		of_property_read_u32(np, "quartz-load-femtofarads", &load);
+
+	switch (load) {
+	default:
+		dev_warn(&pcf85063->rtc->dev, "Unknown quartz-load-femtofarads value: %d. Assuming 7000",
+			 load);
+		fallthrough;
+	case 7000:
+		break;
+	case 12500:
+		reg = PCF85063_REG_CTRL1_CAP_SEL;
+		break;
+	}
+
+	return regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL1,
+				  PCF85063_REG_CTRL1_CAP_SEL, reg);
+}
+
+#ifdef CONFIG_COMMON_CLK
+/*
+ * Handling of the clkout
+ */
+
+#define clkout_hw_to_pcf85063(_hw) container_of(_hw, struct pcf85063, clkout_hw)
+
+static int clkout_rates[] = {
+	32768,
+	16384,
+	8192,
+	4096,
+	2048,
+	1024,
+	1,
+	0
+};
+
+static unsigned long pcf85063_clkout_recalc_rate(struct clk_hw *hw,
+						 unsigned long parent_rate)
+{
+	struct pcf85063 *pcf85063 = clkout_hw_to_pcf85063(hw);
+	unsigned int buf;
+	int ret = regmap_read(pcf85063->regmap, PCF85063_REG_CTRL2, &buf);
+
+	if (ret < 0)
+		return 0;
+
+	buf &= PCF85063_REG_CLKO_F_MASK;
+	return clkout_rates[buf];
+}
+
+static long pcf85063_clkout_round_rate(struct clk_hw *hw, unsigned long rate,
+				       unsigned long *prate)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(clkout_rates); i++)
+		if (clkout_rates[i] <= rate)
+			return clkout_rates[i];
+
+	return 0;
+}
+
+static int pcf85063_clkout_set_rate(struct clk_hw *hw, unsigned long rate,
+				    unsigned long parent_rate)
+{
+	struct pcf85063 *pcf85063 = clkout_hw_to_pcf85063(hw);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(clkout_rates); i++)
+		if (clkout_rates[i] == rate)
+			return regmap_update_bits(pcf85063->regmap,
+				PCF85063_REG_CTRL2,
+				PCF85063_REG_CLKO_F_MASK, i);
+
+	return -EINVAL;
+}
+
+static int pcf85063_clkout_control(struct clk_hw *hw, bool enable)
+{
+	struct pcf85063 *pcf85063 = clkout_hw_to_pcf85063(hw);
+	unsigned int buf;
+	int ret;
+
+	ret = regmap_read(pcf85063->regmap, PCF85063_REG_OFFSET, &buf);
+	if (ret < 0)
+		return ret;
+	buf &= PCF85063_REG_CLKO_F_MASK;
+
+	if (enable) {
+		if (buf == PCF85063_REG_CLKO_F_OFF)
+			buf = PCF85063_REG_CLKO_F_32768HZ;
+		else
+			return 0;
+	} else {
+		if (buf != PCF85063_REG_CLKO_F_OFF)
+			buf = PCF85063_REG_CLKO_F_OFF;
+		else
+			return 0;
+	}
+
+	return regmap_update_bits(pcf85063->regmap, PCF85063_REG_CTRL2,
+					PCF85063_REG_CLKO_F_MASK, buf);
+}
+
+static int pcf85063_clkout_prepare(struct clk_hw *hw)
+{
+	return pcf85063_clkout_control(hw, 1);
+}
+
+static void pcf85063_clkout_unprepare(struct clk_hw *hw)
+{
+	pcf85063_clkout_control(hw, 0);
+}
+
+static int pcf85063_clkout_is_prepared(struct clk_hw *hw)
+{
+	struct pcf85063 *pcf85063 = clkout_hw_to_pcf85063(hw);
+	unsigned int buf;
+	int ret = regmap_read(pcf85063->regmap, PCF85063_REG_CTRL2, &buf);
+
+	if (ret < 0)
+		return 0;
+
+	return (buf & PCF85063_REG_CLKO_F_MASK) != PCF85063_REG_CLKO_F_OFF;
+}
+
+static const struct clk_ops pcf85063_clkout_ops = {
+	.prepare = pcf85063_clkout_prepare,
+	.unprepare = pcf85063_clkout_unprepare,
+	.is_prepared = pcf85063_clkout_is_prepared,
+	.recalc_rate = pcf85063_clkout_recalc_rate,
+	.round_rate = pcf85063_clkout_round_rate,
+	.set_rate = pcf85063_clkout_set_rate,
+};
+
+static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063)
+{
+	struct clk *clk;
+	struct clk_init_data init;
+	struct device_node *node = pcf85063->rtc->dev.parent->of_node;
+
+	init.name = "pcf85063-clkout";
+	init.ops = &pcf85063_clkout_ops;
+	init.flags = 0;
+	init.parent_names = NULL;
+	init.num_parents = 0;
+	pcf85063->clkout_hw.init = &init;
+
+	/* optional override of the clockname */
+	of_property_read_string(node, "clock-output-names", &init.name);
+
+	/* register the clock */
+	clk = devm_clk_register(&pcf85063->rtc->dev, &pcf85063->clkout_hw);
+
+	if (!IS_ERR(clk))
+		of_clk_add_provider(node, of_clk_src_simple_get, clk);
+
+	return clk;
+}
+#endif
+
+static const struct pcf85063_config pcf85063a_config = {
+	.regmap = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 0x11,
+	},
+	.has_alarms = 1,
+};
+
+static const struct pcf85063_config pcf85063tp_config = {
+	.regmap = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 0x0a,
+	},
+};
+
+static const struct pcf85063_config rv8263_config = {
+	.regmap = {
+		.reg_bits = 8,
+		.val_bits = 8,
+		.max_register = 0x11,
+	},
+	.has_alarms = 1,
+	.force_cap_7000 = 1,
+};
+
+static int pcf85063_probe(struct i2c_client *client)
+{
+	struct pcf85063 *pcf85063;
+	unsigned int tmp;
+	int err;
+	const struct pcf85063_config *config = &pcf85063tp_config;
+	const void *data = of_device_get_match_data(&client->dev);
+	struct nvmem_config nvmem_cfg = {
+		.name = "pcf85063_nvram",
+		.reg_read = pcf85063_nvmem_read,
+		.reg_write = pcf85063_nvmem_write,
+		.type = NVMEM_TYPE_BATTERY_BACKED,
+		.size = 1,
+	};
+
+	dev_dbg(&client->dev, "%s\n", __func__);
+
+	pcf85063 = devm_kzalloc(&client->dev, sizeof(struct pcf85063),
+				GFP_KERNEL);
+	if (!pcf85063)
+		return -ENOMEM;
+
+	if (data)
+		config = data;
+
+	pcf85063->regmap = devm_regmap_init_i2c(client, &config->regmap);
+	if (IS_ERR(pcf85063->regmap))
+		return PTR_ERR(pcf85063->regmap);
+
+	i2c_set_clientdata(client, pcf85063);
+
+	err = regmap_read(pcf85063->regmap, PCF85063_REG_CTRL1, &tmp);
+	if (err) {
+		dev_err(&client->dev, "RTC chip is not present\n");
+		return err;
+	}
+
+	pcf85063->rtc = devm_rtc_allocate_device(&client->dev);
+	if (IS_ERR(pcf85063->rtc))
+		return PTR_ERR(pcf85063->rtc);
+
+	err = pcf85063_load_capacitance(pcf85063, client->dev.of_node,
+					config->force_cap_7000 ? 7000 : 0);
+	if (err < 0)
+		dev_warn(&client->dev, "failed to set xtal load capacitance: %d",
+			 err);
+
+	pcf85063->rtc->ops = &pcf85063_rtc_ops;
+	pcf85063->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000;
+	pcf85063->rtc->range_max = RTC_TIMESTAMP_END_2099;
+	pcf85063->rtc->uie_unsupported = 1;
+
+	if (config->has_alarms && client->irq > 0) {
+		err = devm_request_threaded_irq(&client->dev, client->irq,
+						NULL, pcf85063_rtc_handle_irq,
+						IRQF_TRIGGER_LOW | IRQF_ONESHOT,
+						"pcf85063", pcf85063);
+		if (err) {
+			dev_warn(&pcf85063->rtc->dev,
+				 "unable to request IRQ, alarms disabled\n");
+		} else {
+			pcf85063->rtc->ops = &pcf85063_rtc_ops_alarm;
+			device_init_wakeup(&client->dev, true);
+			err = dev_pm_set_wake_irq(&client->dev, client->irq);
+			if (err)
+				dev_err(&pcf85063->rtc->dev,
+					"failed to enable irq wake\n");
+		}
+	}
+
+	nvmem_cfg.priv = pcf85063->regmap;
+	rtc_nvmem_register(pcf85063->rtc, &nvmem_cfg);
+
+#ifdef CONFIG_COMMON_CLK
+	/* register clk in common clk framework */
+	pcf85063_clkout_register_clk(pcf85063);
+#endif
+
+	return rtc_register_device(pcf85063->rtc);
+}
+
+#ifdef CONFIG_OF
+static const struct of_device_id pcf85063_of_match[] = {
+	{ .compatible = "nxp,pcf85063", .data = &pcf85063tp_config },
+	{ .compatible = "nxp,pcf85063tp", .data = &pcf85063tp_config },
+	{ .compatible = "nxp,pcf85063a", .data = &pcf85063a_config },
+	{ .compatible = "microcrystal,rv8263", .data = &rv8263_config },
+	{}
+};
+MODULE_DEVICE_TABLE(of, pcf85063_of_match);
+#endif
+
+static struct i2c_driver pcf85063_driver = {
+	.driver		= {
+		.name	= "rtc-pcf85063",
+		.of_match_table = of_match_ptr(pcf85063_of_match),
+	},
+	.probe_new	= pcf85063_probe,
+};
+
+module_i2c_driver(pcf85063_driver);
+
+MODULE_AUTHOR("Søren Andersen <san@rosetechnology.dk>");
+MODULE_DESCRIPTION("PCF85063 RTC driver");
+MODULE_LICENSE("GPL");
--- a/feed/kmod.mk
+++ b/feed/kmod.mk
@ -0,0 +1,16 @@
+
+PKG_RELEASE?=1
+
+EXTRA_CFLAGS:= \
+        $(patsubst CONFIG_%, -DCONFIG_%=1, $(patsubst %=m,%,$(filter %=m,$(EXTRA_KCONFIG)))) \
+        $(patsubst CONFIG_%, -DCONFIG_%=1, $(patsubst %=y,%,$(filter %=y,$(EXTRA_KCONFIG)))) \
+
+MAKE_OPTS:= \
+        $(KERNEL_MAKE_FLAGS) \
+        M="$(PKG_BUILD_DIR)" \
+        EXTRA_CFLAGS="$(EXTRA_CFLAGS)" \
+        $(EXTRA_KCONFIG)
+
+define Build/Compile
+	$(MAKE) -C "$(LINUX_DIR)" $(MAKE_OPTS) modules
+endef
--- a/feed/nvme-cli/Makefile
+++ b/feed/nvme-cli/Makefile
@ -0,0 +1,53 @@
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=nvme-cli
+PKG_VERSION:=2.4
+PKG_RELEASE:=1
+
+PKG_SOURCE:=v$(PKG_VERSION).tar.gz
+PKG_SOURCE_URL:=https://github.com/linux-nvme/nvme-cli/archive/refs/tags/
+PKG_HASH:=7f80102a933e3bf46f4f2089cad119c827a363478235f66b89ddaad9ca57d019
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=GPL-2.0
+PKG_LICENSE_FILES:=LICENSE
+
+include $(INCLUDE_DIR)/package.mk
+include $(INCLUDE_DIR)/meson.mk
+
+define Package/$(PKG_NAME)
+  SECTION:=utils
+  CATEGORY:=Utilities
+  URL:=https://nvmexpress.org/
+  TITLE:=NVMe management command line interface
+  DEPENDS += +libopenssl
+endef
+
+define Package/$(PKG_NAME)/description
+  NVM-Express user space tooling for Linux.
+endef
+
+MESON_ARGS += --force-fallback-for=libnvme
+
+define Package/$(PKG_NAME)/conffiles
+	/etc/nvme/discovery.conf
+endef
+
+define Package/$(PKG_NAME)/install
+	$(INSTALL_DIR) $(1)/usr/sbin
+	$(INSTALL_BIN) $(PKG_INSTALL_DIR)/usr/sbin/nvme $(1)/usr/sbin/
+	$(INSTALL_DIR) $(1)/usr/include
+	$(CP) $(PKG_INSTALL_DIR)/usr/include/json-c $(1)/usr/include/
+	$(CP) $(PKG_INSTALL_DIR)/usr/include/nvme $(1)/usr/include/
+	$(CP) $(PKG_INSTALL_DIR)/usr/include/libnvme*.h $(1)/usr/include/
+	$(INSTALL_DIR) $(1)/usr/lib
+	$(CP) $(PKG_INSTALL_DIR)/usr/lib/libjson-c.so $(1)/usr/lib/
+	$(CP) $(PKG_INSTALL_DIR)/usr/lib/lib{nvme,nvme-mi}.so* $(1)/usr/lib/
+	$(INSTALL_DIR) $(1)/etc/nvme
+	$(INSTALL_DATA) $(PKG_INSTALL_DIR)/usr/etc/nvme/discovery.conf \
+		$(1)/etc/nvme
+	$(INSTALL_DIR) $(1)/lib/udev/rules.d
+	$(INSTALL_DATA) $(PKG_INSTALL_DIR)/usr/lib/udev/rules.d/*.rules \
+		$(1)/lib/udev/rules.d
+endef
+
+$(eval $(call BuildPackage,$(PKG_NAME)))
--- a/feed/resolvelib/Makefile
+++ b/feed/resolvelib/Makefile
@ -0,0 +1,40 @@
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+#
+
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=resolvelib
+PKG_VERSION:=0.9.0
+PKG_RELEASE:=2
+
+PYPI_NAME:=$(PKG_NAME)
+PKG_HASH:=40ab05117c3281b1b160105e10075094c5ab118315003c922b77673a365290e1
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=ISC
+
+PKG_BUILD_DEPENDS:=python-setuptools-scm/host python-toml/host
+
+include $(TOPDIR)/feeds/packages/lang/python/pypi.mk
+include $(INCLUDE_DIR)/package.mk
+include $(TOPDIR)/feeds/packages/lang/python/python3-package.mk
+
+define Package/python3-$(PKG_NAME)
+  SECTION:=lang
+  CATEGORY:=Languages
+  SUBMENU:=Python
+  TITLE:=Resolve abstract dependencies into concrete ones
+  URL:=https://github.com/sarugaku/resolvelib
+  DEPENDS:=+python3 +python3-setuptools
+endef
+
+define Package/python3-$(PKG_NAME)/description
+   ResolveLib at the highest level provides a Resolver class that includes
+   dependency resolution logic. You give it some things, and a little
+   information on how it should interact with them, and it will spit out
+   a resolution result.
+endef
+
+$(eval $(call Py3Package,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)-src))
--- a/feed/smmap/Makefile
+++ b/feed/smmap/Makefile
@ -0,0 +1,38 @@
+# This is free software, licensed under the GNU General Public License v2.
+# See /LICENSE for more information.
+#
+
+include $(TOPDIR)/rules.mk
+
+PKG_NAME:=smmap
+PKG_VERSION:=5.0.0
+PKG_RELEASE:=$(AUTORELEASE)
+
+PYPI_NAME:=$(PKG_NAME)
+PKG_HASH:=c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936
+PKG_MAINTAINER:=Lee Miller <lee.miller@tutanota.com>
+PKG_LICENSE:=BSD-3-Clause
+
+include $(TOPDIR)/feeds/packages/lang/python/pypi.mk
+include $(INCLUDE_DIR)/package.mk
+include $(TOPDIR)/feeds/packages/lang/python/python3-package.mk
+
+define Package/python3-$(PKG_NAME)
+  SECTION:=lang
+  CATEGORY:=Languages
+  SUBMENU:=Python
+  TITLE:=A pure Python implementation of a sliding window memory map manager
+  URL:=https://github.com/gitpython-developers/smmap
+  DEPENDS:=+python3 +python3-setuptools
+endef
+
+define Package/python3-$(PKG_NAME)/description
+   Smmap wraps an interface around mmap and tracks the mapped files as well
+   as the amount of clients who use it. If the system runs out of resources,
+   or if a memory limit is reached, it will automatically unload unused maps
+   to allow continued operation.
+endef
+
+$(eval $(call Py3Package,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)))
+$(eval $(call BuildPackage,python3-$(PKG_NAME)-src))
--- a/feeds.conf
+++ b/feeds.conf
@ -1,2 +1,2 @@

-src-link quectel ../feed
+src-link local ../feed
--- a/overlays/dualeth.txt
+++ b/overlays/dualeth.txt
@ -0,0 +1,7 @@
+dtparam=i2c_vc=on
+dtoverlay=i2c-rtc,pcf85063a,i2c_csi_dsi
+
+[all]
+
+[cm4]
+otg_mode=1
--- a/overlays/sensing.txt
+++ b/overlays/sensing.txt
@ -0,0 +1,18 @@
+dtparam=i2c_arm=on
+enable_uart=1
+
+dtoverlay=ed-sdhost
+dtoverlay=i2c-rtc,pcf8563
+
+dtoverlay=spi1-1cs
+dtparam=cs0_pin=18,cs0_spidev=disabled
+dtoverlay=ed-mcp2515-spi1-can0
+dtoverlay=uart2
+dtoverlay=uart3
+dtoverlay=uart4
+dtoverlay=uart5
+gpio=11=op,dl
+[all]
+
+[cm4]
+otg_mode=1
--- a/patches/bootconfig-add.patch
+++ b/patches/bootconfig-add.patch
@ -0,0 +1,10 @@
+--- a/Makefile	2023-05-17 03:13:19.345089802 +0300
+++ b/Makefile	2023-05-17 03:15:55.564872084 +0300
+@@ -22,6 +22,7 @@
+ 	mcopy -i $@.boot cmdline.txt ::
+ 	mcopy -i $@.boot config.txt ::
+ 	mcopy -i $@.boot distroconfig.txt ::
+	mcopy -i $@.boot current.txt ::
+ 	mcopy -i $@.boot $(IMAGE_KERNEL) ::$(KERNEL_IMG)
+ 	$(foreach dts,$(shell echo $(DEVICE_DTS)),mcopy -i $@.boot $(DTS_DIR)/$(dts).dtb ::;)
+ 	mmd -i $@.boot ::/overlays
--- a/patches/overlay-add.patch
+++ b/patches/overlay-add.patch
@ -0,0 +1,11 @@
+--- a/arch/arm64/boot/dts/overlays/Makefile	2023-02-24 04:54:15.978815530 +0200
+++ b/arch/arm64/boot/dts/overlays/Makefile	2023-02-24 04:57:05.155496795 +0200
+@@ -3,6 +3,8 @@
+ dtb-$(CONFIG_ARCH_BCM2835) += overlay_map.dtb
+ 
+ dtbo-$(CONFIG_ARCH_BCM2835) += \
+	ed-mcp2515-spi1-can0.dtbo \
+	ed-sdhost.dtbo \
+ 	act-led.dtbo \
+ 	adafruit18.dtbo \
+ 	adau1977-adc.dtbo \
Author	SHA1	Message	Date
Lee Miller	d56af4d393	Replace packages branch literal by an expression All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-12-01 23:14:04 +02:00
Lee Miller	a3d0b97c8f	Apply optimisations from #30 All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-11-27 05:54:42 +02:00
Lee Miller	42ca3e439b	Remove reverting of the i2c-tools package - fixed upstream All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-11-22 01:28:47 +02:00
Lee Miller	2b5cbe32df	Use EXTRA_IMAGE_NAME Image Builder variable instead of moving All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-11-07 00:25:56 +02:00
Peter Surda	5f8faea941	Add docker config All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-10-26 13:13:43 +08:00
Lee Miller	86890bf436	Update resolvelib package All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-10-24 04:52:07 +03:00
Lee Miller	b5eb13dcb6	Add python3-ansible-core-src package into base PACKAGES	2023-10-24 04:52:05 +03:00
Lee Miller	dca142cb68	Add also GitPython package and its depends	2023-10-24 04:51:25 +03:00
Lee Miller	9cb4cbebf4	Add missing dependencies	2023-10-24 04:51:25 +03:00
Lee Miller	96883344fe	Add a package for ansible-core	2023-10-24 04:51:11 +03:00
Lee Miller	400678c759	Revert i2c-tools to release 1 and make All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-10-19 05:56:35 +03:00
Lee Miller	be26956e3e	Add a PKG_ARCH variable, reuse packages built from feed base Some checks failed buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-08-07 22:26:30 +03:00
Lee Miller	d876323952	Improve Dockerfile: - fully parametrize downloads, - move env parameters down after apt packages install, - add python3-distutils for building without buildbot.	2023-08-07 22:25:56 +03:00
Lee Miller	8fd3afcdb3	Try replacing a revision of packages feed by the branch name All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-08-07 07:29:30 +03:00
Peter Surda	04440d8f55	Add docker to waveshare and bump partition size All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-07-04 16:02:21 +08:00
Lee Miller	aed9b64b71	Make a package for nvme-cli and include it into the waveshare image. All checks were successful buildbot/travis_bionic Build done. Details buildbot/multibuild_parent Build done. Details buildbot/job/openwrt Build done. Details Also move there kmod-nvme.	2023-06-19 14:48:04 +03:00
Lee Miller	dc15708ee0	Bundle asterisk packages only into sensing images All checks were successful buildbot/travis_bionic Build done. Details buildbot/multibuild_parent Build done. Details buildbot/job/openwrt Build done. Details	2023-06-10 19:01:25 +03:00
Lee Miller	0c1230e507	Make a package for kmod-nvme and include it into the image. All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-06-10 02:58:06 +03:00
Lee Miller	98b353fbc7	Introduce kmod.mk for common code building the kernel modules	2023-06-10 02:58:06 +03:00
Lee Miller	45a1c1d736	Make a separate image for waveshare All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-06-08 03:13:48 +03:00
Lee Miller	92ab2ebe7a	In uci-defaults detect a SATA controller and install needed packages	2023-05-30 23:42:55 +03:00
Lee Miller	a3ddbfac67	Exit early if imagebuilder fails All checks were successful buildbot/travis_bionic Build done. Details buildbot/multibuild_parent Build done. Details buildbot/job/openwrt Build done. Details	2023-05-26 01:50:19 +03:00
Lee Miller	afa793799c	Rename feed to local	2023-05-26 01:48:20 +03:00
Lee Miller	78bcedaf60	Cleanup build.sh: - show the manifest existing after make image instead of making a new one; - don't move packages to out - the directory is ignored by multibuild; - don't make package index, because imagebuilder builds it's own.	2023-05-24 17:29:22 +03:00
Lee Miller	604299e7ce	Comment out downloading of toolchain - it's not used	2023-05-24 17:26:05 +03:00
Peter Surda	ffc564720c	Typo All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details - this is self-explanatory, the typo prevented the script from running, this script is needed to reconfigure the default network system, basically swapping eth0 and eth1, so that eth0 is wan and eth1 is lan. This means that if there is only one ethernet interface, it doesn't assign it a fixed IP address and start responding to DHCP requests on it. It was broken for a long time but I didn't have time to investigate until now.	2023-05-23 12:47:00 +08:00
Lee Miller	644618d9c4	Copy additional bootconfig as current.txt All checks were successful buildbot/multibuild_parent Build done. Details buildbot/travis_bionic Build done. Details buildbot/job/openwrt Build done. Details	2023-05-17 18:17:58 +03:00
Lee Miller	0a0851250a	Make packages for kmod-i2c-mux-pinctrl and kmod-rtc-pcf85063 using sources taken from linux-5.10.146 and include them into the image.	2023-05-11 05:55:23 +03:00
Lee Miller	18475fca74	Try enabling standard ds1307 for dualeth	2023-03-03 03:34:56 +02:00
Lee Miller	3c0b752465	Add kmod-i2c-mux to PACKAGES	2023-03-03 03:30:44 +02:00
Lee Miller	af4e1053b6	Work around inability to add files to boot via FILES make parameter	2023-03-03 03:30:44 +02:00
Lee Miller	31107a2ef5	Build a separate image for each bootconfig, add pcf8563 kmod for sensing	2023-03-03 03:30:44 +02:00
Lee Miller	68bee355e2	Add i2c kmod and the files	2023-03-03 03:30:43 +02:00