nvme: do not revert to single I/O queue when per-CPU queues not possible

Previously nvme(4) would revert to a single I/O queue if it could not allocate enough interrupt vectors or NVMe submission/completion queues to have one I/O queue per core. This patch determines how to utilize a smaller number of available interrupt vectors, and assigns (as closely as possible) an equal number of cores to each associated I/O queue. MFC after: 3 days Sponsored by: Intel
svn path=/head/; revision=293328
2025-01-16 15:11:52 +00:00 · 2016-01-07 16:18:32 +00:00 · 2016-01-07 16:18:32 +00:00 · 2b647da7a0 · 2020-12-20 02:59:44 +00:00
commit 2b647da7a0
parent d400f790b1
3 changed files with 106 additions and 64 deletions
--- a/sys/dev/nvme/nvme.c
+++ b/sys/dev/nvme/nvme.c
@ -270,8 +270,6 @@ nvme_attach(device_t dev)
 		return (status);
 	}

-	nvme_sysctl_initialize_ctrlr(ctrlr);
-
 	pci_enable_busmaster(dev);

 	ctrlr->config_hook.ich_func = nvme_ctrlr_start_config_hook;
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@ -42,6 +42,12 @@ __FBSDID("$FreeBSD$");

 #include "nvme_private.h"

+/*
+ * Used for calculating number of CPUs to assign to each I/O queue and number of I/O
+ *  queues to allocate per controller.
+ */
+#define NVME_CEILING(num, div)	((((num) - 1) / (div)) + 1)
+
 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 						struct nvme_async_event_request *aer);
 static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr);
@ -141,6 +147,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 	 */
 	num_trackers = min(num_trackers, (num_entries-1));

+	/*
+	 * This was calculated previously when setting up interrupts, but
+	 *  a controller could theoretically support fewer I/O queues than
+	 *  MSI-X vectors.  So calculate again here just to be safe.
+	 */
+	ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, ctrlr->num_io_queues);
+
 	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
 	    M_NVME, M_ZERO | M_WAITOK);

@ -161,8 +174,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 				     num_trackers,
 				     ctrlr);

+		/*
+		 * Do not bother binding interrupts if we only have one I/O
+		 *  interrupt thread for this controller.
+		 */
 		if (ctrlr->num_io_queues > 1)
-			bus_bind_intr(ctrlr->dev, qpair->res, i);
+			bus_bind_intr(ctrlr->dev, qpair->res,
+			    i * ctrlr->num_cpus_per_ioq);
 	}

 	return (0);
@ -307,8 +325,15 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 	int i;

 	nvme_admin_qpair_disable(&ctrlr->adminq);
-	for (i = 0; i < ctrlr->num_io_queues; i++)
-		nvme_io_qpair_disable(&ctrlr->ioq[i]);
+	/*
+	 * I/O queues are not allocated before the initial HW
+	 *  reset, so do not try to disable them.  Use is_initialized
+	 *  to determine if this is the initial HW reset.
+	 */
+	if (ctrlr->is_initialized) {
+		for (i = 0; i < ctrlr->num_io_queues; i++)
+			nvme_io_qpair_disable(&ctrlr->ioq[i]);
+	}

 	DELAY(100*1000);

@ -364,7 +389,7 @@ static int
 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
-	int					cq_allocated, i, sq_allocated;
+	int					cq_allocated, sq_allocated;

 	status.done = FALSE;
 	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
@ -385,25 +410,12 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
 	cq_allocated = (status.cpl.cdw0 >> 16) + 1;

 	/*
-	 * Check that the controller was able to allocate the number of
-	 *  queues we requested.  If not, revert to one IO queue pair.
+	 * Controller may allocate more queues than we requested,
+	 *  so use the minimum of the number requested and what was
+	 *  actually allocated.
 	 */
-	if (sq_allocated < ctrlr->num_io_queues ||
-	    cq_allocated < ctrlr->num_io_queues) {
-
-		/*
-		 * Destroy extra IO queue pairs that were created at
-		 *  controller construction time but are no longer
-		 *  needed.  This will only happen when a controller
-		 *  supports fewer queues than MSI-X vectors.  This
-		 *  is not the normal case, but does occur with the
-		 *  Chatham prototype board.
-		 */
-		for (i = 1; i < ctrlr->num_io_queues; i++)
-			nvme_io_qpair_destroy(&ctrlr->ioq[i]);
-
-		ctrlr->num_io_queues = 1;
-	}
+	ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
+	ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);

 	return (0);
 }
@ -687,9 +699,20 @@ static void
 nvme_ctrlr_start(void *ctrlr_arg)
 {
 	struct nvme_controller *ctrlr = ctrlr_arg;
+	uint32_t old_num_io_queues;
 	int i;

-	nvme_qpair_reset(&ctrlr->adminq);
+	/*
+	 * Only reset adminq here when we are restarting the
+	 *  controller after a reset.  During initialization,
+	 *  we have already submitted admin commands to get
+	 *  the number of I/O queues supported, so cannot reset
+	 *  the adminq again here.
+	 */
+	if (ctrlr->is_resetting) {
+		nvme_qpair_reset(&ctrlr->adminq);
+	}
+
 	for (i = 0; i < ctrlr->num_io_queues; i++)
 		nvme_qpair_reset(&ctrlr->ioq[i]);

@ -700,11 +723,25 @@ nvme_ctrlr_start(void *ctrlr_arg)
 		return;
 	}

+	/*
+	 * The number of qpairs are determined during controller initialization,
+	 *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
+	 *  HW limit.  We call SET_FEATURES again here so that it gets called
+	 *  after any reset for controllers that depend on the driver to
+	 *  explicitly specify how many queues it will use.  This value should
+	 *  never change between resets, so panic if somehow that does happen.
+	 */
+	old_num_io_queues = ctrlr->num_io_queues;
 	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}

+	if (old_num_io_queues != ctrlr->num_io_queues) {
+		panic("num_io_queues changed from %u to %u", old_num_io_queues,
+		    ctrlr->num_io_queues);
+	}
+
 	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
@ -727,7 +764,16 @@ nvme_ctrlr_start_config_hook(void *arg)
 {
 	struct nvme_controller *ctrlr = arg;

-	nvme_ctrlr_start(ctrlr);
+	nvme_qpair_reset(&ctrlr->adminq);
+	nvme_admin_qpair_enable(&ctrlr->adminq);
+
+	if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
+	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
+		nvme_ctrlr_start(ctrlr);
+	else
+		nvme_ctrlr_fail(ctrlr);
+
+	nvme_sysctl_initialize_ctrlr(ctrlr);
 	config_intrhook_disestablish(&ctrlr->config_hook);

 	ctrlr->is_initialized = 1;
@ -780,6 +826,7 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)

 	ctrlr->msix_enabled = 0;
 	ctrlr->num_io_queues = 1;
+	ctrlr->num_cpus_per_ioq = mp_ncpus;
 	ctrlr->rid = 0;
 	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
 	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);
@ -932,6 +979,7 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
 	device_t	dev;
 	int		per_cpu_io_queues;
 	int		num_vectors_requested, num_vectors_allocated;
+	int		num_vectors_available;

 	dev = ctrlr->dev;
 	per_cpu_io_queues = 1;
@ -940,52 +988,55 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
 	ctrlr->force_intx = 0;
 	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);

-	if (ctrlr->force_intx || pci_msix_count(dev) < 2) {
+	/*
+	 * FreeBSD currently cannot allocate more than about 190 vectors at
+	 *  boot, meaning that systems with high core count and many devices
+	 *  requesting per-CPU interrupt vectors will not get their full
+	 *  allotment.  So first, try to allocate as many as we may need to
+	 *  understand what is available, then immediately release them.
+	 *  Then figure out how many of those we will actually use, based on
+	 *  assigning an equal number of cores to each I/O queue.
+	 */
+
+	/* One vector for per core I/O queue, plus one vector for admin queue. */
+	num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1);
+	if (pci_alloc_msix(dev, &num_vectors_available) != 0) {
+		num_vectors_available = 0;
+	}
+	pci_release_msi(dev);
+
+	if (ctrlr->force_intx || num_vectors_available < 2) {
 		nvme_ctrlr_configure_intx(ctrlr);
 		return;
 	}

-	ctrlr->msix_enabled = 1;
-
 	if (per_cpu_io_queues)
-		ctrlr->num_io_queues = mp_ncpus;
+		ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, num_vectors_available + 1);
 	else
-		ctrlr->num_io_queues = 1;
+		ctrlr->num_cpus_per_ioq = mp_ncpus;

-	/* One vector per IO queue, plus one vector for admin queue. */
+	ctrlr->num_io_queues = NVME_CEILING(mp_ncpus, ctrlr->num_cpus_per_ioq);
 	num_vectors_requested = ctrlr->num_io_queues + 1;
-
-	if (pci_msix_count(dev) < num_vectors_requested) {
-		ctrlr->num_io_queues = 1;
-		num_vectors_requested = 2; /* one for admin, one for I/O */
-	}
-
 	num_vectors_allocated = num_vectors_requested;
+
+	/*
+	 * Now just allocate the number of vectors we need.  This should
+	 *  succeed, since we previously called pci_alloc_msix()
+	 *  successfully returning at least this many vectors, but just to
+	 *  be safe, if something goes wrong just revert to INTx.
+	 */
 	if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) {
 		nvme_ctrlr_configure_intx(ctrlr);
 		return;
 	}

 	if (num_vectors_allocated < num_vectors_requested) {
-		if (num_vectors_allocated < 2) {
-			pci_release_msi(dev);
-			nvme_ctrlr_configure_intx(ctrlr);
-			return;
-		}
-
-		ctrlr->num_io_queues = 1;
-		/*
-		 * Release whatever vectors were allocated, and just
-		 *  reallocate the two needed for the admin and single
-		 *  I/O qpair.
-		 */
-		num_vectors_allocated = 2;
 		pci_release_msi(dev);
-		if (pci_alloc_msix(dev, &num_vectors_allocated) != 0)
-			panic("could not reallocate any vectors\n");
-		if (num_vectors_allocated != 2)
-			panic("could not reallocate 2 vectors\n");
+		nvme_ctrlr_configure_intx(ctrlr);
+		return;
 	}
+
+	ctrlr->msix_enabled = 1;
 }

 int
@ -1034,10 +1085,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)

 	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
 	nvme_ctrlr_construct_admin_qpair(ctrlr);
-	status = nvme_ctrlr_construct_io_qpairs(ctrlr);
-
-	if (status != 0)
-		return (status);

 	ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, device_get_unit(dev),
 	    UID_ROOT, GID_WHEEL, 0600, "nvme%d", device_get_unit(dev));
@ -1149,11 +1196,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 {
 	struct nvme_qpair       *qpair;

-	if (ctrlr->num_io_queues > 1)
-		qpair = &ctrlr->ioq[curcpu];
-	else
-		qpair = &ctrlr->ioq[0];
-
+	qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq];
 	nvme_qpair_submit_request(qpair, req);
 }

--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@ -265,6 +265,7 @@ struct nvme_controller {
 	uint32_t		enable_aborts;

 	uint32_t		num_io_queues;
+	uint32_t		num_cpus_per_ioq;

 	/* Fields for tracking progress during controller initialization. */
 	struct intr_config_hook	config_hook;