mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-16 15:11:52 +00:00
nvme: do not revert o single I/O queue when per-CPU queues not possible
Previously nvme(4) would revert to a signle I/O queue if it could not allocate enought interrupt vectors or NVMe submission/completion queues to have one I/O queue per core. This patch determines how to utilize a smaller number of available interrupt vectors, and assigns (as closely as possible) an equal number of cores to each associated I/O queue. MFC after: 3 days Sponsored by: Intel
This commit is contained in:
parent
d400f790b1
commit
2b647da7a0
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=293328
@ -270,8 +270,6 @@ nvme_attach(device_t dev)
|
||||
return (status);
|
||||
}
|
||||
|
||||
nvme_sysctl_initialize_ctrlr(ctrlr);
|
||||
|
||||
pci_enable_busmaster(dev);
|
||||
|
||||
ctrlr->config_hook.ich_func = nvme_ctrlr_start_config_hook;
|
||||
|
@ -42,6 +42,12 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
#include "nvme_private.h"
|
||||
|
||||
/*
|
||||
* Used for calculating number of CPUs to assign to each core and number of I/O
|
||||
* queues to allocate per controller.
|
||||
*/
|
||||
#define NVME_CEILING(num, div) ((((num) - 1) / (div)) + 1)
|
||||
|
||||
static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
|
||||
struct nvme_async_event_request *aer);
|
||||
static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr);
|
||||
@ -141,6 +147,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
|
||||
*/
|
||||
num_trackers = min(num_trackers, (num_entries-1));
|
||||
|
||||
/*
|
||||
* This was calculated previously when setting up interrupts, but
|
||||
* a controller could theoretically support fewer I/O queues than
|
||||
* MSI-X vectors. So calculate again here just to be safe.
|
||||
*/
|
||||
ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, ctrlr->num_io_queues);
|
||||
|
||||
ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
|
||||
M_NVME, M_ZERO | M_WAITOK);
|
||||
|
||||
@ -161,8 +174,13 @@ nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
|
||||
num_trackers,
|
||||
ctrlr);
|
||||
|
||||
/*
|
||||
* Do not bother binding interrupts if we only have one I/O
|
||||
* interrupt thread for this controller.
|
||||
*/
|
||||
if (ctrlr->num_io_queues > 1)
|
||||
bus_bind_intr(ctrlr->dev, qpair->res, i);
|
||||
bus_bind_intr(ctrlr->dev, qpair->res,
|
||||
i * ctrlr->num_cpus_per_ioq);
|
||||
}
|
||||
|
||||
return (0);
|
||||
@ -307,8 +325,15 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
|
||||
int i;
|
||||
|
||||
nvme_admin_qpair_disable(&ctrlr->adminq);
|
||||
for (i = 0; i < ctrlr->num_io_queues; i++)
|
||||
nvme_io_qpair_disable(&ctrlr->ioq[i]);
|
||||
/*
|
||||
* I/O queues are not allocated before the initial HW
|
||||
* reset, so do not try to disable them. Use is_initialized
|
||||
* to determine if this is the initial HW reset.
|
||||
*/
|
||||
if (ctrlr->is_initialized) {
|
||||
for (i = 0; i < ctrlr->num_io_queues; i++)
|
||||
nvme_io_qpair_disable(&ctrlr->ioq[i]);
|
||||
}
|
||||
|
||||
DELAY(100*1000);
|
||||
|
||||
@ -364,7 +389,7 @@ static int
|
||||
nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
|
||||
{
|
||||
struct nvme_completion_poll_status status;
|
||||
int cq_allocated, i, sq_allocated;
|
||||
int cq_allocated, sq_allocated;
|
||||
|
||||
status.done = FALSE;
|
||||
nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
|
||||
@ -385,25 +410,12 @@ nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
|
||||
cq_allocated = (status.cpl.cdw0 >> 16) + 1;
|
||||
|
||||
/*
|
||||
* Check that the controller was able to allocate the number of
|
||||
* queues we requested. If not, revert to one IO queue pair.
|
||||
* Controller may allocate more queues than we requested,
|
||||
* so use the minimum of the number requested and what was
|
||||
* actually allocated.
|
||||
*/
|
||||
if (sq_allocated < ctrlr->num_io_queues ||
|
||||
cq_allocated < ctrlr->num_io_queues) {
|
||||
|
||||
/*
|
||||
* Destroy extra IO queue pairs that were created at
|
||||
* controller construction time but are no longer
|
||||
* needed. This will only happen when a controller
|
||||
* supports fewer queues than MSI-X vectors. This
|
||||
* is not the normal case, but does occur with the
|
||||
* Chatham prototype board.
|
||||
*/
|
||||
for (i = 1; i < ctrlr->num_io_queues; i++)
|
||||
nvme_io_qpair_destroy(&ctrlr->ioq[i]);
|
||||
|
||||
ctrlr->num_io_queues = 1;
|
||||
}
|
||||
ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
|
||||
ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
|
||||
|
||||
return (0);
|
||||
}
|
||||
@ -687,9 +699,20 @@ static void
|
||||
nvme_ctrlr_start(void *ctrlr_arg)
|
||||
{
|
||||
struct nvme_controller *ctrlr = ctrlr_arg;
|
||||
uint32_t old_num_io_queues;
|
||||
int i;
|
||||
|
||||
nvme_qpair_reset(&ctrlr->adminq);
|
||||
/*
|
||||
* Only reset adminq here when we are restarting the
|
||||
* controller after a reset. During initialization,
|
||||
* we have already submitted admin commands to get
|
||||
* the number of I/O queues supported, so cannot reset
|
||||
* the adminq again here.
|
||||
*/
|
||||
if (ctrlr->is_resetting) {
|
||||
nvme_qpair_reset(&ctrlr->adminq);
|
||||
}
|
||||
|
||||
for (i = 0; i < ctrlr->num_io_queues; i++)
|
||||
nvme_qpair_reset(&ctrlr->ioq[i]);
|
||||
|
||||
@ -700,11 +723,25 @@ nvme_ctrlr_start(void *ctrlr_arg)
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The number of qpairs are determined during controller initialization,
|
||||
* including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
|
||||
* HW limit. We call SET_FEATURES again here so that it gets called
|
||||
* after any reset for controllers that depend on the driver to
|
||||
* explicit specify how many queues it will use. This value should
|
||||
* never change between resets, so panic if somehow that does happen.
|
||||
*/
|
||||
old_num_io_queues = ctrlr->num_io_queues;
|
||||
if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
|
||||
nvme_ctrlr_fail(ctrlr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (old_num_io_queues != ctrlr->num_io_queues) {
|
||||
panic("num_io_queues changed from %u to %u", old_num_io_queues,
|
||||
ctrlr->num_io_queues);
|
||||
}
|
||||
|
||||
if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
|
||||
nvme_ctrlr_fail(ctrlr);
|
||||
return;
|
||||
@ -727,7 +764,16 @@ nvme_ctrlr_start_config_hook(void *arg)
|
||||
{
|
||||
struct nvme_controller *ctrlr = arg;
|
||||
|
||||
nvme_ctrlr_start(ctrlr);
|
||||
nvme_qpair_reset(&ctrlr->adminq);
|
||||
nvme_admin_qpair_enable(&ctrlr->adminq);
|
||||
|
||||
if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
|
||||
nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
|
||||
nvme_ctrlr_start(ctrlr);
|
||||
else
|
||||
nvme_ctrlr_fail(ctrlr);
|
||||
|
||||
nvme_sysctl_initialize_ctrlr(ctrlr);
|
||||
config_intrhook_disestablish(&ctrlr->config_hook);
|
||||
|
||||
ctrlr->is_initialized = 1;
|
||||
@ -780,6 +826,7 @@ nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
|
||||
|
||||
ctrlr->msix_enabled = 0;
|
||||
ctrlr->num_io_queues = 1;
|
||||
ctrlr->num_cpus_per_ioq = mp_ncpus;
|
||||
ctrlr->rid = 0;
|
||||
ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
|
||||
&ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);
|
||||
@ -932,6 +979,7 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
|
||||
device_t dev;
|
||||
int per_cpu_io_queues;
|
||||
int num_vectors_requested, num_vectors_allocated;
|
||||
int num_vectors_available;
|
||||
|
||||
dev = ctrlr->dev;
|
||||
per_cpu_io_queues = 1;
|
||||
@ -940,52 +988,55 @@ nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
|
||||
ctrlr->force_intx = 0;
|
||||
TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);
|
||||
|
||||
if (ctrlr->force_intx || pci_msix_count(dev) < 2) {
|
||||
/*
|
||||
* FreeBSD currently cannot allocate more than about 190 vectors at
|
||||
* boot, meaning that systems with high core count and many devices
|
||||
* requesting per-CPU interrupt vectors will not get their full
|
||||
* allotment. So first, try to allocate as many as we may need to
|
||||
* understand what is available, then immediately release them.
|
||||
* Then figure out how many of those we will actually use, based on
|
||||
* assigning an equal number of cores to each I/O queue.
|
||||
*/
|
||||
|
||||
/* One vector for per core I/O queue, plus one vector for admin queue. */
|
||||
num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1);
|
||||
if (pci_alloc_msix(dev, &num_vectors_available) != 0) {
|
||||
num_vectors_available = 0;
|
||||
}
|
||||
pci_release_msi(dev);
|
||||
|
||||
if (ctrlr->force_intx || num_vectors_available < 2) {
|
||||
nvme_ctrlr_configure_intx(ctrlr);
|
||||
return;
|
||||
}
|
||||
|
||||
ctrlr->msix_enabled = 1;
|
||||
|
||||
if (per_cpu_io_queues)
|
||||
ctrlr->num_io_queues = mp_ncpus;
|
||||
ctrlr->num_cpus_per_ioq = NVME_CEILING(mp_ncpus, num_vectors_available + 1);
|
||||
else
|
||||
ctrlr->num_io_queues = 1;
|
||||
ctrlr->num_cpus_per_ioq = mp_ncpus;
|
||||
|
||||
/* One vector per IO queue, plus one vector for admin queue. */
|
||||
ctrlr->num_io_queues = NVME_CEILING(mp_ncpus, ctrlr->num_cpus_per_ioq);
|
||||
num_vectors_requested = ctrlr->num_io_queues + 1;
|
||||
|
||||
if (pci_msix_count(dev) < num_vectors_requested) {
|
||||
ctrlr->num_io_queues = 1;
|
||||
num_vectors_requested = 2; /* one for admin, one for I/O */
|
||||
}
|
||||
|
||||
num_vectors_allocated = num_vectors_requested;
|
||||
|
||||
/*
|
||||
* Now just allocate the number of vectors we need. This should
|
||||
* succeed, since we previously called pci_alloc_msix()
|
||||
* successfully returning at least this many vectors, but just to
|
||||
* be safe, if something goes wrong just revert to INTx.
|
||||
*/
|
||||
if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) {
|
||||
nvme_ctrlr_configure_intx(ctrlr);
|
||||
return;
|
||||
}
|
||||
|
||||
if (num_vectors_allocated < num_vectors_requested) {
|
||||
if (num_vectors_allocated < 2) {
|
||||
pci_release_msi(dev);
|
||||
nvme_ctrlr_configure_intx(ctrlr);
|
||||
return;
|
||||
}
|
||||
|
||||
ctrlr->num_io_queues = 1;
|
||||
/*
|
||||
* Release whatever vectors were allocated, and just
|
||||
* reallocate the two needed for the admin and single
|
||||
* I/O qpair.
|
||||
*/
|
||||
num_vectors_allocated = 2;
|
||||
pci_release_msi(dev);
|
||||
if (pci_alloc_msix(dev, &num_vectors_allocated) != 0)
|
||||
panic("could not reallocate any vectors\n");
|
||||
if (num_vectors_allocated != 2)
|
||||
panic("could not reallocate 2 vectors\n");
|
||||
nvme_ctrlr_configure_intx(ctrlr);
|
||||
return;
|
||||
}
|
||||
|
||||
ctrlr->msix_enabled = 1;
|
||||
}
|
||||
|
||||
int
|
||||
@ -1034,10 +1085,6 @@ nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
|
||||
|
||||
ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
|
||||
nvme_ctrlr_construct_admin_qpair(ctrlr);
|
||||
status = nvme_ctrlr_construct_io_qpairs(ctrlr);
|
||||
|
||||
if (status != 0)
|
||||
return (status);
|
||||
|
||||
ctrlr->cdev = make_dev(&nvme_ctrlr_cdevsw, device_get_unit(dev),
|
||||
UID_ROOT, GID_WHEEL, 0600, "nvme%d", device_get_unit(dev));
|
||||
@ -1149,11 +1196,7 @@ nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
|
||||
{
|
||||
struct nvme_qpair *qpair;
|
||||
|
||||
if (ctrlr->num_io_queues > 1)
|
||||
qpair = &ctrlr->ioq[curcpu];
|
||||
else
|
||||
qpair = &ctrlr->ioq[0];
|
||||
|
||||
qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq];
|
||||
nvme_qpair_submit_request(qpair, req);
|
||||
}
|
||||
|
||||
|
@ -265,6 +265,7 @@ struct nvme_controller {
|
||||
uint32_t enable_aborts;
|
||||
|
||||
uint32_t num_io_queues;
|
||||
uint32_t num_cpus_per_ioq;
|
||||
|
||||
/* Fields for tracking progress during controller initialization. */
|
||||
struct intr_config_hook config_hook;
|
||||
|
Loading…
Reference in New Issue
Block a user