Files
src/sys/dev/nvmf/host/nvmf_ns.c
T
John Baldwin aacaeeee8e nvmf: Permit failing I/O requests while disconnected
Add a kern.nvmf.fail_on_disconnection sysctl similar to the
kern.iscsi.fail_on_disconnection sysctl.  This causes pending I/O
requests to fail with an error if an association is disconnected
instead of requeueing to be retried once the association is
reconnected.  As with iSCSI, the default is to queue and retry
operations.

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D45308
2024-06-05 12:59:07 -07:00

503 lines
11 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>
/*
 * Per-namespace state for an NVMe over Fabrics host namespace exposed
 * as a character device.
 */
struct nvmf_namespace {
/* Owning controller softc; supplies the device name and I/O queues. */
struct nvmf_softc *sc;
/* Namespace size in bytes (nsze * lba_size). */
uint64_t size;
/* Namespace ID placed in every command issued for this namespace. */
uint32_t id;
/* NVME_NS_*_SUPPORTED capability bits set at init time. */
u_int flags;
/* Bytes per logical block (1 << LBADS of the active LBA format). */
uint32_t lba_size;
/*
 * True between nvmf_disconnect_ns() and nvmf_reconnect_ns(); read
 * and written with 'lock' held.
 */
bool disconnected;
/*
 * Bios parked while disconnected.  They are resubmitted on reconnect
 * and aborted when the namespace is destroyed.
 */
TAILQ_HEAD(, bio) pending_bios;
/* Held around accesses to 'disconnected' and 'pending_bios'. */
struct mtx lock;
/*
 * Reference count of submitted bios, plus one initial reference that
 * is only dropped by nvmf_destroy_ns().
 */
volatile u_int active_bios;
/* Device node created by make_dev_s(); si_drv2 holds its alias. */
struct cdev *cdev;
};
/*
 * Forward declaration: nvmf_ns_biodone() resubmits aborted bios through
 * the strategy routine, which is defined later in the file.
 */
static void nvmf_ns_strategy(struct bio *bio);
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
char buf[128];
struct sbuf sb;
va_list ap;
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
ns->id);
va_start(ap, fmt);
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
sbuf_delete(&sb);
}
/*
* The I/O completion may trigger after the received CQE if the I/O
* used a zero-copy mbuf that isn't harvested until after the NIC
* driver processes TX completions. Abuse bio_driver1 as a refcount.
* Store I/O errors in bio_driver2.
*/
static __inline u_int *
bio_refs(struct bio *bio)
{
return ((u_int *)&bio->bio_driver1);
}
/*
 * Drop one reference on a bio and, once the last reference is gone,
 * finish it.
 *
 * A bio whose bio_error is ECONNABORTED is not completed unless
 * nvmf_fail_disconnect is set: it is queued on pending_bios while the
 * namespace is marked disconnected, or handed straight back to
 * nvmf_ns_strategy() otherwise.  Any other bio is completed with
 * biodone(); an error stored in bio_driver2 overrides bio_error.
 *
 * In every case the active_bios reference taken when the bio was
 * submitted is released afterwards, and threads sleeping on 'ns' are
 * woken when that release reports the final reference was dropped.
 */
static void
nvmf_ns_biodone(struct bio *bio)
{
struct nvmf_namespace *ns;
int error;
/* Nothing to do until the last reference on the bio is dropped. */
if (!refcount_release(bio_refs(bio)))
return;
ns = bio->bio_dev->si_drv1;
/* If a request is aborted, resubmit or queue it for resubmission. */
if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
/* Clear the error state left by the aborted attempt. */
bio->bio_error = 0;
bio->bio_driver2 = 0;
mtx_lock(&ns->lock);
if (ns->disconnected) {
/*
 * NOTE(review): nvmf_fail_disconnect is tested a second time
 * here; presumably this covers the setting changing after the
 * outer test -- confirm.
 */
if (nvmf_fail_disconnect) {
mtx_unlock(&ns->lock);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
} else {
/* Park the bio until nvmf_reconnect_ns() resubmits it. */
TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
bio_queue);
mtx_unlock(&ns->lock);
}
} else {
/* Still connected: resubmit right away, outside the lock. */
mtx_unlock(&ns->lock);
nvmf_ns_strategy(bio);
}
} else {
/*
 * I/O errors take precedence over generic EIO from
 * CQE errors.
 */
error = (intptr_t)bio->bio_driver2;
if (error != 0)
bio->bio_error = error;
if (bio->bio_error != 0)
bio->bio_flags |= BIO_ERROR;
biodone(bio);
}
/*
 * Release the reference acquired at submission.  'bio' is not
 * touched past this point; only 'ns' is used.
 *
 * NOTE(review): wakeup() is issued without ns->lock held, while
 * nvmf_destroy_ns() tests active_bios under the lock before calling
 * mtx_sleep() -- confirm a wakeup cannot be missed in that window.
 */
if (refcount_release(&ns->active_bios))
wakeup(ns);
}
/*
 * Data-transfer completion callback for BIO_READ and BIO_WRITE
 * requests.  Stores the transfer error in bio_driver2 and the number
 * of bytes not transferred in bio_resid, then drops this event's
 * reference on the bio.
 */
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp;

	bp = arg;
	KASSERT(xfered <= bp->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bp->bio_resid = bp->bio_bcount - xfered;
	bp->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bp);
}
/*
 * Data-transfer completion callback for BIO_DELETE requests.  Frees
 * the DSM range buffer that was stashed in bio_driver2, replaces it
 * with the transfer error, and reports the whole request as residual
 * on failure (nothing residual on success) before dropping this
 * event's reference on the bio.
 */
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp;
	void *range;

	bp = arg;

	/* Swap the range buffer out of bio_driver2 for the error code. */
	range = bp->bio_driver2;
	bp->bio_driver2 = (void *)(intptr_t)error;
	free(range, M_NVMF);

	bp->bio_resid = (error != 0) ? bp->bio_bcount : 0;

	nvmf_ns_biodone(bp);
}
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
struct bio *bio = arg;
if (nvmf_cqe_aborted(cqe))
bio->bio_error = ECONNABORTED;
else if (cqe->status != 0)
bio->bio_error = EIO;
nvmf_ns_biodone(bio);
}
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
struct nvme_command cmd;
struct nvmf_request *req;
struct nvme_dsm_range *dsm_range;
struct memdesc mem;
uint64_t lba, lba_count;
int error;
dsm_range = NULL;
memset(&cmd, 0, sizeof(cmd));
switch (bio->bio_cmd) {
case BIO_READ:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_WRITE:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_FLUSH:
nvme_ns_flush_cmd(&cmd, ns->id);
break;
case BIO_DELETE:
dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
M_ZERO);
if (dsm_range == NULL)
return (ENOMEM);
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
dsm_range->starting_lba = htole64(lba);
dsm_range->length = htole32(lba_count);
cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
cmd.nsid = htole32(ns->id);
cmd.cdw10 = htole32(0); /* 1 range */
cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
break;
default:
return (EOPNOTSUPP);
}
mtx_lock(&ns->lock);
if (ns->disconnected) {
if (nvmf_fail_disconnect) {
error = ECONNABORTED;
} else {
TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
error = 0;
}
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (error);
}
req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
nvmf_ns_bio_complete, bio, M_NOWAIT);
if (req == NULL) {
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (ENOMEM);
}
switch (bio->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_bio(bio);
nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
break;
case BIO_DELETE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
true, nvmf_ns_delete_complete, bio);
bio->bio_driver2 = dsm_range;
break;
default:
refcount_init(bio_refs(bio), 1);
KASSERT(bio->bio_resid == 0,
("%s: input bio_resid != 0", __func__));
break;
}
refcount_acquire(&ns->active_bios);
nvmf_submit_request(req);
mtx_unlock(&ns->lock);
return (0);
}
/*
 * ioctl handler for a namespace device node.
 *
 * Supports NVMe passthrough commands (forced to this namespace's ID),
 * NVME_GET_NSID, and the DIOCGMEDIASIZE / DIOCGSECTORSIZE disk
 * queries.  Any other request yields ENOTTY.
 */
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns;
	struct nvme_pt_command *pt;
	struct nvme_get_nsid *gnsid;
	int error;

	ns = dev->si_drv1;
	error = 0;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		/* Always target this namespace, whatever the caller set. */
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		error = nvmf_passthrough_cmd(ns->sc, pt, false);
		break;
	case NVME_GET_NSID:
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = ns->id;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		break;
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}
/*
 * Open handler for a namespace device node.  Opens that do not request
 * write access always succeed; opens for writing return the result of
 * securelevel_gt() for the caller's credentials against level 0.
 */
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	if ((oflags & FWRITE) == 0)
		return (0);

	return (securelevel_gt(td->td_ucred, 0));
}
/*
 * Strategy routine: try to submit the bio and, if submission reports
 * an error, complete the bio immediately with that error and the full
 * byte count as residual.
 */
void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns = bio->bio_dev->si_drv1;
	int error;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error == 0)
		return;

	/* The bio was not consumed; fail it here. */
	bio->bio_resid = bio->bio_bcount;
	bio->bio_flags |= BIO_ERROR;
	bio->bio_error = error;
	biodone(bio);
}
/*
 * Character-device switch for namespace device nodes, flagged as a
 * disk device.
 */
static struct cdevsw nvmf_ns_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_DISK,

	/* Control paths. */
	.d_open =	nvmf_ns_open,
	.d_ioctl =	nvmf_ns_ioctl,

	/* Data paths. */
	.d_read =	physread,
	.d_write =	physwrite,
	.d_strategy =	nvmf_ns_strategy,
};
/*
 * Create the state and device node for namespace 'id' of controller
 * 'sc' from its Identify Namespace data.
 *
 * Namespaces that use end-to-end data protection, per-block metadata,
 * or an invalid or unsupported LBA format are rejected with a
 * diagnostic.  On success the namespace is exposed as
 * "<nameunit>n<id>" and an alias "<nameunit>ns<id>" is requested.
 *
 * Returns the new namespace, or NULL if the namespace is unsupported
 * or the device node could not be created.
 */
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    const struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* One dummy bio avoids dropping to 0 until destroy. */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	/*
	 * The exponent comes from the controller.  lba_size is a 32-bit
	 * quantity, and shifting by the width of the type or more is
	 * undefined, so reject anything that cannot be represented.
	 */
	if (lbads > 31) {
		ns_printf(ns, "Unsupported LBA data size\n");
		goto fail;
	}

	ns->lba_size = (uint32_t)1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;

	/* Remember the alias in si_drv2 so destroy can remove it too. */
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}
/*
 * Mark the namespace as disconnected.  The flag is set with the
 * namespace lock held; the submission and completion paths test it
 * under the same lock to decide whether to park or fail a bio.
 */
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{

	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
mtx_lock(&ns->lock);
ns->disconnected = false;
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
nvmf_ns_strategy(bio);
}
}
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
if (ns->cdev->si_drv2 != NULL)
destroy_dev(ns->cdev->si_drv2);
destroy_dev(ns->cdev);
/*
* Wait for active I/O requests to drain. The release drops
* the reference on the "dummy bio" when the namespace is
* created.
*/
mtx_lock(&ns->lock);
if (!refcount_release(&ns->active_bios)) {
while (ns->active_bios != 0)
mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
}
/* Abort any pending I/O requests. */
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
}
mtx_destroy(&ns->lock);
free(ns, M_NVMF);
}
bool
nvmf_update_ns(struct nvmf_namespace *ns,
const struct nvme_namespace_data *data)
{
uint8_t lbads, lbaf;
if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
ns_printf(ns, "End-to-end data protection not supported\n");
return (false);
}
lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
if (lbaf > data->nlbaf) {
ns_printf(ns, "Invalid LBA format index\n");
return (false);
}
if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
ns_printf(ns, "Namespaces with metadata are not supported\n");
return (false);
}
lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
if (lbads == 0) {
ns_printf(ns, "Invalid LBA format index\n");
return (false);
}
ns->lba_size = 1 << lbads;
ns->size = data->nsze * ns->lba_size;
return (true);
}