aacaeeee8e
Add a kern.nvmf.fail_on_disconnection sysctl similar to the kern.iscsi.fail_on_disconnection sysctl. This causes pending I/O requests to fail with an error if an association is disconnected instead of requeueing to be retried once the association is reconnected. As with iSCSI, the default is to queue and retry operations. Reviewed by: imp Sponsored by: Chelsio Communications Differential Revision: https://reviews.freebsd.org/D45308
503 lines
11 KiB
C
503 lines
11 KiB
C
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*
|
|
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
|
|
* Written by: John Baldwin <jhb@FreeBSD.org>
|
|
*/
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/bio.h>
|
|
#include <sys/bus.h>
|
|
#include <sys/conf.h>
|
|
#include <sys/disk.h>
|
|
#include <sys/fcntl.h>
|
|
#include <sys/lock.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/memdesc.h>
|
|
#include <sys/mutex.h>
|
|
#include <sys/proc.h>
|
|
#include <sys/refcount.h>
|
|
#include <sys/sbuf.h>
|
|
#include <machine/stdarg.h>
|
|
#include <dev/nvme/nvme.h>
|
|
#include <dev/nvmf/host/nvmf_var.h>
|
|
|
|
struct nvmf_namespace {
|
|
struct nvmf_softc *sc;
|
|
uint64_t size;
|
|
uint32_t id;
|
|
u_int flags;
|
|
uint32_t lba_size;
|
|
bool disconnected;
|
|
|
|
TAILQ_HEAD(, bio) pending_bios;
|
|
struct mtx lock;
|
|
volatile u_int active_bios;
|
|
|
|
struct cdev *cdev;
|
|
};
|
|
|
|
/* Declared early because nvmf_ns_biodone() resubmits aborted bios via it. */
static void nvmf_ns_strategy(struct bio *bio);
|
|
|
|
static void
|
|
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
|
|
{
|
|
char buf[128];
|
|
struct sbuf sb;
|
|
va_list ap;
|
|
|
|
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
|
|
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
|
|
|
|
sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
|
|
ns->id);
|
|
|
|
va_start(ap, fmt);
|
|
sbuf_vprintf(&sb, fmt, ap);
|
|
va_end(ap);
|
|
|
|
sbuf_finish(&sb);
|
|
sbuf_delete(&sb);
|
|
}
|
|
|
|
/*
|
|
* The I/O completion may trigger after the received CQE if the I/O
|
|
* used a zero-copy mbuf that isn't harvested until after the NIC
|
|
* driver processes TX completions. Abuse bio_driver1 as a refcount.
|
|
* Store I/O errors in bio_driver2.
|
|
*/
|
|
static __inline u_int *
|
|
bio_refs(struct bio *bio)
|
|
{
|
|
return ((u_int *)&bio->bio_driver1);
|
|
}
|
|
|
|
static void
|
|
nvmf_ns_biodone(struct bio *bio)
|
|
{
|
|
struct nvmf_namespace *ns;
|
|
int error;
|
|
|
|
if (!refcount_release(bio_refs(bio)))
|
|
return;
|
|
|
|
ns = bio->bio_dev->si_drv1;
|
|
|
|
/* If a request is aborted, resubmit or queue it for resubmission. */
|
|
if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
|
|
bio->bio_error = 0;
|
|
bio->bio_driver2 = 0;
|
|
mtx_lock(&ns->lock);
|
|
if (ns->disconnected) {
|
|
if (nvmf_fail_disconnect) {
|
|
mtx_unlock(&ns->lock);
|
|
bio->bio_error = ECONNABORTED;
|
|
bio->bio_flags |= BIO_ERROR;
|
|
bio->bio_resid = bio->bio_bcount;
|
|
biodone(bio);
|
|
} else {
|
|
TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
|
|
bio_queue);
|
|
mtx_unlock(&ns->lock);
|
|
}
|
|
} else {
|
|
mtx_unlock(&ns->lock);
|
|
nvmf_ns_strategy(bio);
|
|
}
|
|
} else {
|
|
/*
|
|
* I/O errors take precedence over generic EIO from
|
|
* CQE errors.
|
|
*/
|
|
error = (intptr_t)bio->bio_driver2;
|
|
if (error != 0)
|
|
bio->bio_error = error;
|
|
if (bio->bio_error != 0)
|
|
bio->bio_flags |= BIO_ERROR;
|
|
biodone(bio);
|
|
}
|
|
|
|
if (refcount_release(&ns->active_bios))
|
|
wakeup(ns);
|
|
}
|
|
|
|
static void
|
|
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
|
|
{
|
|
struct bio *bio = arg;
|
|
|
|
KASSERT(xfered <= bio->bio_bcount,
|
|
("%s: xfered > bio_bcount", __func__));
|
|
|
|
bio->bio_driver2 = (void *)(intptr_t)error;
|
|
bio->bio_resid = bio->bio_bcount - xfered;
|
|
|
|
nvmf_ns_biodone(bio);
|
|
}
|
|
|
|
static void
|
|
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
|
|
{
|
|
struct bio *bio = arg;
|
|
|
|
if (error != 0)
|
|
bio->bio_resid = bio->bio_bcount;
|
|
else
|
|
bio->bio_resid = 0;
|
|
|
|
free(bio->bio_driver2, M_NVMF);
|
|
bio->bio_driver2 = (void *)(intptr_t)error;
|
|
|
|
nvmf_ns_biodone(bio);
|
|
}
|
|
|
|
static void
|
|
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
|
|
{
|
|
struct bio *bio = arg;
|
|
|
|
if (nvmf_cqe_aborted(cqe))
|
|
bio->bio_error = ECONNABORTED;
|
|
else if (cqe->status != 0)
|
|
bio->bio_error = EIO;
|
|
|
|
nvmf_ns_biodone(bio);
|
|
}
|
|
|
|
static int
|
|
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
|
|
{
|
|
struct nvme_command cmd;
|
|
struct nvmf_request *req;
|
|
struct nvme_dsm_range *dsm_range;
|
|
struct memdesc mem;
|
|
uint64_t lba, lba_count;
|
|
int error;
|
|
|
|
dsm_range = NULL;
|
|
memset(&cmd, 0, sizeof(cmd));
|
|
switch (bio->bio_cmd) {
|
|
case BIO_READ:
|
|
lba = bio->bio_offset / ns->lba_size;
|
|
lba_count = bio->bio_bcount / ns->lba_size;
|
|
nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
|
|
break;
|
|
case BIO_WRITE:
|
|
lba = bio->bio_offset / ns->lba_size;
|
|
lba_count = bio->bio_bcount / ns->lba_size;
|
|
nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
|
|
break;
|
|
case BIO_FLUSH:
|
|
nvme_ns_flush_cmd(&cmd, ns->id);
|
|
break;
|
|
case BIO_DELETE:
|
|
dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
|
|
M_ZERO);
|
|
if (dsm_range == NULL)
|
|
return (ENOMEM);
|
|
lba = bio->bio_offset / ns->lba_size;
|
|
lba_count = bio->bio_bcount / ns->lba_size;
|
|
dsm_range->starting_lba = htole64(lba);
|
|
dsm_range->length = htole32(lba_count);
|
|
|
|
cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
|
|
cmd.nsid = htole32(ns->id);
|
|
cmd.cdw10 = htole32(0); /* 1 range */
|
|
cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
|
|
break;
|
|
default:
|
|
return (EOPNOTSUPP);
|
|
}
|
|
|
|
mtx_lock(&ns->lock);
|
|
if (ns->disconnected) {
|
|
if (nvmf_fail_disconnect) {
|
|
error = ECONNABORTED;
|
|
} else {
|
|
TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
|
|
error = 0;
|
|
}
|
|
mtx_unlock(&ns->lock);
|
|
free(dsm_range, M_NVMF);
|
|
return (error);
|
|
}
|
|
|
|
req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
|
|
nvmf_ns_bio_complete, bio, M_NOWAIT);
|
|
if (req == NULL) {
|
|
mtx_unlock(&ns->lock);
|
|
free(dsm_range, M_NVMF);
|
|
return (ENOMEM);
|
|
}
|
|
|
|
switch (bio->bio_cmd) {
|
|
case BIO_READ:
|
|
case BIO_WRITE:
|
|
refcount_init(bio_refs(bio), 2);
|
|
mem = memdesc_bio(bio);
|
|
nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
|
|
bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
|
|
break;
|
|
case BIO_DELETE:
|
|
refcount_init(bio_refs(bio), 2);
|
|
mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
|
|
nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
|
|
true, nvmf_ns_delete_complete, bio);
|
|
bio->bio_driver2 = dsm_range;
|
|
break;
|
|
default:
|
|
refcount_init(bio_refs(bio), 1);
|
|
KASSERT(bio->bio_resid == 0,
|
|
("%s: input bio_resid != 0", __func__));
|
|
break;
|
|
}
|
|
|
|
refcount_acquire(&ns->active_bios);
|
|
nvmf_submit_request(req);
|
|
mtx_unlock(&ns->lock);
|
|
return (0);
|
|
}
|
|
|
|
static int
|
|
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
|
|
struct thread *td)
|
|
{
|
|
struct nvmf_namespace *ns = dev->si_drv1;
|
|
struct nvme_get_nsid *gnsid;
|
|
struct nvme_pt_command *pt;
|
|
|
|
switch (cmd) {
|
|
case NVME_PASSTHROUGH_CMD:
|
|
pt = (struct nvme_pt_command *)arg;
|
|
pt->cmd.nsid = htole32(ns->id);
|
|
return (nvmf_passthrough_cmd(ns->sc, pt, false));
|
|
case NVME_GET_NSID:
|
|
gnsid = (struct nvme_get_nsid *)arg;
|
|
strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
|
|
sizeof(gnsid->cdev));
|
|
gnsid->nsid = ns->id;
|
|
return (0);
|
|
case DIOCGMEDIASIZE:
|
|
*(off_t *)arg = ns->size;
|
|
return (0);
|
|
case DIOCGSECTORSIZE:
|
|
*(u_int *)arg = ns->lba_size;
|
|
return (0);
|
|
default:
|
|
return (ENOTTY);
|
|
}
|
|
}
|
|
|
|
static int
|
|
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
|
|
{
|
|
int error;
|
|
|
|
error = 0;
|
|
if ((oflags & FWRITE) != 0)
|
|
error = securelevel_gt(td->td_ucred, 0);
|
|
return (error);
|
|
}
|
|
|
|
void
|
|
nvmf_ns_strategy(struct bio *bio)
|
|
{
|
|
struct nvmf_namespace *ns;
|
|
int error;
|
|
|
|
ns = bio->bio_dev->si_drv1;
|
|
|
|
error = nvmf_ns_submit_bio(ns, bio);
|
|
if (error != 0) {
|
|
bio->bio_error = error;
|
|
bio->bio_flags |= BIO_ERROR;
|
|
bio->bio_resid = bio->bio_bcount;
|
|
biodone(bio);
|
|
}
|
|
}
|
|
|
|
static struct cdevsw nvmf_ns_cdevsw = {
|
|
.d_version = D_VERSION,
|
|
.d_flags = D_DISK,
|
|
.d_open = nvmf_ns_open,
|
|
.d_read = physread,
|
|
.d_write = physwrite,
|
|
.d_strategy = nvmf_ns_strategy,
|
|
.d_ioctl = nvmf_ns_ioctl
|
|
};
|
|
|
|
struct nvmf_namespace *
|
|
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
|
|
const struct nvme_namespace_data *data)
|
|
{
|
|
struct make_dev_args mda;
|
|
struct nvmf_namespace *ns;
|
|
int error;
|
|
uint8_t lbads, lbaf;
|
|
|
|
ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
|
|
ns->sc = sc;
|
|
ns->id = id;
|
|
TAILQ_INIT(&ns->pending_bios);
|
|
mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
|
|
|
|
/* One dummy bio avoids dropping to 0 until destroy. */
|
|
refcount_init(&ns->active_bios, 1);
|
|
|
|
if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
|
|
ns_printf(ns, "End-to-end data protection not supported\n");
|
|
goto fail;
|
|
}
|
|
|
|
lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
|
|
if (lbaf > data->nlbaf) {
|
|
ns_printf(ns, "Invalid LBA format index\n");
|
|
goto fail;
|
|
}
|
|
|
|
if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
|
|
ns_printf(ns, "Namespaces with metadata are not supported\n");
|
|
goto fail;
|
|
}
|
|
|
|
lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
|
|
if (lbads == 0) {
|
|
ns_printf(ns, "Invalid LBA format index\n");
|
|
goto fail;
|
|
}
|
|
|
|
ns->lba_size = 1 << lbads;
|
|
ns->size = data->nsze * ns->lba_size;
|
|
|
|
if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
|
|
ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
|
|
|
|
if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
|
|
ns->flags |= NVME_NS_FLUSH_SUPPORTED;
|
|
|
|
/*
|
|
* XXX: Does any of the boundary splitting for NOIOB make any
|
|
* sense for Fabrics?
|
|
*/
|
|
|
|
make_dev_args_init(&mda);
|
|
mda.mda_devsw = &nvmf_ns_cdevsw;
|
|
mda.mda_uid = UID_ROOT;
|
|
mda.mda_gid = GID_WHEEL;
|
|
mda.mda_mode = 0600;
|
|
mda.mda_si_drv1 = ns;
|
|
error = make_dev_s(&mda, &ns->cdev, "%sn%u",
|
|
device_get_nameunit(sc->dev), id);
|
|
if (error != 0)
|
|
goto fail;
|
|
ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
|
|
device_get_nameunit(sc->dev), id);
|
|
|
|
ns->cdev->si_flags |= SI_UNMAPPED;
|
|
|
|
return (ns);
|
|
fail:
|
|
mtx_destroy(&ns->lock);
|
|
free(ns, M_NVMF);
|
|
return (NULL);
|
|
}
|
|
|
|
void
|
|
nvmf_disconnect_ns(struct nvmf_namespace *ns)
|
|
{
|
|
mtx_lock(&ns->lock);
|
|
ns->disconnected = true;
|
|
mtx_unlock(&ns->lock);
|
|
}
|
|
|
|
void
|
|
nvmf_reconnect_ns(struct nvmf_namespace *ns)
|
|
{
|
|
TAILQ_HEAD(, bio) bios;
|
|
struct bio *bio;
|
|
|
|
mtx_lock(&ns->lock);
|
|
ns->disconnected = false;
|
|
TAILQ_INIT(&bios);
|
|
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
|
|
mtx_unlock(&ns->lock);
|
|
|
|
while (!TAILQ_EMPTY(&bios)) {
|
|
bio = TAILQ_FIRST(&bios);
|
|
TAILQ_REMOVE(&bios, bio, bio_queue);
|
|
nvmf_ns_strategy(bio);
|
|
}
|
|
}
|
|
|
|
void
|
|
nvmf_destroy_ns(struct nvmf_namespace *ns)
|
|
{
|
|
TAILQ_HEAD(, bio) bios;
|
|
struct bio *bio;
|
|
|
|
if (ns->cdev->si_drv2 != NULL)
|
|
destroy_dev(ns->cdev->si_drv2);
|
|
destroy_dev(ns->cdev);
|
|
|
|
/*
|
|
* Wait for active I/O requests to drain. The release drops
|
|
* the reference on the "dummy bio" when the namespace is
|
|
* created.
|
|
*/
|
|
mtx_lock(&ns->lock);
|
|
if (!refcount_release(&ns->active_bios)) {
|
|
while (ns->active_bios != 0)
|
|
mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
|
|
}
|
|
|
|
/* Abort any pending I/O requests. */
|
|
TAILQ_INIT(&bios);
|
|
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
|
|
mtx_unlock(&ns->lock);
|
|
|
|
while (!TAILQ_EMPTY(&bios)) {
|
|
bio = TAILQ_FIRST(&bios);
|
|
TAILQ_REMOVE(&bios, bio, bio_queue);
|
|
bio->bio_error = ECONNABORTED;
|
|
bio->bio_flags |= BIO_ERROR;
|
|
bio->bio_resid = bio->bio_bcount;
|
|
biodone(bio);
|
|
}
|
|
|
|
mtx_destroy(&ns->lock);
|
|
free(ns, M_NVMF);
|
|
}
|
|
|
|
bool
|
|
nvmf_update_ns(struct nvmf_namespace *ns,
|
|
const struct nvme_namespace_data *data)
|
|
{
|
|
uint8_t lbads, lbaf;
|
|
|
|
if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
|
|
ns_printf(ns, "End-to-end data protection not supported\n");
|
|
return (false);
|
|
}
|
|
|
|
lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
|
|
if (lbaf > data->nlbaf) {
|
|
ns_printf(ns, "Invalid LBA format index\n");
|
|
return (false);
|
|
}
|
|
|
|
if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
|
|
ns_printf(ns, "Namespaces with metadata are not supported\n");
|
|
return (false);
|
|
}
|
|
|
|
lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
|
|
if (lbads == 0) {
|
|
ns_printf(ns, "Invalid LBA format index\n");
|
|
return (false);
|
|
}
|
|
|
|
ns->lba_size = 1 << lbads;
|
|
ns->size = data->nsze * ns->lba_size;
|
|
return (true);
|
|
}
|