src/sys/dev/nvmf/host/nvmf_ns.c

/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>

/*
 * Per-namespace state for a namespace exported as a character device.
 * The cdev's si_drv1 points back at this structure.
 */
struct nvmf_namespace {
	/* Controller softc this namespace belongs to. */
	struct nvmf_softc *sc;
	/* Namespace size in bytes (nsze * lba_size). */
	uint64_t size;
	/* Namespace ID placed in every command for this namespace. */
	uint32_t id;
	/* NVME_NS_*_SUPPORTED capability bits set at init time. */
	u_int	flags;
	/* Logical block size in bytes (1 << LBADS of the active format). */
	uint32_t lba_size;
	/* Set by nvmf_disconnect_ns(), cleared by nvmf_reconnect_ns(). */
	bool disconnected;

	/* Bios parked while disconnected, awaiting resubmission. */
	TAILQ_HEAD(, bio) pending_bios;
	/* Held around accesses to 'disconnected' and 'pending_bios'. */
	struct mtx lock;
	/*
	 * Reference count of submitted bios.  Starts at 1 (a "dummy bio")
	 * so that it cannot reach zero before nvmf_destroy_ns().
	 */
	volatile u_int active_bios;

	/* Device node; its si_drv2 holds the alias device, if any. */
	struct cdev *cdev;
};

/*
 * Forward declaration: nvmf_ns_biodone() resubmits aborted bios through
 * the strategy routine, which is defined later in the file.
 */
static void	nvmf_ns_strategy(struct bio *bio);

static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
	char buf[128];
	struct sbuf sb;
	va_list ap;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	sbuf_set_drain(&sb, sbuf_printf_drain, NULL);

	sbuf_printf(&sb, "%sn%u: ", device_get_nameunit(ns->sc->dev),
	    ns->id);

	va_start(ap, fmt);
	sbuf_vprintf(&sb, fmt, ap);
	va_end(ap);

	sbuf_finish(&sb);
	sbuf_delete(&sb);
}

/*
 * The I/O completion may trigger after the received CQE if the I/O
 * used a zero-copy mbuf that isn't harvested until after the NIC
 * driver processes TX completions.  Abuse bio_driver1 as a refcount.
 * Store I/O errors in bio_driver2.
 */
static __inline u_int *
bio_refs(struct bio *bio)
{
	return ((u_int *)&bio->bio_driver1);
}

/*
 * Drop one reference on a bio.  When the last reference is released the
 * bio is finished in one of three ways:
 *
 *  - it was aborted (ECONNABORTED) and failing on disconnect is not
 *    requested: it is resubmitted, or parked on pending_bios if the
 *    namespace is still disconnected;
 *  - otherwise it is completed via biodone(), with any data-transfer
 *    error stored in bio_driver2 taking precedence over a CQE error.
 *
 * Finally the namespace's active_bios reference taken at submission is
 * dropped, waking nvmf_destroy_ns() if this was the last one.
 */
static void
nvmf_ns_biodone(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	/* Only the final completion (CQE or data) finishes the bio. */
	if (!refcount_release(bio_refs(bio)))
		return;

	ns = bio->bio_dev->si_drv1;

	/* If a request is aborted, resubmit or queue it for resubmission. */
	if (bio->bio_error == ECONNABORTED && !nvmf_fail_disconnect) {
		/* Reset per-attempt state before the bio is reused. */
		bio->bio_error = 0;
		bio->bio_driver2 = 0;
		mtx_lock(&ns->lock);
		if (ns->disconnected) {
			/*
			 * NOTE(review): nvmf_fail_disconnect is re-read
			 * under the lock; presumably it can change at
			 * runtime -- confirm.
			 */
			if (nvmf_fail_disconnect) {
				mtx_unlock(&ns->lock);
				bio->bio_error = ECONNABORTED;
				bio->bio_flags |= BIO_ERROR;
				bio->bio_resid = bio->bio_bcount;
				biodone(bio);
			} else {
				TAILQ_INSERT_TAIL(&ns->pending_bios, bio,
				    bio_queue);
				mtx_unlock(&ns->lock);
			}
		} else {
			mtx_unlock(&ns->lock);
			nvmf_ns_strategy(bio);
		}
	} else {
		/*
		 * I/O errors take precedence over generic EIO from
		 * CQE errors.
		 */
		error = (intptr_t)bio->bio_driver2;
		if (error != 0)
			bio->bio_error = error;
		if (bio->bio_error != 0)
			bio->bio_flags |= BIO_ERROR;
		biodone(bio);
	}

	/*
	 * Drop the reference taken when the bio was submitted.  This is
	 * done under ns->lock because nvmf_destroy_ns() tests
	 * active_bios and then sleeps while holding the same lock; an
	 * unlocked release + wakeup could land between its test and its
	 * sleep, and the wakeup would be lost.
	 */
	mtx_lock(&ns->lock);
	if (refcount_release(&ns->active_bios))
		wakeup(ns);
	mtx_unlock(&ns->lock);
}

/*
 * Data-transfer completion callback for read and write bios.  Records
 * the transfer error in bio_driver2 and the untransferred byte count in
 * bio_resid, then drops this callback's reference on the bio.
 */
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp;

	bp = arg;
	KASSERT(xfered <= bp->bio_bcount,
	    ("%s: xfered > bio_bcount", __func__));

	bp->bio_resid = bp->bio_bcount - xfered;
	bp->bio_driver2 = (void *)(intptr_t)error;

	nvmf_ns_biodone(bp);
}

/*
 * Data-transfer completion callback for BIO_DELETE.  Frees the DSM
 * range buffer that was stashed in bio_driver2 at submission, replaces
 * it with the transfer error, and drops this callback's reference on
 * the bio.  The bio is treated as all-or-nothing: bio_resid is the full
 * byte count on error and zero otherwise.
 */
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
	struct bio *bp = arg;
	void *range;

	range = bp->bio_driver2;
	bp->bio_resid = (error != 0) ? bp->bio_bcount : 0;
	bp->bio_driver2 = (void *)(intptr_t)error;
	free(range, M_NVMF);

	nvmf_ns_biodone(bp);
}

static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
	struct bio *bio = arg;

	if (nvmf_cqe_aborted(cqe))
		bio->bio_error = ECONNABORTED;
	else if (cqe->status != 0)
		bio->bio_error = EIO;

	nvmf_ns_biodone(bio);
}

/*
 * Translate a bio into an NVMe command and submit it on an I/O queue.
 *
 * Returns 0 if the bio was submitted, or if the namespace is
 * disconnected and the bio was parked on pending_bios for later
 * resubmission.  Otherwise returns an errno and leaves completion of
 * the bio to the caller:
 *   EOPNOTSUPP   - bio_cmd is not READ, WRITE, FLUSH or DELETE
 *   ENOMEM       - DSM range or request allocation failed
 *   ECONNABORTED - disconnected and nvmf_fail_disconnect is set
 */
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
	struct nvme_command cmd;
	struct nvmf_request *req;
	struct nvme_dsm_range *dsm_range;
	struct memdesc mem;
	uint64_t lba, lba_count;
	int error;

	/* Build the command before taking the namespace lock. */
	dsm_range = NULL;
	memset(&cmd, 0, sizeof(cmd));
	switch (bio->bio_cmd) {
	case BIO_READ:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_WRITE:
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
		break;
	case BIO_FLUSH:
		nvme_ns_flush_cmd(&cmd, ns->id);
		break;
	case BIO_DELETE:
		/* The single DSM range is sent as the command's data. */
		dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
		    M_ZERO);
		if (dsm_range == NULL)
			return (ENOMEM);
		lba = bio->bio_offset / ns->lba_size;
		lba_count = bio->bio_bcount / ns->lba_size;
		dsm_range->starting_lba = htole64(lba);
		dsm_range->length = htole32(lba_count);

		cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
		cmd.nsid = htole32(ns->id);
		cmd.cdw10 = htole32(0);		/* 1 range */
		cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
		break;
	default:
		return (EOPNOTSUPP);
	}

	mtx_lock(&ns->lock);
	if (ns->disconnected) {
		/* Either fail the bio now or park it until reconnect. */
		if (nvmf_fail_disconnect) {
			error = ECONNABORTED;
		} else {
			TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
			error = 0;
		}
		mtx_unlock(&ns->lock);
		/*
		 * The command is rebuilt from the bio on resubmission, so
		 * the range buffer is not kept for a parked bio.
		 */
		free(dsm_range, M_NVMF);
		return (error);
	}

	req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
	    nvmf_ns_bio_complete, bio, M_NOWAIT);
	if (req == NULL) {
		mtx_unlock(&ns->lock);
		free(dsm_range, M_NVMF);
		return (ENOMEM);
	}

	/*
	 * Commands with a data phase get two bio references: one dropped
	 * by the CQE callback (nvmf_ns_bio_complete) and one by the data
	 * completion callback.  Commands without data only get the former.
	 */
	switch (bio->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_bio(bio);
		nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
		    bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
		break;
	case BIO_DELETE:
		refcount_init(bio_refs(bio), 2);
		mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
		nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
		    true, nvmf_ns_delete_complete, bio);
		/* Stash the buffer so nvmf_ns_delete_complete can free it. */
		bio->bio_driver2 = dsm_range;
		break;
	default:
		refcount_init(bio_refs(bio), 1);
		KASSERT(bio->bio_resid == 0,
		    ("%s: input bio_resid != 0", __func__));
		break;
	}

	/*
	 * Count the bio as active; nvmf_ns_biodone drops this reference.
	 * NOTE(review): the request is submitted with ns->lock still held,
	 * presumably so a disconnect cannot slip in between the
	 * 'disconnected' check and the submission -- confirm.
	 */
	refcount_acquire(&ns->active_bios);
	nvmf_submit_request(req);
	mtx_unlock(&ns->lock);
	return (0);
}

/*
 * ioctl handler for the namespace device.
 *
 * Supports NVMe passthrough (with the namespace ID forced to this
 * namespace), namespace ID lookup, and the media-size and sector-size
 * disk queries.  Any other request returns ENOTTY.
 */
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
    struct thread *td)
{
	struct nvmf_namespace *ns;
	struct nvme_pt_command *pt;
	struct nvme_get_nsid *gnsid;
	int error;

	ns = dev->si_drv1;
	error = 0;

	switch (cmd) {
	case NVME_PASSTHROUGH_CMD:
		/* Passthrough commands always target this namespace. */
		pt = (struct nvme_pt_command *)arg;
		pt->cmd.nsid = htole32(ns->id);
		error = nvmf_passthrough_cmd(ns->sc, pt, false);
		break;
	case NVME_GET_NSID:
		/* Report the parent controller's name and our NSID. */
		gnsid = (struct nvme_get_nsid *)arg;
		strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
		    sizeof(gnsid->cdev));
		gnsid->nsid = ns->id;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)arg = ns->size;
		break;
	case DIOCGSECTORSIZE:
		*(u_int *)arg = ns->lba_size;
		break;
	default:
		error = ENOTTY;
		break;
	}

	return (error);
}

/*
 * Open handler for the namespace device.  Opens without FWRITE always
 * succeed; opens for writing return the result of securelevel_gt()
 * against level 0 for the calling thread's credentials.
 */
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
	if ((oflags & FWRITE) == 0)
		return (0);

	return (securelevel_gt(td->td_ucred, 0));
}

/*
 * Strategy routine for the namespace device: hand the bio to
 * nvmf_ns_submit_bio().  If submission fails, complete the bio
 * immediately with that error and nothing transferred.
 *
 * Declared static to match the forward declaration at the top of the
 * file; the function is only referenced from within this file.
 */
static void
nvmf_ns_strategy(struct bio *bio)
{
	struct nvmf_namespace *ns;
	int error;

	ns = bio->bio_dev->si_drv1;

	error = nvmf_ns_submit_bio(ns, bio);
	if (error == 0)
		return;		/* submitted or queued for later */

	bio->bio_error = error;
	bio->bio_flags |= BIO_ERROR;
	bio->bio_resid = bio->bio_bcount;
	biodone(bio);
}

/*
 * Character device switch for namespace devices.  Reads and writes go
 * through physread/physwrite, with nvmf_ns_strategy as the strategy
 * routine.
 */
static struct cdevsw nvmf_ns_cdevsw = {
	.d_version = D_VERSION,
	.d_flags = D_DISK,
	.d_open = nvmf_ns_open,
	.d_read = physread,
	.d_write = physwrite,
	.d_strategy = nvmf_ns_strategy,
	.d_ioctl = nvmf_ns_ioctl
};

/*
 * Allocate and initialize the state for namespace 'id' of controller
 * 'sc' from its Identify Namespace data, and create its device node
 * ("<controller>n<id>") plus an alias ("<controller>ns<id>").
 *
 * Namespaces using end-to-end data protection, per-block metadata, or
 * an invalid/unsupported LBA format are rejected with a console
 * message.
 *
 * Returns the new namespace, or NULL if the namespace is unsupported
 * or the device node could not be created.
 */
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
    const struct nvme_namespace_data *data)
{
	struct make_dev_args mda;
	struct nvmf_namespace *ns;
	int error;
	uint8_t lbads, lbaf;

	ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
	ns->sc = sc;
	ns->id = id;
	TAILQ_INIT(&ns->pending_bios);
	mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);

	/* One dummy bio avoids dropping to 0 until destroy. */
	refcount_init(&ns->active_bios, 1);

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		goto fail;
	}

	/* The active format index must not exceed the reported count. */
	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		goto fail;
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		goto fail;
	}

	/*
	 * lba_size is 32 bits wide.  A larger shift count supplied by
	 * the controller cannot be represented and would be undefined
	 * behavior in the shift below.
	 */
	if (lbads >= 32) {
		ns_printf(ns, "Unsupported LBA data size\n");
		goto fail;
	}

	ns->lba_size = (uint32_t)1 << lbads;
	ns->size = data->nsze * ns->lba_size;

	if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
		ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;

	if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
		ns->flags |= NVME_NS_FLUSH_SUPPORTED;

	/*
	 * XXX: Does any of the boundary splitting for NOIOB make any
	 * sense for Fabrics?
	 */

	make_dev_args_init(&mda);
	mda.mda_devsw = &nvmf_ns_cdevsw;
	mda.mda_uid = UID_ROOT;
	mda.mda_gid = GID_WHEEL;
	mda.mda_mode = 0600;
	mda.mda_si_drv1 = ns;
	error = make_dev_s(&mda, &ns->cdev, "%sn%u",
	    device_get_nameunit(sc->dev), id);
	if (error != 0)
		goto fail;

	/* Remember the alias in si_drv2 so destroy can remove it. */
	ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%u",
	    device_get_nameunit(sc->dev), id);

	ns->cdev->si_flags |= SI_UNMAPPED;

	return (ns);
fail:
	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
	return (NULL);
}

/*
 * Mark the namespace as disconnected.  While the flag is set,
 * nvmf_ns_submit_bio() parks or fails new bios instead of submitting
 * them.
 */
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
	mtx_lock(&ns->lock);
	ns->disconnected = true;
	mtx_unlock(&ns->lock);
}

void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	mtx_lock(&ns->lock);
	ns->disconnected = false;
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		nvmf_ns_strategy(bio);
	}
}

void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
	TAILQ_HEAD(, bio) bios;
	struct bio *bio;

	if (ns->cdev->si_drv2 != NULL)
		destroy_dev(ns->cdev->si_drv2);
	destroy_dev(ns->cdev);

	/*
	 * Wait for active I/O requests to drain.  The release drops
	 * the reference on the "dummy bio" when the namespace is
	 * created.
	 */
	mtx_lock(&ns->lock);
	if (!refcount_release(&ns->active_bios)) {
		while (ns->active_bios != 0)
			mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
	}

	/* Abort any pending I/O requests. */
	TAILQ_INIT(&bios);
	TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
	mtx_unlock(&ns->lock);

	while (!TAILQ_EMPTY(&bios)) {
		bio = TAILQ_FIRST(&bios);
		TAILQ_REMOVE(&bios, bio, bio_queue);
		bio->bio_error = ECONNABORTED;
		bio->bio_flags |= BIO_ERROR;
		bio->bio_resid = bio->bio_bcount;
		biodone(bio);
	}

	mtx_destroy(&ns->lock);
	free(ns, M_NVMF);
}

bool
nvmf_update_ns(struct nvmf_namespace *ns,
    const struct nvme_namespace_data *data)
{
	uint8_t lbads, lbaf;

	if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
		ns_printf(ns, "End-to-end data protection not supported\n");
		return (false);
	}

	lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
	if (lbaf > data->nlbaf) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
		ns_printf(ns, "Namespaces with metadata are not supported\n");
		return (false);
	}

	lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
	if (lbads == 0) {
		ns_printf(ns, "Invalid LBA format index\n");
		return (false);
	}

	ns->lba_size = 1 << lbads;
	ns->size = data->nsze * ns->lba_size;
	return (true);
}