From bcecad2c24aa500913559c00f1be8b364a3ff150 Mon Sep 17 00:00:00 2001
From: Ruslan Bukin
Date: Tue, 12 May 2026 11:09:04 +0100
Subject: [PATCH] riscv: IOMMU support
Support for RISC-V IOMMU spec v1.0.1 (ratified)
https://github.com/riscv-non-isa/riscv-iommu
Supports translation for PCI devices only.
Supports 1 or 2-level device-directory-table (DDT).
Supports SV39 and SV48 virtual memory system (on per-device basis).
Supports both "standard" and "extended" device-context (DC) structure.
Supports "bypass" mode to disable translation for a particular device.
Supports WSI (Wire-Signalled Interrupts) only.
This includes both PCI-bus and FDT attachment drivers.
Note in case of PCI-bus attachment, interrupts are not available. In this
case no error report is provided in case of translation fault. Otherwise
interrupts are not needed.
Differential Revision: https://reviews.freebsd.org/D55922
---
sys/conf/files.riscv | 9 +
sys/riscv/conf/GENERIC | 1 +
sys/riscv/include/bus_dma_impl.h | 3 +
sys/riscv/include/iommu.h | 10 +
sys/riscv/iommu/iommu.c | 1351 ++++++++++++++++++++++++++++++
sys/riscv/iommu/iommu.h | 359 ++++++++
sys/riscv/iommu/iommu_fdt.c | 145 ++++
sys/riscv/iommu/iommu_frontend.c | 505 +++++++++++
sys/riscv/iommu/iommu_frontend.h | 38 +
sys/riscv/iommu/iommu_if.m | 147 ++++
sys/riscv/iommu/iommu_pci.c | 172 ++++
sys/riscv/iommu/iommu_pmap.c | 629 ++++++++++++++
sys/riscv/iommu/iommu_pmap.h | 49 ++
13 files changed, 3418 insertions(+)
create mode 100644 sys/riscv/include/iommu.h
create mode 100644 sys/riscv/iommu/iommu.c
create mode 100644 sys/riscv/iommu/iommu.h
create mode 100644 sys/riscv/iommu/iommu_fdt.c
create mode 100644 sys/riscv/iommu/iommu_frontend.c
create mode 100644 sys/riscv/iommu/iommu_frontend.h
create mode 100644 sys/riscv/iommu/iommu_if.m
create mode 100644 sys/riscv/iommu/iommu_pci.c
create mode 100644 sys/riscv/iommu/iommu_pmap.c
create mode 100644 sys/riscv/iommu/iommu_pmap.h
diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv
index 90a74367a54..58a31b5f326 100644
--- a/sys/conf/files.riscv
+++ b/sys/conf/files.riscv
@@ -6,6 +6,7 @@ cddl/dev/fbt/riscv/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${
crypto/des/des_enc.c optional netsmb
dev/cpufreq/cpufreq_dt.c optional cpufreq fdt
dev/ofw/ofw_cpu.c optional fdt
+dev/ofw/ofw_pci.c optional pci fdt
dev/ofw/ofw_pcib.c optional pci fdt
dev/pci/pci_dw.c optional pci fdt
dev/pci/pci_dw_if.m optional pci fdt
@@ -42,6 +43,14 @@ libkern/memset.c standard
libkern/strcmp.c standard
libkern/strlen.c standard
libkern/strncmp.c standard
+riscv/iommu/iommu_frontend.c standard
+riscv/iommu/iommu_if.m standard
+riscv/iommu/iommu.c standard
+riscv/iommu/iommu_fdt.c optional fdt
+riscv/iommu/iommu_pci.c optional pci
+riscv/iommu/iommu_pmap.c optional iommu
+dev/iommu/busdma_iommu.c optional iommu
+dev/iommu/iommu_gas.c optional iommu
riscv/riscv/aplic.c standard
riscv/riscv/autoconf.c standard
riscv/riscv/bus_machdep.c standard
diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC
index 827d5efef50..cce2787ed5d 100644
--- a/sys/riscv/conf/GENERIC
+++ b/sys/riscv/conf/GENERIC
@@ -77,6 +77,7 @@ options RACCT # Resource accounting framework
options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default
options RCTL # Resource limits
options SMP
+options IOMMU
# RISC-V SBI console
device rcons
diff --git a/sys/riscv/include/bus_dma_impl.h b/sys/riscv/include/bus_dma_impl.h
index 09fd29b74f8..8c2040a68f5 100644
--- a/sys/riscv/include/bus_dma_impl.h
+++ b/sys/riscv/include/bus_dma_impl.h
@@ -41,6 +41,7 @@ struct bus_dma_tag_common {
int flags;
bus_dma_lock_t *lockfunc;
void *lockfuncarg;
+ int domain;
};
struct bus_dma_impl {
@@ -52,6 +53,8 @@ struct bus_dma_impl {
int (*tag_destroy)(bus_dma_tag_t dmat);
int (*map_create)(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp);
int (*map_destroy)(bus_dma_tag_t dmat, bus_dmamap_t map);
+ int (*tag_set_domain)(bus_dma_tag_t);
+ bool (*id_mapped)(bus_dma_tag_t, vm_paddr_t, bus_size_t);
int (*mem_alloc)(bus_dma_tag_t dmat, void** vaddr, int flags,
bus_dmamap_t *mapp);
void (*mem_free)(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map);
diff --git a/sys/riscv/include/iommu.h b/sys/riscv/include/iommu.h
new file mode 100644
index 00000000000..38214b7003f
--- /dev/null
+++ b/sys/riscv/include/iommu.h
@@ -0,0 +1,10 @@
+/*-
+ * This file is in the public domain.
+ */
+
+#ifndef _MACHINE_IOMMU_H_
+#define _MACHINE_IOMMU_H_
+
+#include
+
+#endif /* !_MACHINE_IOMMU_H_ */
diff --git a/sys/riscv/iommu/iommu.c b/sys/riscv/iommu/iommu.c
new file mode 100644
index 00000000000..59df2e68384
--- /dev/null
+++ b/sys/riscv/iommu/iommu.c
@@ -0,0 +1,1351 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "iommu_if.h"
+
+#define dprintf(fmt, ...)
+
+MALLOC_DEFINE(M_IOMMU, "RISCV_IOMMU", "RISC-V IOMMU");
+
+#define RD4(sc, reg) bus_read_4(sc->res[0], (reg))
+#define WR4(sc, reg, val) bus_write_4(sc->res[0], (reg), (val))
+#define RD8(sc, reg) bus_read_8(sc->res[0], (reg))
+#define WR8(sc, reg, val) bus_write_8(sc->res[0], (reg), (val))
+
+#define CQ_ENTRY_DWORDS 2 /* 16-byte */
+#define CQ_ENTRY_COUNT 8192 /* Amount of 16-byte entries. */
+#define FQ_ENTRY_DWORDS 4 /* 32-byte */
+#define FQ_ENTRY_COUNT 8192 /* Amount of 32-byte entries. */
+#define PQ_ENTRY_DWORDS 2 /* 16-byte */
+#define PQ_ENTRY_COUNT 8192 /* Amount of 16-byte entries. */
+
+#define DDT_NON_LEAF_DWORDS 1
+#define DDT_DC_STD_DWORDS 4 /* Standard-format DC. */
+#define DDT_DC_EXT_DWORDS 8 /* Extended-format DC. */
+#define DDT_L1_DID_BITS 9 /* All formats. */
+
+#define QUEUE_ALIGN (1024 * 1024) /* TODO */
+#define QUEUE_HEAD(q) ((q)->csr + RISCV_IOMMU_CQH - RISCV_IOMMU_CQB)
+#define QUEUE_TAIL(q) ((q)->csr + RISCV_IOMMU_CQT - RISCV_IOMMU_CQB)
+#define QUEUE_IPSR(q) (1 << (q)->idx)
+
+#define PHYS_TO_PPN(p) ((p) >> 12)
+
+struct riscv_iommu_fq_event {
+ uint16_t cause_id;
+ char *descr;
+};
+
+static struct riscv_iommu_fq_event fq_events[] = {
+ { FQ_CAUSE_INST_FAULT, "Instruction access fault" },
+ { FQ_CAUSE_RD_ADDR_MISALIGNED, "Read address misaligned" },
+ { FQ_CAUSE_RD_FAULT, "Read access fault" },
+ { FQ_CAUSE_WR_ADDR_MISALIGNED, "Write/AMO address misaligned" },
+ { FQ_CAUSE_WR_FAULT, "Write/AMO access fault" },
+ { FQ_CAUSE_INST_FAULT_S, "Instruction page fault" },
+ { FQ_CAUSE_RD_FAULT_S, "Read page fault" },
+ { FQ_CAUSE_WR_FAULT_S, "Write/AMO page fault" },
+ { FQ_CAUSE_INST_FAULT_VS, "Instruction guest page fault" },
+ { FQ_CAUSE_RD_FAULT_VS, "Read guest-page fault" },
+ { FQ_CAUSE_WR_FAULT_VS, "Write/AMO guest-page fault" },
+ { FQ_CAUSE_DMA_DISABLED, "All inbound transactions disallowed" },
+ { FQ_CAUSE_DDT_LOAD_FAULT, "DDT entry load access fault" },
+ { FQ_CAUSE_DDT_INVALID, "DDT entry not valid" },
+ { FQ_CAUSE_DDT_MISCONFIGURED, "DDT entry misconfigured" },
+ { FQ_CAUSE_TR_TYPE_DISALLOWED, "Transaction type disallowed" },
+ { FQ_CAUSE_MSI_LOAD_FAULT, "MSI PTE load access fault" },
+ { FQ_CAUSE_MSI_INVALID, "MSI PTE not valid" },
+ { FQ_CAUSE_MSI_MISCONFIGURED, "MSI PTE misconfigured" },
+ { FQ_CAUSE_MRIF_FAULT, "MRIF access fault" },
+ { FQ_CAUSE_PDT_LOAD_FAULT, "PDT entry load access fault" },
+ { FQ_CAUSE_PDT_INVALID, "PDT entry not valid" },
+ { FQ_CAUSE_PDT_MISCONFIGURED, "PDT entry misconfigured" },
+ { FQ_CAUSE_DDT_CORRUPTED, "DDT data corruption" },
+ { FQ_CAUSE_PDT_CORRUPTED, "PDT data corruption" },
+ { FQ_CAUSE_MSI_PT_CORRUPTED, "MSI PT data corruption" },
+ { FQ_CAUSE_MRIF_CORRUPTED, "MSI MRIF data corruption" },
+ { FQ_CAUSE_INTERNAL_DP_ERROR, "Internal data path error" },
+ { FQ_CAUSE_MSI_WR_FAULT, "IOMMU MSI write access fault" },
+ { FQ_CAUSE_PT_CORRUPTED, "1st/2nd-stage PT data corruption" },
+ { 0, NULL },
+};
+
+static void
+riscv_iommu_init_pscids(struct riscv_iommu_softc *sc)
+{
+
+ sc->pscid_set_size = (1 << sc->pscid_bits);
+ sc->pscid_set = bit_alloc(sc->pscid_set_size, M_IOMMU, M_WAITOK);
+ mtx_init(&sc->pscid_set_mutex, "pscid set", NULL, MTX_SPIN);
+}
+
+static int
+riscv_iommu_pscid_alloc(struct riscv_iommu_softc *sc, int *new_pscid)
+{
+
+ mtx_lock_spin(&sc->pscid_set_mutex);
+ bit_ffc(sc->pscid_set, sc->pscid_set_size, new_pscid);
+ if (*new_pscid == -1) {
+ mtx_unlock_spin(&sc->pscid_set_mutex);
+ return (ENOMEM);
+ }
+ bit_set(sc->pscid_set, *new_pscid);
+ mtx_unlock_spin(&sc->pscid_set_mutex);
+
+ return (0);
+}
+
+static void
+riscv_iommu_pscid_free(struct riscv_iommu_softc *sc, int pscid)
+{
+
+ mtx_lock_spin(&sc->pscid_set_mutex);
+ bit_clear(sc->pscid_set, pscid);
+ mtx_unlock_spin(&sc->pscid_set_mutex);
+}
+
+static uint32_t
+riscv_iommu_q_inc_tail(struct riscv_iommu_queue *q)
+{
+
+ return ((q->lc.tail + 1) & q->mask);
+}
+
+static uint32_t
+riscv_iommu_q_inc_head(struct riscv_iommu_queue *q)
+{
+
+ return ((q->lc.head + 1) & q->mask);
+}
+
+static int
+riscv_iommu_q_has_space(struct riscv_iommu_queue *q)
+{
+
+ if (riscv_iommu_q_inc_tail(q) != q->lc.head)
+ return (1);
+
+ return (0);
+}
+
+static int
+riscv_iommu_q_empty(struct riscv_iommu_queue *q)
+{
+
+ if (q->lc.tail == q->lc.head)
+ return (1);
+
+ return (0);
+}
+
+static int
+riscv_iommu_dequeue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q,
+ void *data)
+{
+ void *entry_addr;
+
+ q->lc.val = RD8(sc, q->head_off);
+ entry_addr = (void *)((uint64_t)q->vaddr + q->lc.head * q->entry_size);
+ memcpy(data, entry_addr, q->entry_size);
+ q->lc.head = riscv_iommu_q_inc_head(q);
+ WR4(sc, q->head_off, q->lc.head);
+
+ return (0);
+}
+
+static int
+riscv_iommu_enqueue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q,
+ void *data)
+{
+ void *entry_addr;
+
+ RISCV_IOMMU_LOCK(sc);
+
+ /* Ensure that a space is available. */
+ do {
+ q->lc.head = RD4(sc, q->head_off);
+ } while (riscv_iommu_q_has_space(q) == 0);
+
+ /* Write the command to the current tail entry. */
+ entry_addr = (void *)((uint64_t)q->vaddr + q->lc.tail * q->entry_size);
+ memcpy(entry_addr, data, q->entry_size);
+
+ /* Increment tail index. */
+ q->lc.tail = riscv_iommu_q_inc_tail(q);
+ WR4(sc, q->tail_off, q->lc.tail);
+
+ RISCV_IOMMU_UNLOCK(sc);
+
+ return (0);
+}
+
+static void
+riscv_iommu_sync(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IOFENCE;
+ reg |= FUNC_IOFENCE_FUNC_C | FUNC_IOFENCE_PR | FUNC_IOFENCE_PW;
+ cmd.dword0 = reg;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ /*
+ * FUNC_IOFENCE_WSI does not seem to be implemented in QEMU,
+ * so ensure all requests are processed in polling mode;
+ */
+ do {
+ q->lc.head = RD4(sc, q->head_off);
+ } while (riscv_iommu_q_empty(q) == 0);
+}
+
+static int
+riscv_iommu_inval_ddt(struct riscv_iommu_softc *sc)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IODIR;
+ reg |= FUNC_IODIR_INVAL_DDT;
+ cmd.dword0 = reg;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ return (0);
+}
+
+static int
+riscv_iommu_inval_ddt_did(struct riscv_iommu_softc *sc, int did)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IODIR;
+ reg |= FUNC_IODIR_INVAL_DDT;
+ reg |= FUNC_IODIR_DV;
+ reg |= (uint64_t)did << FUNC_IODIR_DID_S;
+ cmd.dword0 = reg;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ return (0);
+}
+
+/* Invalidate entire address space. */
+static int
+riscv_iommu_inval_vma(struct riscv_iommu_softc *sc)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IOTINVAL;
+ reg |= FUNC_IOTINVAL_VMA;
+ cmd.dword0 = reg;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ return (0);
+}
+
+static int
+riscv_iommu_inval_vma_page(struct riscv_iommu_softc *sc, vm_offset_t addr,
+ int pscid)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IOTINVAL;
+ reg |= FUNC_IOTINVAL_VMA;
+ reg |= FUNC_IOTINVAL_AV;
+ reg |= FUNC_IOTINVAL_PSCV;
+ reg |= pscid << FUNC_IOTINVAL_PSCID_S;
+ cmd.dword0 = reg;
+ cmd.dword1 = PHYS_TO_PPN(addr) << FUNC_IOTINVAL_ADDR_S;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ return (0);
+}
+
+static int
+riscv_iommu_inval_vma_pscid(struct riscv_iommu_softc *sc, int pscid)
+{
+ struct riscv_iommu_command cmd;
+ uint64_t reg;
+
+ bzero(&cmd, sizeof(struct riscv_iommu_command));
+ reg = COMMAND_OPCODE_IOTINVAL;
+ reg |= FUNC_IOTINVAL_VMA;
+ reg |= FUNC_IOTINVAL_PSCV;
+ reg |= pscid << FUNC_IOTINVAL_PSCID_S;
+ cmd.dword0 = reg;
+
+ riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd);
+
+ return (0);
+}
+
+static int
+riscv_iommu_set_mode(struct riscv_iommu_softc *sc)
+{
+ struct riscv_iommu_ddt *ddt;
+ uint64_t reg;
+ uint64_t base;
+
+ reg = RD8(sc, RISCV_IOMMU_DDTP);
+ if (reg & DDTP_BUSY)
+ return (ENXIO);
+
+ ddt = &sc->ddt;
+ base = ddt->base | (sc->iommu_mode << DDTP_IOMMU_MODE_S);
+ WR8(sc, RISCV_IOMMU_DDTP, base);
+
+ reg = RD8(sc, RISCV_IOMMU_DDTP);
+ if (reg != base) {
+ device_printf(sc->dev, "could not set mode\n");
+ return (ENXIO);
+ }
+
+ riscv_iommu_inval_ddt(sc);
+ riscv_iommu_inval_vma(sc);
+
+ return (0);
+}
+
+static int
+riscv_iommu_enable_queue(struct riscv_iommu_softc *sc,
+ struct riscv_iommu_queue *q)
+{
+ uint32_t reg;
+ int timeout;
+
+ if (q == &sc->cq)
+ WR4(sc, QUEUE_TAIL(q), 0);
+ else
+ WR4(sc, QUEUE_HEAD(q), 0);
+
+ reg = CQCSR_CQEN | CQCSR_CIE | CQCSR_CQMF;
+ WR4(sc, q->csr, reg);
+
+ timeout = 1000;
+ do {
+ reg = RD4(sc, RISCV_IOMMU_CQCSR);
+ if ((reg & CQCSR_BUSY) == 0)
+ break;
+ DELAY(10);
+ } while (timeout--);
+
+ if (timeout <= 0) {
+ device_printf(sc->dev, "could not enable command queue\n");
+ return (-1);
+ }
+
+ if ((reg & CQCSR_CQON) == 0) {
+ device_printf(sc->dev, "could not activate command queue\n");
+ return (-1);
+ }
+
+ /* RW1C interrupt pending bit. */
+ WR4(sc, RISCV_IOMMU_IPSR, QUEUE_IPSR(q));
+
+ return (0);
+}
+
+static int
+riscv_iommu_init_queue(struct riscv_iommu_softc *sc,
+ struct riscv_iommu_queue *q, uint64_t base, uint32_t dwords)
+{
+ uint64_t reg;
+ int sz;
+
+ q->entry_size = dwords * 8;
+ sz = (1 << q->size_log2) * q->entry_size;
+
+ /* Set up the command circular buffer */
+ q->vaddr = contigmalloc(sz, M_IOMMU, M_WAITOK | M_ZERO, 0,
+ (1ul << 48) - 1, QUEUE_ALIGN, 0);
+ if (q->vaddr == NULL) {
+ device_printf(sc->dev, "failed to allocate %d bytes\n", sz);
+ return (-1);
+ }
+
+ q->mask = (1 << q->size_log2) - 1;
+ q->head_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQH;
+ q->tail_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQT;
+ q->paddr = vtophys(q->vaddr);
+ q->base = (sc->cq.size_log2 - 1) << CQB_LOG2SZ_1_S;
+ q->base |= PHYS_TO_PPN(q->paddr) << CQB_PPN_S;
+ WR8(sc, base, q->base);
+
+ /* Verify it sticks. */
+ reg = RD8(sc, base);
+ if (reg != q->base) {
+ device_printf(sc->dev, "could not init queue\n");
+ return (ENXIO);
+ }
+
+ return (0);
+}
+
+static int
+riscv_iommu_init_queues(struct riscv_iommu_softc *sc)
+{
+ int error;
+
+ sc->cq.size_log2 = ilog2(CQ_ENTRY_COUNT);
+ sc->fq.size_log2 = ilog2(FQ_ENTRY_COUNT);
+ sc->pq.size_log2 = ilog2(PQ_ENTRY_COUNT);
+
+ sc->cq.csr = RISCV_IOMMU_CQCSR;
+ sc->fq.csr = RISCV_IOMMU_FQCSR;
+ sc->pq.csr = RISCV_IOMMU_PQCSR;
+
+ sc->cq.idx = 0;
+ sc->fq.idx = 1;
+ sc->pq.idx = 3;
+
+ /* Command queue (CQ). */
+ error = riscv_iommu_init_queue(sc, &sc->cq, RISCV_IOMMU_CQB,
+ CQ_ENTRY_DWORDS);
+ if (error)
+ return (error);
+
+ /* Fault queue (FQ). */
+ error = riscv_iommu_init_queue(sc, &sc->fq, RISCV_IOMMU_FQB,
+ FQ_ENTRY_DWORDS);
+ if (error)
+ return (error);
+
+ /* Page request queue (PQ). */
+ error = riscv_iommu_init_queue(sc, &sc->pq, RISCV_IOMMU_PQB,
+ PQ_ENTRY_DWORDS);
+ if (error)
+ return (error);
+
+ error = riscv_iommu_enable_queue(sc, &sc->cq);
+ if (error)
+ return (error);
+
+ error = riscv_iommu_enable_queue(sc, &sc->fq);
+ if (error)
+ return (error);
+
+ error = riscv_iommu_enable_queue(sc, &sc->pq);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+riscv_iommu_init_pagedir(struct riscv_iommu_softc *sc)
+{
+
+ return (0);
+}
+
+static void
+riscv_iommu_print_fault(struct riscv_iommu_softc *sc,
+ struct riscv_iommu_fq_record *rec)
+{
+ struct riscv_iommu_fq_event *ev;
+ uint16_t cause_id;
+ uint16_t ttyp;
+ uint32_t did;
+ uint32_t pid;
+ bool pv, priv;
+ int i;
+
+ cause_id = (rec->hdr & FQR_HDR_CAUSE_M) >> FQR_HDR_CAUSE_S;
+ ttyp = (rec->hdr & FQR_HDR_TTYP_M) >> FQR_HDR_TTYP_S;
+ did = (rec->hdr & FQR_HDR_DID_M) >> FQR_HDR_DID_S;
+ pid = (rec->hdr & FQR_HDR_PID_M) >> FQR_HDR_PID_S;
+ pv = (rec->hdr & FQR_HDR_PV) ? 1 : 0;
+ priv = (rec->hdr & FQR_HDR_PRIV) ? 1 : 0;
+
+ ev = NULL;
+ for (i = 0; fq_events[i].cause_id != 0; i++) {
+ if (fq_events[i].cause_id == cause_id) {
+ ev = &fq_events[i];
+ break;
+ }
+ }
+
+ if (ev == NULL) {
+ device_printf(sc->dev, "Fault: unknown fault 0x%x received\n",
+ cause_id);
+ return;
+ }
+
+ device_printf(sc->dev, "Fault: event 0x%x received: %s\n",
+ ev->cause_id, ev->descr);
+ device_printf(sc->dev, " hdr 0x%lx\n", rec->hdr);
+ device_printf(sc->dev, " iotval 0x%lx\n", rec->iotval);
+ device_printf(sc->dev, " iotval2 0x%lx\n", rec->iotval2);
+ device_printf(sc->dev, " ttyp 0x%x did 0x%x pid 0x%x pv %d priv %d"
+ "\n", ttyp, did, pid, pv, priv);
+}
+
+static int
+riscv_cq_intr(void *arg)
+{
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_queue *q;
+ uint32_t reg;
+
+ sc = arg;
+ q = &sc->cq;
+
+ reg = RD4(sc, q->csr);
+ printf("%s: pending %x\n", __func__, reg);
+
+ /* Clear pending bit. */
+ WR4(sc, RISCV_IOMMU_IPSR, IPSR_CIP);
+
+ return (FILTER_HANDLED);
+}
+
+static int
+riscv_fq_intr(void *arg)
+{
+ struct riscv_iommu_fq_record rec;
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_queue *q;
+ uint32_t reg;
+
+ sc = arg;
+ q = &sc->fq;
+
+ reg = RD4(sc, q->csr);
+ printf("%s: pending %x\n", __func__, reg);
+
+ /* Clear pending bit. */
+ WR4(sc, RISCV_IOMMU_IPSR, IPSR_FIP);
+
+ do {
+ riscv_iommu_dequeue(sc, q, &rec);
+ riscv_iommu_print_fault(sc, &rec);
+ } while (!riscv_iommu_q_empty(q));
+
+ return (FILTER_HANDLED);
+}
+
+static int
+riscv_pm_intr(void *arg)
+{
+ struct riscv_iommu_softc *sc;
+
+ sc = arg;
+
+ printf("%s\n", __func__);
+
+ /* Clear pending bit. */
+ WR4(sc, RISCV_IOMMU_IPSR, IPSR_PMIP);
+
+ return (FILTER_HANDLED);
+}
+
+static int
+riscv_pq_intr(void *arg)
+{
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_queue *q;
+ uint32_t reg;
+
+ sc = arg;
+ q = &sc->pq;
+
+ reg = RD4(sc, q->csr);
+ printf("%s: pending %x\n", __func__, reg);
+
+ /* Clear pending bit. */
+ WR4(sc, RISCV_IOMMU_IPSR, IPSR_PIP);
+
+ return (FILTER_HANDLED);
+}
+
+static int
+riscv_iommu_init_ddt_linear(struct riscv_iommu_softc *sc)
+{
+ struct riscv_iommu_ddt *ddt;
+ uint64_t size;
+ uint64_t reg;
+
+ ddt = &sc->ddt;
+ ddt->num_top_entries = (1 << sc->l0_did_bits);
+
+ size = ddt->num_top_entries * (sc->dc_dwords << 3);
+
+ if (bootverbose)
+ device_printf(sc->dev, "linear ddt size %ld, num_top_entries "
+ "%d\n", size, ddt->num_top_entries);
+
+ ddt->vaddr = contigmalloc(size, M_IOMMU,
+ M_WAITOK | M_ZERO, /* flags */
+ 0, /* low */
+ (1ul << 48) - 1, /* high */
+ size, /* alignment */
+ 0); /* boundary */
+ if (ddt->vaddr == NULL) {
+ device_printf(sc->dev, "failed to allocate ddt\n");
+ return (ENXIO);
+ }
+
+ reg = vtophys(ddt->vaddr);
+ if (bootverbose)
+ device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr,
+ size);
+ ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S;
+
+ return (0);
+}
+
+static int
+riscv_iommu_init_ddt_2lvl(struct riscv_iommu_softc *sc)
+{
+ struct riscv_iommu_ddt *ddt;
+ uint64_t size;
+ uint64_t reg;
+ uint64_t sz;
+
+ ddt = &sc->ddt;
+ ddt->num_top_entries = (1 << DDT_L1_DID_BITS);
+
+ size = ddt->num_top_entries * (DDT_NON_LEAF_DWORDS << 3);
+
+ if (bootverbose)
+ device_printf(sc->dev, "%s: size %lu, l1 entries %d, size "
+ "%lu\n", __func__, size, ddt->num_top_entries, size);
+
+ ddt->vaddr = contigmalloc(size, M_IOMMU,
+ M_WAITOK | M_ZERO, /* flags */
+ 0, /* low */
+ (1ul << 48) - 1, /* high */
+ size, /* alignment */
+ 0); /* boundary */
+ if (ddt->vaddr == NULL) {
+ device_printf(sc->dev, "Failed to allocate 2lvl ddt.\n");
+ return (ENOMEM);
+ }
+
+ sz = ddt->num_top_entries * sizeof(struct l1_desc);
+ ddt->l1 = malloc(sz, M_IOMMU, M_WAITOK | M_ZERO);
+
+ reg = vtophys(ddt->vaddr);
+ if (bootverbose)
+ device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr,
+ size);
+ ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S;
+
+ return (0);
+}
+
+static int
+riscv_iommu_init_l0_directory(struct riscv_iommu_softc *sc, int sid)
+{
+ struct riscv_iommu_ddt *ddt;
+ struct l1_desc *l1_desc;
+ uint64_t *l1e;
+ uint64_t val;
+ size_t size;
+ int i;
+
+ ddt = &sc->ddt;
+ l1_desc = &ddt->l1[sid >> sc->l0_did_bits];
+ if (l1_desc->va) {
+ /* Already allocated. */
+ return (0);
+ }
+
+ size = (1 << sc->l0_did_bits) * (sc->dc_dwords << 3);
+
+ l1_desc->va = contigmalloc(size, M_IOMMU,
+ M_WAITOK | M_ZERO, /* flags */
+ 0, /* low */
+ (1ul << 48) - 1, /* high */
+ size, /* alignment */
+ 0); /* boundary */
+ if (l1_desc->va == NULL) {
+ device_printf(sc->dev, "failed to allocate l0 directory\n");
+ return (ENXIO);
+ }
+
+ l1_desc->pa = vtophys(l1_desc->va);
+
+ i = sid >> sc->l0_did_bits;
+ l1e = (void *)((uint64_t)ddt->vaddr + DDT_NON_LEAF_DWORDS * 8 * i);
+
+ /* Install the L1 entry. */
+ val = PHYS_TO_PPN(l1_desc->pa) << DC_NON_LEAF_ENTRY_PPN_S;
+ val |= DC_NON_LEAF_ENTRY_VALID;
+ *l1e = val;
+
+ return (0);
+}
+
+static void *
+riscv_iommu_get_dc_addr(struct riscv_iommu_softc *sc, int did)
+{
+ struct riscv_iommu_ddt *ddt;
+ struct l1_desc *l1_desc;
+ uintptr_t l0_base;
+ void *addr;
+ int l0_offs;
+ int l1_idx;
+
+ ddt = &sc->ddt;
+
+ l0_offs = sc->dc_dwords * 8 * (did & ((1 << sc->l0_did_bits) - 1));
+
+ if (sc->iommu_mode == DDTP_IOMMU_MODE_2LVL) {
+ l1_idx = (did >> sc->l0_did_bits) &
+ ((1 << DDT_L1_DID_BITS) - 1);
+ l1_desc = &ddt->l1[l1_idx];
+ l0_base = (uintptr_t)l1_desc->va;
+ } else
+ l0_base = (uintptr_t)ddt->vaddr;
+
+ addr = (void *)(l0_base + l0_offs);
+
+ dprintf("ddt vaddr %p addr %p\n", ddt->vaddr, addr);
+
+ return (addr);
+}
+
+static int
+riscv_iommu_init_dc(struct riscv_iommu_softc *sc,
+ struct riscv_iommu_domain *domain, int did, bool bypass)
+{
+ struct riscv_iommu_dc_base *dc_base;
+ struct riscv_iommu_dc *dc;
+ struct riscv_iommu_pmap *p;
+
+ dc = riscv_iommu_get_dc_addr(sc, did);
+ dc_base = &dc->base;
+
+ device_printf(sc->dev, "address translation for device id"
+ " 0x%x is %s.\n", did, bypass ? "bypassed" : "enabled");
+
+ p = &domain->p;
+
+ bzero(dc_base, sizeof(struct riscv_iommu_dc_base));
+ if (bypass == false)
+ dc_base->fsc = p->pm_satp;
+ dc_base->ta = (domain->pscid << DC_TA_PSCID_S) | DC_TA_V;
+
+ riscv_iommu_inval_ddt_did(sc, did);
+ riscv_iommu_sync(sc, &sc->cq);
+ dc_base->tc |= DC_TC_V;
+ riscv_iommu_inval_ddt_did(sc, did);
+ riscv_iommu_inval_vma(sc);
+ riscv_iommu_sync(sc, &sc->cq);
+
+ return (0);
+}
+
+static void
+riscv_iommu_deinit_dc(struct riscv_iommu_softc *sc, int did)
+{
+ struct riscv_iommu_dc_base *dc_base;
+ struct riscv_iommu_dc *dc;
+
+ dc = riscv_iommu_get_dc_addr(sc, did);
+ dc_base = &dc->base;
+ dc_base->tc &= ~DC_TC_V;
+
+ riscv_iommu_inval_ddt_did(sc, did);
+ riscv_iommu_sync(sc, &sc->cq);
+}
+
+static int
+riscv_iommu_setup_interrupts(struct riscv_iommu_softc *sc)
+{
+ device_t dev;
+ int error;
+
+ dev = sc->dev;
+
+ if (sc->res[1] == NULL || sc->res[2] == NULL ||
+ sc->res[3] == NULL || sc->res[4] == NULL) {
+ device_printf(dev, "Warning: no interrupt resources "
+ "provided.\n");
+ return (ENXIO);
+ }
+
+ error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC,
+ riscv_cq_intr, NULL, sc, &sc->intr_cookie[0]);
+ if (error) {
+ device_printf(dev, "Couldn't setup cq interrupt handler\n");
+ return (ENXIO);
+ }
+
+ error = bus_setup_intr(dev, sc->res[2], INTR_TYPE_MISC,
+ riscv_fq_intr, NULL, sc, &sc->intr_cookie[1]);
+ if (error) {
+ device_printf(dev, "Couldn't setup fq interrupt handler\n");
+ return (ENXIO);
+ }
+
+ error = bus_setup_intr(dev, sc->res[3], INTR_TYPE_MISC,
+ riscv_pm_intr, NULL, sc, &sc->intr_cookie[2]);
+ if (error) {
+ device_printf(dev, "Couldn't setup pm interrupt handler\n");
+ return (ENXIO);
+ }
+
+ error = bus_setup_intr(dev, sc->res[4], INTR_TYPE_MISC,
+ riscv_pq_intr, NULL, sc, &sc->intr_cookie[3]);
+ if (error) {
+ device_printf(dev, "Couldn't setup pq interrupt handler\n");
+ return (ENXIO);
+ }
+
+ WR8(sc, RISCV_IOMMU_ICVEC, 0 << 0 | 1 << 4 | 2 << 8 | 3 << 12);
+
+ return (0);
+}
+
+int
+riscv_iommu_attach(device_t dev)
+{
+ struct riscv_iommu_softc *sc;
+ uint64_t caps;
+ int error;
+
+ sc = device_get_softc(dev);
+
+ caps = bus_read_8(sc->res[0], RISCV_IOMMU_CAPABILITIES);
+ if (bootverbose)
+ device_printf(sc->dev, "IOMMU Capabilities: %lx\n", caps);
+
+ device_printf(sc->dev, "Device-Context structure is %s.\n",
+ caps & CAPABILITIES_MSI_FLAT ?
+ "64-bytes (ext format)" : "32-bytes (std format)");
+
+ if (caps & CAPABILITIES_MSI_FLAT) {
+ sc->dc_dwords = DDT_DC_EXT_DWORDS;
+ sc->l0_did_bits = 6;
+ } else {
+ sc->dc_dwords = DDT_DC_STD_DWORDS;
+ sc->l0_did_bits = 7;
+ }
+
+ if (caps & CAPABILITIES_SV48)
+ sc->pm_mode = PMAP_MODE_SV48;
+ else if (caps & CAPABILITIES_SV39)
+ sc->pm_mode = PMAP_MODE_SV39;
+ else {
+ device_printf(sc->dev, "Unsupported virtual memory system\n");
+ return (ENXIO);
+ }
+
+ mtx_init(&sc->mtx, device_get_nameunit(sc->dev), "riscv_iommu",
+ MTX_DEF);
+
+ WR4(sc, RISCV_IOMMU_FCTL, FCTL_WSI);
+
+ error = riscv_iommu_setup_interrupts(sc);
+ if (error) {
+ device_printf(sc->dev, "Could not setup interrupts. "
+ "Continuing with no interrupts support.");
+ }
+
+ error = riscv_iommu_init_pagedir(sc);
+ if (error)
+ return (error);
+
+ error = riscv_iommu_init_queues(sc);
+ if (error)
+ return (error);
+
+ sc->iommu_mode = DDTP_IOMMU_MODE_2LVL;
+
+ switch (sc->iommu_mode) {
+ case DDTP_IOMMU_MODE_1LVL:
+ error = riscv_iommu_init_ddt_linear(sc);
+ break;
+ case DDTP_IOMMU_MODE_2LVL:
+ error = riscv_iommu_init_ddt_2lvl(sc);
+ break;
+ default:
+ error = ENXIO;
+ }
+ if (error)
+ return (error);
+
+ sc->pscid_bits = 8;
+
+ riscv_iommu_init_pscids(sc);
+ if (error)
+ return (error);
+
+ error = riscv_iommu_set_mode(sc);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+riscv_iommu_set_buswide(device_t dev, struct riscv_iommu_domain *domain,
+ struct riscv_iommu_ctx *ctx)
+{
+ struct riscv_iommu_softc *sc;
+ int i;
+
+ sc = device_get_softc(dev);
+
+ printf("%s\n", __func__);
+
+ for (i = 0; i < PCI_SLOTMAX; i++)
+ riscv_iommu_init_dc(sc, domain, (ctx->did | i),
+ ctx->bypass);
+
+ return (0);
+}
+
+static int
+riscv_iommu_pci_get_did(device_t child, uintptr_t *xref0, u_int *did0)
+{
+ struct pci_id_ofw_iommu pi;
+ int err;
+
+ dprintf("%s\n", __func__);
+
+ err = pci_get_id(child, PCI_ID_OFW_IOMMU, (uintptr_t *)&pi);
+ if (err == 0) {
+ if (did0)
+ *did0 = pi.id;
+ if (xref0)
+ *xref0 = pi.xref;
+ }
+
+ return (err);
+}
+
+static int
+riscv_iommu_find(device_t dev, device_t child)
+{
+ struct riscv_iommu_softc *sc;
+ uintptr_t xref;
+ int err;
+
+ dprintf("%s\n", __func__);
+
+ sc = device_get_softc(dev);
+
+ err = riscv_iommu_pci_get_did(child, &xref, NULL);
+ if (err)
+ return (ENOENT);
+
+ /* Check if xref is ours. */
+ dprintf("xref %lx sc->xref %lx\n", xref, sc->xref);
+ if (xref != sc->xref)
+ return (EFAULT);
+
+ return (0);
+}
+
+struct riscv_iommu_ctx *
+riscv_iommu_ctx_lookup_by_did(device_t dev, u_int did)
+{
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_unit *unit;
+ struct riscv_iommu_ctx *ctx;
+
+ dprintf("%s\n", __func__);
+ sc = device_get_softc(dev);
+
+ unit = &sc->unit;
+
+ LIST_FOREACH(domain, &unit->domain_list, next) {
+ LIST_FOREACH(ctx, &domain->ctx_list, next) {
+ if (ctx->did == did) {
+ refcount_acquire(&ctx->refcnt);
+ return (ctx);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+static struct iommu_ctx *
+riscv_iommu_ctx_lookup(device_t dev, device_t child)
+{
+ struct iommu_unit *iommu __diagused;
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_unit *unit;
+ struct riscv_iommu_ctx *ctx;
+
+ dprintf("%s\n", __func__);
+ sc = device_get_softc(dev);
+
+ unit = &sc->unit;
+ iommu = &unit->iommu;
+
+ IOMMU_ASSERT_LOCKED(iommu);
+
+ LIST_FOREACH(domain, &unit->domain_list, next) {
+ IOMMU_DOMAIN_LOCK(&domain->iodom);
+ LIST_FOREACH(ctx, &domain->ctx_list, next) {
+ if (ctx->dev == child) {
+ refcount_acquire(&ctx->refcnt);
+ IOMMU_DOMAIN_UNLOCK(&domain->iodom);
+ return (&ctx->ioctx);
+ }
+ }
+ IOMMU_DOMAIN_UNLOCK(&domain->iodom);
+ }
+
+ return (NULL);
+}
+
+static int
+riscv_iommu_unmap(device_t dev, struct iommu_domain *iodom,
+ vm_offset_t va, bus_size_t size)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_softc *sc;
+ int err;
+ int i;
+
+ sc = device_get_softc(dev);
+
+ domain = (struct riscv_iommu_domain *)iodom;
+
+ err = 0;
+
+ dprintf("%s: %lx, %ld, domain %d\n", __func__, va, size, domain->pscid);
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ if (iommu_pmap_remove(&domain->p, va) == 0) {
+ /* pmap entry removed, invalidate TLB. */
+ riscv_iommu_inval_vma_page(sc, va, domain->pscid);
+ } else {
+ err = ENOENT;
+ break;
+ }
+ va += PAGE_SIZE;
+ }
+
+ riscv_iommu_sync(sc, &sc->cq);
+
+ return (err);
+}
+
+static int
+riscv_iommu_map(device_t dev, struct iommu_domain *iodom,
+ vm_offset_t va, vm_page_t *ma, vm_size_t size,
+ vm_prot_t prot)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_softc *sc;
+ vm_paddr_t pa;
+ int error;
+ int i;
+
+ sc = device_get_softc(dev);
+
+ domain = (struct riscv_iommu_domain *)iodom;
+
+ for (i = 0; size > 0; size -= PAGE_SIZE) {
+ pa = VM_PAGE_TO_PHYS(ma[i++]);
+ dprintf("%s: %lx -> %lx, %ld, domain %d\n", __func__, va, pa,
+ size, domain->pscid);
+ error = iommu_pmap_enter(&domain->p, va, pa, prot, 0);
+ if (error)
+ return (error);
+ riscv_iommu_inval_vma_page(sc, va, domain->pscid);
+ va += PAGE_SIZE;
+ }
+
+ riscv_iommu_sync(sc, &sc->cq);
+
+ return (0);
+}
+
+static struct iommu_domain *
+riscv_iommu_domain_alloc(device_t dev, struct iommu_unit *iommu)
+{
+ struct iommu_domain *iodom;
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_unit *unit;
+ struct riscv_iommu_softc *sc;
+ int new_pscid;
+ int va_bits;
+ int error;
+
+ sc = device_get_softc(dev);
+
+ dprintf("%s\n", __func__);
+
+ unit = (struct riscv_iommu_unit *)iommu;
+
+ error = riscv_iommu_pscid_alloc(sc, &new_pscid);
+ if (error) {
+ device_printf(sc->dev,
+ "Could not allocate PSCID for a new domain.\n");
+ return (NULL);
+ }
+
+ domain = malloc(sizeof(*domain), M_IOMMU, M_WAITOK | M_ZERO);
+ domain->pscid = (uint16_t)new_pscid;
+
+ iommu_pmap_pinit(&domain->p, sc->pm_mode);
+
+ riscv_iommu_inval_vma_pscid(sc, domain->pscid);
+
+ LIST_INIT(&domain->ctx_list);
+
+ IOMMU_LOCK(iommu);
+ LIST_INSERT_HEAD(&unit->domain_list, domain, next);
+ IOMMU_UNLOCK(iommu);
+
+ iodom = &domain->iodom;
+
+ va_bits = sc->pm_mode == PMAP_MODE_SV48 ? 48 : 39;
+
+ /* Avoid sign-extension. */
+ va_bits -= 1;
+
+ iodom->end = (1ULL << va_bits) - 1;
+
+ return (iodom);
+}
+
+static void
+riscv_iommu_domain_free(device_t dev, struct iommu_domain *iodom)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ dprintf("%s\n", __func__);
+
+ domain = (struct riscv_iommu_domain *)iodom;
+
+ LIST_REMOVE(domain, next);
+
+ iommu_pmap_remove_pages(&domain->p);
+ iommu_pmap_release(&domain->p);
+
+ riscv_iommu_inval_vma_pscid(sc, domain->pscid);
+ riscv_iommu_pscid_free(sc, domain->pscid);
+
+ free(domain, M_IOMMU);
+}
+
+static struct iommu_ctx *
+riscv_iommu_ctx_alloc(device_t dev, struct iommu_domain *iodom, device_t child,
+ bool disabled)
+{
+ struct riscv_iommu_domain *domain;
+ struct riscv_iommu_ctx *ctx;
+
+ dprintf("%s\n", __func__);
+
+ domain = (struct riscv_iommu_domain *)iodom;
+
+ ctx = malloc(sizeof(struct riscv_iommu_ctx), M_IOMMU,
+ M_WAITOK | M_ZERO);
+ ctx->dev = child;
+ ctx->domain = domain;
+ refcount_init(&ctx->refcnt, 1);
+ if (disabled)
+ ctx->bypass = true;
+
+ IOMMU_DOMAIN_LOCK(iodom);
+ LIST_INSERT_HEAD(&domain->ctx_list, ctx, next);
+ IOMMU_DOMAIN_UNLOCK(iodom);
+
+ return (&ctx->ioctx);
+}
+
+static int
+riscv_iommu_ctx_init(device_t dev, struct iommu_ctx *ioctx)
+{
+ struct riscv_iommu_domain *domain;
+ struct iommu_domain *iodom;
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_ctx *ctx;
+ devclass_t pci_class;
+ u_int did;
+ int error;
+
+ ctx = (struct riscv_iommu_ctx *)ioctx;
+
+ dprintf("%s\n", __func__);
+
+ sc = device_get_softc(dev);
+
+ domain = ctx->domain;
+ iodom = (struct iommu_domain *)domain;
+
+ pci_class = devclass_find("pci");
+ if (device_get_devclass(device_get_parent(ctx->dev)) == pci_class) {
+ error = riscv_iommu_pci_get_did(ctx->dev, NULL, &did);
+ if (error)
+ return (error);
+
+ ioctx->rid = pci_get_rid(dev);
+ ctx->did = did;
+ ctx->vendor = pci_get_vendor(ctx->dev);
+ ctx->device = pci_get_device(ctx->dev);
+ }
+
+ if (sc->iommu_mode == DDTP_IOMMU_MODE_2LVL) {
+ error = riscv_iommu_init_l0_directory(sc, ctx->did);
+ if (error)
+ return (error);
+ }
+ riscv_iommu_init_dc(sc, domain, ctx->did, ctx->bypass);
+
+ if (device_get_devclass(device_get_parent(ctx->dev)) == pci_class)
+ if (iommu_is_buswide_ctx(iodom->iommu, pci_get_bus(ctx->dev)))
+ riscv_iommu_set_buswide(dev, domain, ctx);
+
+ return (0);
+}
+
+static bool
+riscv_iommu_ctx_free(device_t dev, struct iommu_ctx *ioctx)
+{
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_ctx *ctx;
+
+ dprintf("%s\n", __func__);
+
+ IOMMU_ASSERT_LOCKED(ioctx->domain->iommu);
+
+ sc = device_get_softc(dev);
+
+ ctx = (struct riscv_iommu_ctx *)ioctx;
+ if (refcount_release(&ctx->refcnt)) {
+ riscv_iommu_deinit_dc(sc, ctx->did);
+ LIST_REMOVE(ctx, next);
+ free(ctx, M_IOMMU);
+ return (true);
+ }
+
+ return (false);
+}
+
+#ifdef FDT
+static int
+riscv_iommu_ofw_md_data(device_t dev, struct iommu_ctx *ioctx, pcell_t *cells,
+ int ncells)
+{
+ struct riscv_iommu_ctx *ctx;
+
+ printf("%s\n", __func__);
+ ctx = (struct riscv_iommu_ctx *)ioctx;
+
+ if (ncells != 1)
+ return (-1);
+
+ ctx->did = cells[0];
+
+ return (0);
+}
+#endif
+
+static int
+riscv_iommu_read_ivar(device_t dev, device_t child, int which,
+ uintptr_t *result)
+{
+ struct riscv_iommu_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ device_printf(sc->dev, "%s\n", __func__);
+
+ return (ENOENT);
+}
+
+static device_method_t riscv_iommu_methods[] = {
+ /* IOMMU interface */
+ DEVMETHOD(iommu_find, riscv_iommu_find),
+ DEVMETHOD(iommu_map, riscv_iommu_map),
+ DEVMETHOD(iommu_unmap, riscv_iommu_unmap),
+ DEVMETHOD(iommu_domain_alloc, riscv_iommu_domain_alloc),
+ DEVMETHOD(iommu_domain_free, riscv_iommu_domain_free),
+ DEVMETHOD(iommu_ctx_alloc, riscv_iommu_ctx_alloc),
+ DEVMETHOD(iommu_ctx_init, riscv_iommu_ctx_init),
+ DEVMETHOD(iommu_ctx_free, riscv_iommu_ctx_free),
+ DEVMETHOD(iommu_ctx_lookup, riscv_iommu_ctx_lookup),
+#ifdef FDT
+ DEVMETHOD(iommu_ofw_md_data, riscv_iommu_ofw_md_data),
+#endif
+
+ /* Bus interface */
+ DEVMETHOD(bus_read_ivar, riscv_iommu_read_ivar),
+
+ /* End */
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_0(riscv_iommu, riscv_iommu_driver, riscv_iommu_methods,
+ sizeof(struct riscv_iommu_softc));
diff --git a/sys/riscv/iommu/iommu.h b/sys/riscv/iommu/iommu.h
new file mode 100644
index 00000000000..502a0e121bd
--- /dev/null
+++ b/sys/riscv/iommu/iommu.h
@@ -0,0 +1,359 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _RISCV_IOMMU_IOMMU_H_
+#define _RISCV_IOMMU_IOMMU_H_
+
+#define RISCV_IOMMU_CAPABILITIES 0x0000
+#define CAPABILITIES_VERSION_S 0
+#define CAPABILITIES_VERSION_M (0xff << CAPABILITIES_VERSION_S)
+#define CAPABILITIES_SV32 (1 << 8)
+#define CAPABILITIES_SV39 (1 << 9)
+#define CAPABILITIES_SV48 (1 << 10)
+#define CAPABILITIES_SV57 (1 << 11)
+#define CAPABILITIES_SVPBMT (1 << 15)
+#define CAPABILITIES_SV32X4 (1 << 16)
+#define CAPABILITIES_SV39X4 (1 << 17)
+#define CAPABILITIES_SV48X4 (1 << 18)
+#define CAPABILITIES_SV57X4 (1 << 19)
+#define CAPABILITIES_AMO_MRIF (1 << 21)
+#define CAPABILITIES_MSI_FLAT (1 << 22)
+#define CAPABILITIES_MSI_MRIF (1 << 23)
+#define CAPABILITIES_AMO_HWAD (1 << 24)
+#define CAPABILITIES_ATS (1 << 25)
+#define CAPABILITIES_T2GPA (1 << 26)
+#define CAPABILITIES_END (1 << 27)
+#define CAPABILITIES_IGS_S 28
+#define CAPABILITIES_IGS_M (0x3 << CAPABILITIES_IGS_S)
+#define CAPABILITIES_HPM (1 << 30)
+#define CAPABILITIES_DBG (1 << 31)
+#define CAPABILITIES_PAS_S 32ULL
+#define CAPABILITIES_PAS_M (0x3f << CAPABILITIES_PAS_S)
+#define CAPABILITIES_PD8 (1ULL << 38)
+#define CAPABILITIES_PD17 (1ULL << 39)
+#define CAPABILITIES_PD20 (1ULL << 40)
+#define RISCV_IOMMU_FCTL 0x0008
+#define FCTL_BE (1 << 0) /* Big-endian */
+#define FCTL_WSI (1 << 1) /* Wire-signalled Ints. */
+#define FCTL_GXL (1 << 2) /* Guest physical addresses */
+#define RISCV_IOMMU_DDTP 0x0010
+#define DDTP_IOMMU_MODE_S 0
+#define DDTP_IOMMU_MODE_OFF (0 << DDTP_IOMMU_MODE_S)
+#define DDTP_IOMMU_MODE_BARE (1 << DDTP_IOMMU_MODE_S)
+#define DDTP_IOMMU_MODE_1LVL (2 << DDTP_IOMMU_MODE_S)
+#define DDTP_IOMMU_MODE_2LVL (3 << DDTP_IOMMU_MODE_S)
+#define DDTP_IOMMU_MODE_3LVL (4 << DDTP_IOMMU_MODE_S)
+#define DDTP_BUSY (1 << 4)
+#define DDTP_PPN_S 10
+#define DDTP_PPN_M (0xfffffffffffULL << DDTP_PPN_S)
+#define RISCV_IOMMU_CQB 0x18 /* Command queue base. */
+#define CQB_LOG2SZ_1_S 0
+#define CQB_LOG2SZ_1_M (0x3f << CQB_LOG2SZ_1_S)
+#define CQB_PPN_S 10
+#define CQB_PPN_M (0xfffffffffffULL << CQB_PPN_S)
+#define RISCV_IOMMU_CQH 0x20
+#define RISCV_IOMMU_CQT 0x24
+#define RISCV_IOMMU_FQB 0x28 /* Fault queue base. */
+#define RISCV_IOMMU_FQH 0x30
+#define RISCV_IOMMU_FQT 0x34
+#define RISCV_IOMMU_PQB 0x38 /* Page queue base. */
+#define RISCV_IOMMU_PQH 0x40
+#define RISCV_IOMMU_PQT 0x44
+#define RISCV_IOMMU_CQCSR 0x48
+#define CQCSR_BUSY (1 << 17) /* Write is observed */
+#define CQCSR_CQON (1 << 16) /* Active */
+#define CQCSR_FENCE_W_IP (1 << 11) /* iofence.c completed */
+#define CQCSR_CMD_ILL (1 << 10) /* Illegal command */
+#define CQCSR_CMD_TO (1 << 9) /* Timeout */
+#define CQCSR_CQMF (1 << 8) /* Memory Fault */
+#define CQCSR_CIE (1 << 1) /* Interrupt Enable */
+#define CQCSR_CQEN (1 << 0) /* Enable */
+#define RISCV_IOMMU_FQCSR 0x4C
+#define FQCSR_BUSY (1 << 17) /* Write is observed */
+#define FQCSR_FQON (1 << 16) /* Active */
+#define FQCSR_FQOF (1 << 9) /* Overflow */
+#define FQCSR_FQMF (1 << 8) /* Memory Fault */
+#define FQCSR_FIE (1 << 1) /* Interrupt Enable */
+#define FQCSR_FQEN (1 << 0) /* Enable */
+#define RISCV_IOMMU_PQCSR 0x50
+#define PQCSR_BUSY (1 << 17) /* Write is observed */
+#define PQCSR_PQON (1 << 16) /* Active */
+#define PQCSR_PQOF (1 << 9) /* Overflow */
+#define PQCSR_PQMF (1 << 8) /* Memory Fault */
+#define PQCSR_PIE (1 << 1) /* Interrupt Enable */
+#define PQCSR_PQEN (1 << 0) /* Enable */
+#define RISCV_IOMMU_IPSR 0x54
+#define IPSR_CIP (1 << 0) /* Command queue interrupt pending */
+#define IPSR_FIP (1 << 1) /* Fault queue interrupt pending */
+#define IPSR_PMIP (1 << 2) /* Performance monitoring int pend */
+#define IPSR_PIP (1 << 3) /* Page queue interrupt pending */
+
+#define RISCV_IOMMU_IOCOUNTOVF 0x0058
+#define RISCV_IOMMU_IOCOUNTINH 0x005C
+
+#define RISCV_IOMMU_IOHPMCYCLES 0x0060
+#define RISCV_IOMMU_IOHPMCTR_BASE 0x0068
+#define RISCV_IOMMU_IOHPMCTR(_n) (RISCV_IOMMU_IOHPMCTR_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_IOHPMEVT_BASE 0x0160
+#define RISCV_IOMMU_IOHPMEVT(_n) (RISCV_IOMMU_IOHPMEVT_BASE + ((_n) * 0x8))
+#define RISCV_IOMMU_TR_REQ_IOVA 0x0258
+#define RISCV_IOMMU_TR_REQ_CTL 0x0260
+#define RISCV_IOMMU_TR_RESPONSE 0x0268
+#define RISCV_IOMMU_ICVEC 0x02F8
+
+#define RISCV_IOMMU_LOCK(_sc) mtx_lock(&(_sc)->mtx)
+#define RISCV_IOMMU_UNLOCK(_sc) mtx_unlock(&(_sc)->mtx)
+
+DECLARE_CLASS(riscv_iommu_driver);
+
+MALLOC_DECLARE(M_IOMMU);
+
+struct riscv_iommu_unit {
+ struct iommu_unit iommu;
+ LIST_HEAD(, riscv_iommu_domain) domain_list;
+ LIST_ENTRY(riscv_iommu_unit) next;
+ device_t dev;
+ intptr_t xref;
+};
+
+struct riscv_iommu_domain {
+ struct iommu_domain iodom;
+ LIST_HEAD(, riscv_iommu_ctx) ctx_list;
+ LIST_ENTRY(riscv_iommu_domain) next;
+ u_int entries_cnt;
+ struct riscv_iommu_cd *cd;
+ struct riscv_iommu_pmap p;
+ uint16_t pscid;
+};
+
+struct riscv_iommu_ctx {
+ struct iommu_ctx ioctx;
+ struct riscv_iommu_domain *domain;
+ LIST_ENTRY(riscv_iommu_ctx) next;
+ device_t dev;
+ bool bypass;
+ int did;
+ uint16_t vendor;
+ uint16_t device;
+ u_int refcnt;
+};
+
+struct riscv_iommu_queue_local_copy {
+ union {
+ uint64_t val;
+ struct {
+ uint32_t head;
+ uint32_t tail;
+ };
+ };
+};
+
+struct riscv_iommu_queue {
+ struct riscv_iommu_queue_local_copy lc;
+ vm_paddr_t paddr;
+ void *vaddr;
+ uint64_t mask;
+ uint32_t head_off;
+ uint32_t tail_off;
+ int size_log2;
+ uint64_t base;
+ uint64_t csr;
+ int idx;
+ uint8_t entry_size;
+};
+
+struct l1_desc {
+ uint8_t span;
+ void *va;
+ vm_paddr_t pa;
+};
+
+/* Base-format device-context. */
+struct riscv_iommu_dc_base {
+ uint64_t tc; /* Translation control */
+#define DC_TC_V (1 << 0)
+#define DC_TC_EN_ATS (1 << 1)
+#define DC_TC_EN_PRI (1 << 2)
+#define DC_TC_T2GPA (1 << 3)
+#define DC_TC_DTF (1 << 4)
+#define DC_TC_PDTV (1 << 5)
+#define DC_TC_PRPR (1 << 6)
+#define DC_TC_GADE (1 << 7)
+#define DC_TC_SADE (1 << 8)
+#define DC_TC_DPE (1 << 9)
+#define DC_TC_SBE (1 << 10)
+#define DC_TC_SXL (1 << 11)
+ uint64_t iohgatp; /* IO Hyp guest address translation */
+ uint64_t ta; /* Translation attributes */
+#define DC_TA_V (1 << 0)
+#define DC_TA_ENS (1 << 1)
+#define DC_TA_SUM (1 << 2)
+#define DC_TA_PSCID_S 12
+#define DC_TA_PSCID_M (0xfffff << DC_TA_PSCID_S)
+ uint64_t fsc; /* First-stage-context */
+};
+
+/* Extended-format device-context. */
+struct riscv_iommu_dc {
+ struct riscv_iommu_dc_base base;
+ uint64_t msiptp; /* MSI page table pointer */
+ uint64_t msi_addr_mask;
+ uint64_t msi_addr_pattern;
+ uint64_t _reserved;
+};
+
+#define DC_NON_LEAF_ENTRY_PPN_S 10
+#define DC_NON_LEAF_ENTRY_VALID (1 << 0)
+
+struct riscv_iommu_ddt {
+ void *vaddr;
+ uint64_t base;
+ uint32_t base_cfg;
+ uint32_t num_top_entries;
+ struct l1_desc *l1;
+ struct riscv_iommu_dc *dc;
+};
+
+struct riscv_iommu_softc {
+ device_t dev;
+ intptr_t xref;
+ struct riscv_iommu_unit unit;
+ struct resource *res[5];
+ void *intr_cookie[4];
+ struct riscv_iommu_queue cq;
+ struct riscv_iommu_queue fq;
+ struct riscv_iommu_queue pq;
+ struct riscv_iommu_ddt ddt;
+ struct mtx mtx;
+ uint32_t l0_did_bits;
+ uint32_t dc_dwords;
+
+ /* PSCID management. */
+ bitstr_t *pscid_set;
+ int pscid_set_size;
+ struct mtx pscid_set_mutex;
+ uint32_t pscid_bits;
+
+ enum pmap_mode pm_mode;
+ int iommu_mode;
+};
+
+/*
+ * Command queue request.
+ */
+struct riscv_iommu_command {
+ uint64_t dword0;
+ uint64_t dword1;
+};
+
+enum riscv_iommu_fq_causes {
+ FQ_CAUSE_INST_FAULT = 1,
+ FQ_CAUSE_RD_ADDR_MISALIGNED = 4,
+ FQ_CAUSE_RD_FAULT = 5,
+ FQ_CAUSE_WR_ADDR_MISALIGNED = 6,
+ FQ_CAUSE_WR_FAULT = 7,
+ FQ_CAUSE_INST_FAULT_S = 12,
+ FQ_CAUSE_RD_FAULT_S = 13,
+ FQ_CAUSE_WR_FAULT_S = 15,
+ FQ_CAUSE_INST_FAULT_VS = 20,
+ FQ_CAUSE_RD_FAULT_VS = 21,
+ FQ_CAUSE_WR_FAULT_VS = 23,
+ FQ_CAUSE_DMA_DISABLED = 256,
+ FQ_CAUSE_DDT_LOAD_FAULT = 257,
+ FQ_CAUSE_DDT_INVALID = 258,
+ FQ_CAUSE_DDT_MISCONFIGURED = 259,
+ FQ_CAUSE_TR_TYPE_DISALLOWED = 260,
+ FQ_CAUSE_MSI_LOAD_FAULT = 261,
+ FQ_CAUSE_MSI_INVALID = 262,
+ FQ_CAUSE_MSI_MISCONFIGURED = 263,
+ FQ_CAUSE_MRIF_FAULT = 264,
+ FQ_CAUSE_PDT_LOAD_FAULT = 265,
+ FQ_CAUSE_PDT_INVALID = 266,
+ FQ_CAUSE_PDT_MISCONFIGURED = 267,
+ FQ_CAUSE_DDT_CORRUPTED = 268,
+ FQ_CAUSE_PDT_CORRUPTED = 269,
+ FQ_CAUSE_MSI_PT_CORRUPTED = 270,
+ FQ_CAUSE_MRIF_CORRUPTED = 271,
+ FQ_CAUSE_INTERNAL_DP_ERROR = 272,
+ FQ_CAUSE_MSI_WR_FAULT = 273,
+ FQ_CAUSE_PT_CORRUPTED = 274,
+};
+
+/*
+ * Fault queue record.
+ */
+struct riscv_iommu_fq_record {
+ uint64_t hdr;
+#define FQR_HDR_CAUSE_S 0
+#define FQR_HDR_CAUSE_M (0xfff << FQR_HDR_CAUSE_S)
+#define FQR_HDR_PID_S 12
+#define FQR_HDR_PID_M (0xfffffULL << FQR_HDR_PID_S)
+#define FQR_HDR_PV (1ULL << 32)
+#define FQR_HDR_PRIV (1ULL << 33)
+#define FQR_HDR_TTYP_S 34ULL
+#define FQR_HDR_TTYP_M (0x3fULL << FQR_HDR_TTYP_S)
+#define FQR_HDR_DID_S 40ULL
+#define FQR_HDR_DID_M (0xffffffULL << FQR_HDR_DID_S)
+ uint32_t custom;
+ uint32_t reserved;
+ uint64_t iotval;
+ uint64_t iotval2;
+};
+
+#define COMMAND_OPCODE_S 0
+#define COMMAND_OPCODE_IOTINVAL (1 << COMMAND_OPCODE_S)
+#define COMMAND_OPCODE_IOFENCE (2 << COMMAND_OPCODE_S)
+#define COMMAND_OPCODE_IODIR (3 << COMMAND_OPCODE_S)
+#define COMMAND_OPCODE_ATS (4 << COMMAND_OPCODE_S)
+#define COMMAND_OPCODE_FUNC_S 7
+#define COMMAND_OPCODE_FUNC_M (0x3 << COMMAND_OPCODE_FUNC_S)
+#define FUNC_IODIR_INVAL_DDT (0 << COMMAND_OPCODE_FUNC_S)
+#define FUNC_IODIR_INVAL_PDT (1 << COMMAND_OPCODE_FUNC_S)
+#define FUNC_IODIR_PID_S 12
+#define FUNC_IODIR_DV (1ULL << 33) /* DID Valid */
+#define FUNC_IODIR_DID_S 40ULL
+/* dword0 */
+#define FUNC_IOTINVAL_VMA (0 << COMMAND_OPCODE_FUNC_S)
+#define FUNC_IOTINVAL_GVMA (1 << COMMAND_OPCODE_FUNC_S)
+#define FUNC_IOTINVAL_AV (1 << 10) /* Address Valid */
+#define FUNC_IOTINVAL_PSCID_S 12 /* Process-Soft-Context ID */
+#define FUNC_IOTINVAL_PSCV (1ULL << 32) /* PSCID Valid */
+#define FUNC_IOTINVAL_GV (1ULL << 33) /* GSCID Valid */
+#define FUNC_IOTINVAL_GSCID_S 44 /* Guest-Soft-Context ID */
+/* dword1 */
+#define FUNC_IOTINVAL_ADDR_S 10
+#define FUNC_IOFENCE_FUNC_C (0 << 7)
+#define FUNC_IOFENCE_AV (1 << 10)
+#define FUNC_IOFENCE_WSI (1 << 11)
+#define FUNC_IOFENCE_PR (1 << 12)
+#define FUNC_IOFENCE_PW (1 << 13)
+#define FUNC_IOFENCE_DATA_S 32ULL
+
+int riscv_iommu_attach(device_t dev);
+struct riscv_iommu_ctx *riscv_iommu_ctx_lookup_by_did(device_t dev, u_int did);
+
+#endif /* _RISCV_IOMMU_IOMMU_H_ */
diff --git a/sys/riscv/iommu/iommu_fdt.c b/sys/riscv/iommu/iommu_fdt.c
new file mode 100644
index 00000000000..99fb41d55f8
--- /dev/null
+++ b/sys/riscv/iommu/iommu_fdt.c
@@ -0,0 +1,145 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+static struct ofw_compat_data compat_data[] = {
+ { "riscv,iommu", 1 },
+ { NULL, 0 }
+};
+
+static struct resource_spec riscv_iommu_spec[] = {
+ { SYS_RES_MEMORY, 0, RF_ACTIVE },
+ { SYS_RES_IRQ, 0, RF_ACTIVE }, /* CQ */
+ { SYS_RES_IRQ, 1, RF_ACTIVE }, /* FQ */
+ { SYS_RES_IRQ, 2, RF_ACTIVE }, /* PM */
+ { SYS_RES_IRQ, 3, RF_ACTIVE }, /* PQ */
+ RESOURCE_SPEC_END
+};
+
+static int
+riscv_iommu_fdt_probe(device_t dev)
+{
+ if (!ofw_bus_status_okay(dev))
+ return (ENXIO);
+
+ if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0)
+ return (ENXIO);
+
+ device_set_desc(dev, "RISC-V IOMMU");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+riscv_iommu_fdt_attach(device_t dev)
+{
+ struct riscv_iommu_softc *sc;
+ struct riscv_iommu_unit *unit;
+ struct iommu_unit *iommu;
+ phandle_t node;
+ int error;
+
+ sc = device_get_softc(dev);
+ sc->dev = dev;
+
+ node = ofw_bus_get_node(dev);
+
+ error = bus_alloc_resources(dev, riscv_iommu_spec, sc->res);
+ if (error) {
+ device_printf(dev, "could not allocate resources\n");
+ goto error;
+ }
+
+ error = riscv_iommu_attach(dev);
+ if (error != 0) {
+ device_printf(dev, "Failed to attach. Error %d\n", error);
+ goto error;
+ }
+
+ unit = &sc->unit;
+ unit->dev = dev;
+
+ iommu = &unit->iommu;
+ iommu->dev = dev;
+
+ LIST_INIT(&unit->domain_list);
+
+ sc->xref = OF_xref_from_node(node);
+
+ error = iommu_register(iommu);
+ if (error) {
+ device_printf(dev, "Failed to register RISC-V IOMMU.\n");
+ goto error;
+ }
+
+ return (0);
+
+error:
+ bus_release_resources(dev, riscv_iommu_spec, sc->res);
+
+ return (error);
+}
+
+static device_method_t riscv_iommu_fdt_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, riscv_iommu_fdt_probe),
+ DEVMETHOD(device_attach, riscv_iommu_fdt_attach),
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_1(riscv_iommu, riscv_iommu_fdt_driver, riscv_iommu_fdt_methods,
+ sizeof(struct riscv_iommu_softc), riscv_iommu_driver);
+EARLY_DRIVER_MODULE(riscv_iommu, simplebus, riscv_iommu_fdt_driver, 0, 0,
+ BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LATE);
diff --git a/sys/riscv/iommu/iommu_frontend.c b/sys/riscv/iommu/iommu_frontend.c
new file mode 100644
index 00000000000..1de8b362398
--- /dev/null
+++ b/sys/riscv/iommu/iommu_frontend.c
@@ -0,0 +1,505 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_platform.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef FDT
+#include
+#include
+#include
+#endif
+
+#include "iommu_frontend.h"
+#include "iommu_if.h"
+
+static MALLOC_DEFINE(M_IOMMU, "IOMMU", "IOMMU framework");
+
+#define IOMMU_LIST_LOCK() sx_xlock(&iommu_sx)
+#define IOMMU_LIST_UNLOCK() sx_xunlock(&iommu_sx)
+#define IOMMU_LIST_ASSERT_LOCKED() sx_assert(&iommu_sx, SA_XLOCKED)
+
+#define dprintf(fmt, ...)
+
+static struct sx iommu_sx;
+
+struct iommu_entry {
+ struct iommu_unit *iommu;
+ LIST_ENTRY(iommu_entry) next;
+};
+static LIST_HEAD(, iommu_entry) iommu_list = LIST_HEAD_INITIALIZER(iommu_list);
+
+static int
+iommu_domain_unmap_buf(struct iommu_domain *iodom,
+ struct iommu_map_entry *entry, int flags)
+{
+ struct iommu_unit *iommu;
+ int error;
+
+ iommu = iodom->iommu;
+ error = IOMMU_UNMAP(iommu->dev, iodom, entry->start, entry->end -
+ entry->start);
+ return (error);
+}
+
+static int
+iommu_domain_map_buf(struct iommu_domain *iodom, struct iommu_map_entry *entry,
+ vm_page_t *ma, uint64_t eflags, int flags)
+{
+ struct iommu_unit *iommu;
+ vm_prot_t prot;
+ vm_offset_t va;
+ int error;
+
+ dprintf("%s: base %lx, size %lx\n", __func__, base, size);
+
+ prot = 0;
+ if (eflags & IOMMU_MAP_ENTRY_READ)
+ prot |= VM_PROT_READ;
+ if (eflags & IOMMU_MAP_ENTRY_WRITE)
+ prot |= VM_PROT_WRITE;
+
+ va = entry->start;
+ iommu = iodom->iommu;
+ error = IOMMU_MAP(iommu->dev, iodom, va, ma, entry->end -
+ entry->start, prot);
+ return (error);
+}
+
+static const struct iommu_domain_map_ops domain_map_ops = {
+ .map = iommu_domain_map_buf,
+ .unmap = iommu_domain_unmap_buf,
+};
+
+static struct iommu_domain *
+iommu_domain_alloc(struct iommu_unit *iommu)
+{
+ struct iommu_domain *iodom;
+
+ iodom = IOMMU_DOMAIN_ALLOC(iommu->dev, iommu);
+ if (iodom == NULL)
+ return (NULL);
+
+ KASSERT(iodom->end != 0, ("domain end is not set"));
+
+ iommu_domain_init(iommu, iodom, &domain_map_ops);
+ iodom->iommu = iommu;
+ iommu_gas_init_domain(iodom);
+
+ return (iodom);
+}
+
+static int
+iommu_domain_free(struct iommu_domain *iodom)
+{
+ struct iommu_unit *iommu;
+
+ iommu = iodom->iommu;
+
+ IOMMU_LOCK(iommu);
+
+ if ((iodom->flags & IOMMU_DOMAIN_GAS_INITED) != 0) {
+ IOMMU_DOMAIN_LOCK(iodom);
+ iommu_gas_fini_domain(iodom);
+ IOMMU_DOMAIN_UNLOCK(iodom);
+ }
+
+ iommu_domain_fini(iodom);
+
+ IOMMU_DOMAIN_FREE(iommu->dev, iodom);
+ IOMMU_UNLOCK(iommu);
+
+ return (0);
+}
+
+static void
+iommu_tag_init(struct iommu_domain *iodom, struct bus_dma_tag_iommu *t)
+{
+ bus_addr_t maxaddr;
+
+ maxaddr = MIN(iodom->end, BUS_SPACE_MAXADDR);
+
+ t->common.impl = &bus_dma_iommu_impl;
+ t->common.alignment = 1;
+ t->common.boundary = 0;
+ t->common.lowaddr = maxaddr;
+ t->common.highaddr = maxaddr;
+ t->common.maxsize = maxaddr;
+ t->common.nsegments = BUS_SPACE_UNRESTRICTED;
+ t->common.maxsegsz = maxaddr;
+}
+
+static struct iommu_ctx *
+iommu_ctx_alloc(device_t requester, struct iommu_domain *iodom, bool disabled)
+{
+ struct iommu_unit *iommu;
+ struct iommu_ctx *ioctx;
+
+ iommu = iodom->iommu;
+
+ ioctx = IOMMU_CTX_ALLOC(iommu->dev, iodom, requester, disabled);
+ if (ioctx == NULL)
+ return (NULL);
+
+ ioctx->domain = iodom;
+
+ return (ioctx);
+}
+
+static int
+iommu_ctx_init(device_t requester, struct iommu_ctx *ioctx)
+{
+ struct bus_dma_tag_iommu *tag;
+ struct iommu_domain *iodom;
+ struct iommu_unit *iommu;
+ int error;
+
+ iodom = ioctx->domain;
+ iommu = iodom->iommu;
+
+ error = IOMMU_CTX_INIT(iommu->dev, ioctx);
+ if (error)
+ return (error);
+
+ tag = ioctx->tag = malloc(sizeof(struct bus_dma_tag_iommu),
+ M_DEVBUF, M_WAITOK | M_ZERO);
+ tag->owner = requester;
+ tag->ctx = ioctx;
+ tag->ctx->domain = iodom;
+
+ iommu_tag_init(iodom, tag);
+
+ return (error);
+}
+
+static struct iommu_unit *
+iommu_lookup(device_t dev)
+{
+ struct iommu_entry *entry;
+ struct iommu_unit *iommu;
+
+ IOMMU_LIST_LOCK();
+ LIST_FOREACH(entry, &iommu_list, next) {
+ iommu = entry->iommu;
+ if (iommu->dev == dev) {
+ IOMMU_LIST_UNLOCK();
+ return (iommu);
+ }
+ }
+ IOMMU_LIST_UNLOCK();
+
+ return (NULL);
+}
+
+#ifdef FDT
+struct iommu_ctx *
+iommu_get_ctx_ofw(device_t dev, int channel)
+{
+ struct iommu_domain *iodom;
+ struct iommu_unit *iommu;
+ struct iommu_ctx *ioctx;
+ phandle_t node, parent;
+ device_t iommu_dev;
+ pcell_t *cells;
+ int niommus;
+ int ncells;
+ int error;
+
+ node = ofw_bus_get_node(dev);
+ if (node <= 0) {
+ device_printf(dev,
+ "%s called on not ofw based device.\n", __func__);
+ return (NULL);
+ }
+
+ error = ofw_bus_parse_xref_list_get_length(node,
+ "iommus", "#iommu-cells", &niommus);
+ if (error) {
+ device_printf(dev, "%s can't get iommu list.\n", __func__);
+ return (NULL);
+ }
+
+ if (niommus == 0) {
+ device_printf(dev, "%s iommu list is empty.\n", __func__);
+ return (NULL);
+ }
+
+ error = ofw_bus_parse_xref_list_alloc(node, "iommus", "#iommu-cells",
+ channel, &parent, &ncells, &cells);
+ if (error != 0) {
+ device_printf(dev, "%s can't get iommu device xref.\n",
+ __func__);
+ return (NULL);
+ }
+
+ iommu_dev = OF_device_from_xref(parent);
+ if (iommu_dev == NULL) {
+ device_printf(dev, "%s can't get iommu device.\n", __func__);
+ return (NULL);
+ }
+
+ iommu = iommu_lookup(iommu_dev);
+ if (iommu == NULL) {
+ device_printf(dev, "%s can't lookup iommu.\n", __func__);
+ return (NULL);
+ }
+
+ /*
+ * In our current configuration we have a domain per each ctx,
+ * so allocate a domain first.
+ */
+ iodom = iommu_domain_alloc(iommu);
+ if (iodom == NULL) {
+ device_printf(dev, "%s can't allocate domain.\n", __func__);
+ return (NULL);
+ }
+
+ ioctx = iommu_ctx_alloc(dev, iodom, false);
+ if (ioctx == NULL) {
+ iommu_domain_free(iodom);
+ return (NULL);
+ }
+
+ ioctx->domain = iodom;
+
+ error = IOMMU_OFW_MD_DATA(iommu->dev, ioctx, cells, ncells);
+ if (error) {
+ device_printf(dev, "%s can't set MD data\n", __func__);
+ return (NULL);
+ }
+
+ error = iommu_ctx_init(dev, ioctx);
+ if (error) {
+ IOMMU_CTX_FREE(iommu->dev, ioctx);
+ iommu_domain_free(iodom);
+ return (NULL);
+ }
+
+ return (ioctx);
+}
+#endif
+
+struct iommu_ctx *
+iommu_get_ctx(struct iommu_unit *iommu, device_t requester,
+ uint16_t rid, bool disabled, bool rmrr)
+{
+ struct iommu_domain *iodom;
+ struct iommu_ctx *ioctx;
+ int error;
+
+ IOMMU_LOCK(iommu);
+ ioctx = IOMMU_CTX_LOOKUP(iommu->dev, requester);
+ if (ioctx) {
+ IOMMU_UNLOCK(iommu);
+ return (ioctx);
+ }
+ IOMMU_UNLOCK(iommu);
+
+ /*
+ * In our current configuration we have a domain per each ctx.
+ * So allocate a domain first.
+ */
+ iodom = iommu_domain_alloc(iommu);
+ if (iodom == NULL)
+ return (NULL);
+
+ ioctx = iommu_ctx_alloc(requester, iodom, disabled);
+ if (ioctx == NULL) {
+ iommu_domain_free(iodom);
+ return (NULL);
+ }
+
+ error = iommu_ctx_init(requester, ioctx);
+ if (error) {
+ IOMMU_CTX_FREE(iommu->dev, ioctx);
+ iommu_domain_free(iodom);
+ return (NULL);
+ }
+
+ return (ioctx);
+}
+
+void
+iommu_free_ctx_locked(struct iommu_unit *iommu, struct iommu_ctx *ioctx)
+{
+ struct iommu_domain *domain;
+ bool released;
+ int error;
+
+ IOMMU_ASSERT_LOCKED(iommu);
+
+ domain = ioctx->domain;
+
+ released = IOMMU_CTX_FREE(iommu->dev, ioctx);
+ IOMMU_UNLOCK(iommu);
+
+ if (released) {
+ /* Since we have a domain per each ctx, remove it too. */
+ error = iommu_domain_free(domain);
+ if (error)
+ device_printf(iommu->dev, "Could not free a domain\n");
+ }
+}
+
+static void
+iommu_domain_free_entry(struct iommu_map_entry *entry, bool free)
+{
+ iommu_gas_free_space(entry);
+
+ if (free)
+ iommu_gas_free_entry(entry);
+ else
+ entry->flags = 0;
+}
+
+void
+iommu_domain_unload(struct iommu_domain *iodom,
+ struct iommu_map_entries_tailq *entries, bool cansleep)
+{
+ struct iommu_map_entry *entry, *entry1;
+ int error __diagused;
+
+ TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) {
+ KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0,
+ ("not mapped entry %p %p", iodom, entry));
+ error = iodom->ops->unmap(iodom, entry,
+ cansleep ? IOMMU_PGF_WAITOK : 0);
+ KASSERT(error == 0, ("unmap %p error %d", iodom, error));
+ TAILQ_REMOVE(entries, entry, dmamap_link);
+ iommu_domain_free_entry(entry, true);
+ }
+
+ if (TAILQ_EMPTY(entries))
+ return;
+
+ panic("entries map is not empty");
+}
+
+int
+iommu_register(struct iommu_unit *iommu)
+{
+ struct iommu_entry *entry;
+
+ mtx_init(&iommu->lock, "IOMMU", NULL, MTX_DEF);
+
+ entry = malloc(sizeof(struct iommu_entry), M_IOMMU, M_WAITOK | M_ZERO);
+ entry->iommu = iommu;
+
+ IOMMU_LIST_LOCK();
+ LIST_INSERT_HEAD(&iommu_list, entry, next);
+ IOMMU_LIST_UNLOCK();
+
+ sysctl_ctx_init(&iommu->sysctl_ctx);
+ iommu_init_busdma(iommu);
+
+ return (0);
+}
+
+int
+iommu_unregister(struct iommu_unit *iommu)
+{
+ struct iommu_entry *entry, *tmp;
+
+ IOMMU_LIST_LOCK();
+ LIST_FOREACH_SAFE(entry, &iommu_list, next, tmp) {
+ if (entry->iommu == iommu) {
+ LIST_REMOVE(entry, next);
+ free(entry, M_IOMMU);
+ }
+ }
+ IOMMU_LIST_UNLOCK();
+
+ iommu_fini_busdma(iommu);
+ sysctl_ctx_free(&iommu->sysctl_ctx);
+
+ mtx_destroy(&iommu->lock);
+
+ return (0);
+}
+
+struct iommu_unit *
+iommu_find(device_t dev, bool verbose)
+{
+ struct iommu_entry *entry;
+ struct iommu_unit *iommu;
+ int error;
+
+ IOMMU_LIST_LOCK();
+ LIST_FOREACH(entry, &iommu_list, next) {
+ iommu = entry->iommu;
+ error = IOMMU_FIND(iommu->dev, dev);
+ if (error == 0) {
+ IOMMU_LIST_UNLOCK();
+ return (entry->iommu);
+ }
+ }
+ IOMMU_LIST_UNLOCK();
+
+ return (NULL);
+}
+
+void
+iommu_unit_pre_instantiate_ctx(struct iommu_unit *unit)
+{
+}
+
+void
+iommu_domain_unload_entry(struct iommu_map_entry *entry, bool free,
+ bool cansleep __unused)
+{
+
+ dprintf("%s\n", __func__);
+
+ iommu_domain_free_entry(entry, free);
+}
+
+static void
+iommu_init(void)
+{
+
+ sx_init(&iommu_sx, "IOMMU list");
+}
+
+SYSINIT(iommu, SI_SUB_DRIVERS, SI_ORDER_FIRST, iommu_init, NULL);
diff --git a/sys/riscv/iommu/iommu_frontend.h b/sys/riscv/iommu/iommu_frontend.h
new file mode 100644
index 00000000000..f4285603738
--- /dev/null
+++ b/sys/riscv/iommu/iommu_frontend.h
@@ -0,0 +1,38 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _ARM64_IOMMU_IOMMU_H_
+#define _ARM64_IOMMU_IOMMU_H_
+
+#define IOMMU_PAGE_SIZE 4096
+#define IOMMU_PAGE_MASK (IOMMU_PAGE_SIZE - 1)
+
+int iommu_unregister(struct iommu_unit *unit);
+int iommu_register(struct iommu_unit *unit);
+struct iommu_ctx * iommu_get_ctx_ofw(device_t dev, int channel);
+
+#endif /* _ARM64_IOMMU_IOMMU_H_ */
diff --git a/sys/riscv/iommu/iommu_if.m b/sys/riscv/iommu/iommu_if.m
new file mode 100644
index 00000000000..56a04b7b863
--- /dev/null
+++ b/sys/riscv/iommu/iommu_if.m
@@ -0,0 +1,147 @@
+#-
+# SPDX-License-Identifier: BSD-2-Clause
+#:
+# Copyright (c) 2026 Ruslan Bukin
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+#
+
+#include "opt_platform.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef FDT
+#include
+#include
+#include
+#endif
+
+INTERFACE iommu;
+
+#
+# Check if the iommu controller dev is responsible to serve traffic
+# for a given child.
+#
+METHOD int find {
+ device_t dev;
+ device_t child;
+};
+
+#
+# Map a virtual address VA to a physical address PA.
+#
+METHOD int map {
+ device_t dev;
+ struct iommu_domain *iodom;
+ vm_offset_t va;
+ vm_page_t *ma;
+ bus_size_t size;
+ vm_prot_t prot;
+};
+
+#
+# Unmap a virtual address VA.
+#
+METHOD int unmap {
+ device_t dev;
+ struct iommu_domain *iodom;
+ vm_offset_t va;
+ bus_size_t size;
+};
+
+#
+# Allocate an IOMMU domain.
+#
+METHOD struct iommu_domain * domain_alloc {
+ device_t dev;
+ struct iommu_unit *iommu;
+};
+
+#
+# Release all the resources held by IOMMU domain.
+#
+METHOD void domain_free {
+ device_t dev;
+ struct iommu_domain *iodom;
+};
+
+#
+# Find a domain allocated for a dev.
+#
+METHOD struct iommu_domain * domain_lookup {
+ device_t dev;
+};
+
+#
+# Find an allocated context for a device.
+#
+METHOD struct iommu_ctx * ctx_lookup {
+ device_t dev;
+ device_t child;
+};
+
+#
+# Allocate a new iommu context.
+#
+METHOD struct iommu_ctx * ctx_alloc {
+ device_t dev;
+ struct iommu_domain *iodom;
+ device_t child;
+ bool disabled;
+};
+
+#
+# Initialize the new iommu context.
+#
+METHOD int ctx_init {
+ device_t dev;
+ struct iommu_ctx *ioctx;
+};
+
+#
+# Free the iommu context.
+#
+METHOD bool ctx_free {
+ device_t dev;
+ struct iommu_ctx *ioctx;
+};
+
+#ifdef FDT
+#
+# Notify controller we have machine-dependent data.
+#
+METHOD int ofw_md_data {
+ device_t dev;
+ struct iommu_ctx *ioctx;
+ pcell_t *cells;
+ int ncells;
+};
+#endif
diff --git a/sys/riscv/iommu/iommu_pci.c b/sys/riscv/iommu/iommu_pci.c
new file mode 100644
index 00000000000..2ec5b2dc95b
--- /dev/null
+++ b/sys/riscv/iommu/iommu_pci.c
@@ -0,0 +1,172 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014
+#define PCI_VENDOR_ID_REDHAT 0x1b36
+#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008
+#define PCI_VENDOR_ID_RIVOS 0x1efd
+
+static int
+iommu_pci_probe(device_t dev)
+{
+ uint16_t vendor_id, device_id;
+
+ vendor_id = pci_get_vendor(dev);
+ device_id = pci_get_device(dev);
+
+ if (vendor_id == PCI_VENDOR_ID_REDHAT &&
+ device_id == PCI_DEVICE_ID_REDHAT_RISCV_IOMMU) {
+ device_set_desc(dev, "RedHat IOMMU");
+ return (BUS_PROBE_DEFAULT);
+ }
+
+ if (vendor_id == PCI_VENDOR_ID_RIVOS &&
+ device_id == PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA) {
+ device_set_desc(dev, "Rivos IOMMU");
+ return (BUS_PROBE_DEFAULT);
+ }
+
+ return (ENXIO);
+}
+
+static int
+iommu_pci_attach(device_t dev)
+{
+ struct riscv_iommu_unit *unit;
+ struct riscv_iommu_softc *sc;
+ struct iommu_unit *iommu;
+ phandle_t node;
+ int count;
+ int error;
+ int rid;
+ int i;
+
+ sc = device_get_softc(dev);
+ sc->dev = dev;
+
+ node = ofw_bus_get_node(dev);
+
+ pci_enable_busmaster(dev);
+
+ rid = PCIR_BAR(0);
+ sc->res[0] = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
+ RF_ACTIVE);
+ if (sc->res[0] == NULL) {
+ device_printf(dev, "couldn't map memory\n");
+ goto error;
+ }
+
+ count = 4;
+ if (pci_alloc_msix(dev, &count) == 0) {
+ for (i = 0; i < 4; i++) {
+ rid = i;
+ sc->res[1 + i] = bus_alloc_resource_any(dev,
+ SYS_RES_IRQ, &rid, RF_ACTIVE);
+ if (sc->res[i + 1] == NULL) {
+ device_printf(dev, "Can't allocate IRQ "
+ " resource.\n");
+ goto error;
+ }
+ }
+ } else
+ device_printf(dev, "Can't allocate MSI-X interrupts."
+ " Ignoring.\n");
+
+ error = riscv_iommu_attach(dev);
+ if (error)
+ goto error;
+
+ unit = &sc->unit;
+ unit->dev = dev;
+
+ iommu = &unit->iommu;
+ iommu->dev = dev;
+
+ LIST_INIT(&unit->domain_list);
+
+ sc->xref = OF_xref_from_node(node);
+
+ error = iommu_register(iommu);
+ if (error) {
+ device_printf(dev, "Failed to register RISC-V IOMMU.\n");
+ goto error;
+ }
+
+ return (0);
+
+error:
+ if (sc->res[0])
+ bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0),
+ sc->res[0]);
+
+ for (i = 0; i < 4; i++)
+ if (sc->res[i + 1])
+ bus_release_resource(dev, SYS_RES_IRQ, i,
+ sc->res[i + 1]);
+
+ return (error);
+}
+
+static device_method_t riscv_iommu_pci_methods[] = {
+ /* Device interface */
+ DEVMETHOD(device_probe, iommu_pci_probe),
+ DEVMETHOD(device_attach, iommu_pci_attach),
+ DEVMETHOD_END
+};
+
+DEFINE_CLASS_1(riscv_iommu, riscv_iommu_pci_driver,
+ riscv_iommu_pci_methods, sizeof(struct riscv_iommu_softc),
+ riscv_iommu_driver);
+DRIVER_MODULE(riscv_iommu, pci, riscv_iommu_pci_driver, NULL, NULL);
diff --git a/sys/riscv/iommu/iommu_pmap.c b/sys/riscv/iommu/iommu_pmap.c
new file mode 100644
index 00000000000..751ba4cf1ac
--- /dev/null
+++ b/sys/riscv/iommu/iommu_pmap.c
@@ -0,0 +1,629 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+/*
+ * Boundary values for the page table page index space:
+ *
+ * L3 pages: [0, NUL2E)
+ * L2 pages: [NUL2E, NUL2E + NUL1E)
+ * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
+ *
+ * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the
+ * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
+ * in a set of page tables.
+ */
+#define NUL0E Ln_ENTRIES
+#define NUL1E (Ln_ENTRIES * NUL0E)
+#define NUL2E (Ln_ENTRIES * NUL1E)
+
+#define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
+#define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
+
+#define pmap_clear(pte) pmap_store(pte, 0)
+#define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits)
+#define pmap_load_store(pte, entry) atomic_swap_64(pte, entry)
+#define pmap_load_clear(pte) pmap_load_store(pte, 0)
+#define pmap_load(pte) atomic_load_64(pte)
+#define pmap_store(pte, entry) atomic_store_64(pte, entry)
+#define pmap_store_bits(pte, bits) atomic_set_64(pte, bits)
+
+#define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK)
+#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK)
+#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK)
+#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK)
+
+#define PTE_TO_PHYS(pte) \
+ ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
+#define L2PTE_TO_PHYS(l2) \
+ ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
+#define L1PTE_TO_PHYS(l1) \
+ ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT)
+#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
+
+/********************/
+/* Inline functions */
+/********************/
+
+static __inline pd_entry_t *
+pmap_l0(struct riscv_iommu_pmap *pmap, vm_offset_t va)
+{
+ KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
+ ("%s: in SV39 mode", __func__));
+ KASSERT(VIRT_IS_VALID(va),
+ ("%s: malformed virtual address %#lx", __func__, va));
+ return (&pmap->pm_top[pmap_l0_index(va)]);
+}
+
+static __inline pd_entry_t *
+pmap_l0_to_l1(struct riscv_iommu_pmap *pmap, pd_entry_t *l0, vm_offset_t va)
+{
+ vm_paddr_t phys;
+ pd_entry_t *l1;
+
+ KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
+ ("%s: in SV39 mode", __func__));
+ phys = PTE_TO_PHYS(pmap_load(l0));
+ l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+
+ return (&l1[pmap_l1_index(va)]);
+}
+
+static __inline pd_entry_t *
+pmap_l1(struct riscv_iommu_pmap *pmap, vm_offset_t va)
+{
+ pd_entry_t *l0;
+
+ KASSERT(VIRT_IS_VALID(va),
+ ("%s: malformed virtual address %#lx", __func__, va));
+ if (pmap->pm_mode == PMAP_MODE_SV39) {
+ return (&pmap->pm_top[pmap_l1_index(va)]);
+ } else {
+ l0 = pmap_l0(pmap, va);
+ if ((pmap_load(l0) & PTE_V) == 0)
+ return (NULL);
+ if ((pmap_load(l0) & PTE_RX) != 0)
+ return (NULL);
+ return (pmap_l0_to_l1(pmap, l0, va));
+ }
+}
+
+static __inline pd_entry_t *
+pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
+{
+ vm_paddr_t phys;
+ pd_entry_t *l2;
+
+ phys = PTE_TO_PHYS(pmap_load(l1));
+ l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+
+ return (&l2[pmap_l2_index(va)]);
+}
+
+static __inline pd_entry_t *
+pmap_l2(struct riscv_iommu_pmap *pmap, vm_offset_t va)
+{
+ pd_entry_t *l1;
+
+ l1 = pmap_l1(pmap, va);
+ if (l1 == NULL)
+ return (NULL);
+ if ((pmap_load(l1) & PTE_V) == 0)
+ return (NULL);
+ if ((pmap_load(l1) & PTE_RX) != 0)
+ return (NULL);
+
+ return (pmap_l1_to_l2(l1, va));
+}
+
+static __inline pt_entry_t *
+pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
+{
+ vm_paddr_t phys;
+ pt_entry_t *l3;
+
+ phys = PTE_TO_PHYS(pmap_load(l2));
+ l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+
+ return (&l3[pmap_l3_index(va)]);
+}
+
+static __inline pt_entry_t *
+pmap_l3(struct riscv_iommu_pmap *pmap, vm_offset_t va)
+{
+ pd_entry_t *l2;
+
+ l2 = pmap_l2(pmap, va);
+ if (l2 == NULL)
+ return (NULL);
+ if ((pmap_load(l2) & PTE_V) == 0)
+ return (NULL);
+ if ((pmap_load(l2) & PTE_RX) != 0)
+ return (NULL);
+
+ return (pmap_l2_to_l3(l2, va));
+}
+
+static __inline void
+pmap_resident_count_inc(struct riscv_iommu_pmap *pmap, int count)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ pmap->sp_resident_count += count;
+}
+
+static __inline void
+pmap_resident_count_dec(struct riscv_iommu_pmap *pmap, int count)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ KASSERT(pmap->sp_resident_count >= count,
+ ("pmap %p resident count underflow %ld %d", pmap,
+ pmap->sp_resident_count, count));
+ pmap->sp_resident_count -= count;
+}
+
+/***************************************************
+ * Page table page management routines.....
+ ***************************************************/
+
+int
+iommu_pmap_pinit(struct riscv_iommu_pmap *pmap, enum pmap_mode pm_mode)
+{
+ vm_paddr_t topphys;
+ vm_page_t m;
+
+ m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
+ VM_ALLOC_WAITOK);
+ topphys = VM_PAGE_TO_PHYS(m);
+ pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
+ pmap->pm_mode = pm_mode;
+
+ switch (pm_mode) {
+ case PMAP_MODE_SV39:
+ pmap->pm_satp = SATP_MODE_SV39;
+ break;
+ case PMAP_MODE_SV48:
+ pmap->pm_satp = SATP_MODE_SV48;
+ break;
+ default:
+ panic("Unknown virtual memory system");
+ };
+
+ pmap->pm_satp |= (topphys >> PAGE_SHIFT);
+
+#ifdef INVARIANTS
+ pmap->sp_resident_count = 0;
+#endif
+
+ mtx_init(&pmap->pm_mtx, "iommu pmap", NULL, MTX_DEF);
+
+ return (1);
+}
+
+/*
+ * Release any resources held by the given physical map.
+ * Called when a pmap initialized by pmap_pinit is being released.
+ * Should only be called if the map contains no valid mappings.
+ */
+void
+iommu_pmap_release(struct riscv_iommu_pmap *pmap)
+{
+ vm_page_t m;
+
+ KASSERT(pmap->sp_resident_count == 0,
+ ("pmap_release: pmap resident count %ld != 0",
+ pmap->sp_resident_count));
+
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ mtx_destroy(&pmap->pm_mtx);
+}
+
+/*
+ * This routine is called if the desired page table page does not exist.
+ *
+ * If page table page allocation fails, this routine may sleep before
+ * returning NULL. It sleeps only if a lock pointer was given.
+ *
+ * Note: If a page allocation fails at page table level two or three,
+ * one or two pages may be held during the wait, only to be released
+ * afterwards. This conservative approach is easily argued to avoid
+ * race conditions.
+ */
+static vm_page_t
+_pmap_alloc_l3(struct riscv_iommu_pmap *pmap, vm_pindex_t ptepindex)
+{
+ vm_page_t m, pdpg;
+ pt_entry_t entry;
+ vm_paddr_t phys;
+ pn_t pn;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
+ /*
+ * Allocate a page table page.
+ */
+ m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
+ if (m == NULL) {
+ /*
+ * Indicate the need to retry. While waiting, the page table
+ * page may have been allocated.
+ */
+ return (NULL);
+ }
+ m->pindex = ptepindex;
+
+ /*
+ * Map the pagetable page into the process address space, if
+ * it isn't already there.
+ */
+ pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
+ if (ptepindex >= NUL2E + NUL1E) {
+ pd_entry_t *l0;
+ vm_pindex_t l0index;
+
+ KASSERT(pmap->pm_mode != PMAP_MODE_SV39,
+ ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
+ KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
+ ("%s: pindex %#lx out of range", __func__, ptepindex));
+
+ l0index = ptepindex - (NUL2E + NUL1E);
+ l0 = &pmap->pm_top[l0index];
+ KASSERT((pmap_load(l0) & PTE_V) == 0,
+ ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
+
+ entry = PTE_V | (pn << PTE_PPN0_S);
+ pmap_store(l0, entry);
+ } else if (ptepindex >= NUL2E) {
+ pd_entry_t *l0, *l1;
+ vm_pindex_t l0index, l1index;
+
+ l1index = ptepindex - NUL2E;
+ if (pmap->pm_mode == PMAP_MODE_SV39) {
+ l1 = &pmap->pm_top[l1index];
+ } else {
+ l0index = l1index >> Ln_ENTRIES_SHIFT;
+ l0 = &pmap->pm_top[l0index];
+ if (pmap_load(l0) == 0) {
+ /* Recurse to allocate the L1 page. */
+ if (_pmap_alloc_l3(pmap,
+ NUL2E + NUL1E + l0index) == NULL)
+ goto fail;
+ phys = PTE_TO_PHYS(pmap_load(l0));
+ } else {
+ phys = PTE_TO_PHYS(pmap_load(l0));
+ pdpg = PHYS_TO_VM_PAGE(phys);
+ pdpg->ref_count++;
+ }
+ l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+ l1 = &l1[ptepindex & Ln_ADDR_MASK];
+ }
+ KASSERT((pmap_load(l1) & PTE_V) == 0,
+ ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
+
+ entry = PTE_V | (pn << PTE_PPN0_S);
+ pmap_store(l1, entry);
+ } else {
+ vm_pindex_t l0index, l1index;
+ pd_entry_t *l0, *l1, *l2;
+
+ l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
+ if (pmap->pm_mode == PMAP_MODE_SV39) {
+ l1 = &pmap->pm_top[l1index];
+ if (pmap_load(l1) == 0) {
+ /* recurse for allocating page dir */
+ if (_pmap_alloc_l3(pmap, NUL2E + l1index)
+ == NULL)
+ goto fail;
+ } else {
+ pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
+ pdpg->ref_count++;
+ }
+ } else {
+ l0index = l1index >> Ln_ENTRIES_SHIFT;
+ l0 = &pmap->pm_top[l0index];
+ if (pmap_load(l0) == 0) {
+ /* Recurse to allocate the L1 entry. */
+ if (_pmap_alloc_l3(pmap, NUL2E + l1index)
+ == NULL)
+ goto fail;
+ phys = PTE_TO_PHYS(pmap_load(l0));
+ l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+ l1 = &l1[l1index & Ln_ADDR_MASK];
+ } else {
+ phys = PTE_TO_PHYS(pmap_load(l0));
+ l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+ l1 = &l1[l1index & Ln_ADDR_MASK];
+ if (pmap_load(l1) == 0) {
+ /* Recurse to allocate the L2 page. */
+ if (_pmap_alloc_l3(pmap,
+ NUL2E + l1index) == NULL)
+ goto fail;
+ } else {
+ pdpg = PTE_TO_VM_PAGE(pmap_load(l1));
+ pdpg->ref_count++;
+ }
+ }
+ }
+
+ phys = PTE_TO_PHYS(pmap_load(l1));
+ l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
+ l2 = &l2[ptepindex & Ln_ADDR_MASK];
+ KASSERT((pmap_load(l2) & PTE_V) == 0,
+ ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
+
+ entry = PTE_V | (pn << PTE_PPN0_S);
+ pmap_store(l2, entry);
+ }
+
+ pmap_resident_count_inc(pmap, 1);
+
+ return (m);
+
+fail:
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+}
+
+/*
+ * Remove a single IOMMU entry.
+ */
+int
+iommu_pmap_remove(struct riscv_iommu_pmap *pmap, vm_offset_t va)
+{
+ pt_entry_t *l3;
+ int rc;
+
+ PMAP_LOCK(pmap);
+
+ l3 = pmap_l3(pmap, va);
+ if (l3 != NULL) {
+ pmap_resident_count_dec(pmap, 1);
+ pmap_clear(l3);
+ rc = KERN_SUCCESS;
+ } else
+ rc = KERN_FAILURE;
+
+ PMAP_UNLOCK(pmap);
+
+ return (rc);
+}
+
+/* Add a single IOMMU entry. This function does not sleep. */
+int
+iommu_pmap_enter(struct riscv_iommu_pmap *pmap, vm_offset_t va, vm_paddr_t pa,
+ vm_prot_t prot, u_int flags)
+{
+ pd_entry_t *l2, l2e;
+ pt_entry_t new_l3;
+ pt_entry_t *l3;
+ vm_page_t mpte;
+ pn_t pn;
+ int rv;
+
+ pn = (pa / PAGE_SIZE);
+
+ new_l3 = PTE_V | PTE_R | PTE_A;
+ if (prot & VM_PROT_EXECUTE)
+ new_l3 |= PTE_X;
+ if (flags & VM_PROT_WRITE)
+ new_l3 |= PTE_D;
+ if (prot & VM_PROT_WRITE)
+ new_l3 |= PTE_W;
+ if (va < VM_MAX_USER_ADDRESS)
+ new_l3 |= PTE_U;
+
+ new_l3 |= (pn << PTE_PPN0_S);
+ new_l3 |= PTE_MA_IO;
+
+ /*
+ * Set modified bit gratuitously for writeable mappings if
+ * the page is unmanaged. We do not want to take a fault
+ * to do the dirty bit accounting for these mappings.
+ */
+ if (prot & VM_PROT_WRITE)
+ new_l3 |= PTE_D;
+
+ CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
+
+ mpte = NULL;
+ PMAP_LOCK(pmap);
+
+ l2 = pmap_l2(pmap, va);
+ if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
+ ((l2e & PTE_RWX) == 0)) {
+ l3 = pmap_l2_to_l3(l2, va);
+ } else if (va < VM_MAXUSER_ADDRESS) {
+ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va));
+ if (mpte == NULL) {
+ CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
+ rv = KERN_RESOURCE_SHORTAGE;
+ goto out;
+ }
+ l3 = pmap_l3(pmap, va);
+ } else
+ panic("pmap_enter: missing L3 table for kernel va %#lx", va);
+
+ KASSERT((pmap_load(l3) & PTE_V) == 0, ("l3 is valid"));
+
+ pmap_store(l3, new_l3);
+ pmap_resident_count_inc(pmap, 1);
+
+ rv = KERN_SUCCESS;
+out:
+ PMAP_UNLOCK(pmap);
+
+ return (rv);
+}
+
+static void
+iommu_pmap_remove_pages_sv48(struct riscv_iommu_pmap *pmap)
+{
+ pd_entry_t l0e, *l1, l1e, *l2, l2e, *l3, l3e;
+ vm_paddr_t pa0, pa1, pa;
+ vm_page_t m0, m1, m;
+ int i, j, k, l;
+
+ PMAP_LOCK(pmap);
+
+ for (i = 0; i < Ln_ENTRIES; i++) {
+ l0e = pmap->pm_top[i];
+ if ((l0e & PTE_V) == 0)
+ continue;
+ pa0 = PTE_TO_PHYS(l0e);
+ m0 = PHYS_TO_VM_PAGE(pa0);
+ l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0);
+
+ for (j = 0; j < Ln_ENTRIES; j++) {
+ l1e = l1[j];
+ if ((l1e & PTE_V) == 0)
+ continue;
+ pa1 = PTE_TO_PHYS(l1e);
+ m1 = PHYS_TO_VM_PAGE(pa1);
+ l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);
+
+ for (k = 0; k < Ln_ENTRIES; k++) {
+ l2e = l2[k];
+ if ((l2e & PTE_V) == 0)
+ continue;
+ pa = PTE_TO_PHYS(l2e);
+ m = PHYS_TO_VM_PAGE(pa);
+ l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
+
+ for (l = 0; l < Ln_ENTRIES; l++) {
+ l3e = l3[l];
+ if ((l3e & PTE_V) == 0)
+ continue;
+ panic("%s: l3e found (idx %d %d %d %d)",
+ __func__, i, j, k, l);
+ }
+
+ vm_page_unwire_noq(m1);
+ vm_page_unwire_noq(m);
+ pmap_resident_count_dec(pmap, 1);
+ vm_page_free(m);
+ pmap_clear(&l2[k]);
+ }
+
+ vm_page_unwire_noq(m0);
+ pmap_resident_count_dec(pmap, 1);
+ vm_page_free(m1);
+ pmap_clear(&l1[j]);
+ }
+
+ pmap_resident_count_dec(pmap, 1);
+ vm_page_free(m0);
+ pmap_clear(&pmap->pm_top[i]);
+ }
+
+ KASSERT(pmap->sp_resident_count == 0,
+ ("Invalid resident count %jd", pmap->sp_resident_count));
+
+ PMAP_UNLOCK(pmap);
+}
+
+static void
+iommu_pmap_remove_pages_sv39(struct riscv_iommu_pmap *pmap)
+{
+ pd_entry_t l1e, *l2, l2e, *l3, l3e;
+ vm_paddr_t pa1, pa;
+ vm_page_t m1, m;
+ int j, k, l;
+
+ PMAP_LOCK(pmap);
+
+ for (j = 0; j < Ln_ENTRIES; j++) {
+ l1e = pmap->pm_top[j];
+ if ((l1e & PTE_V) == 0)
+ continue;
+ pa1 = PTE_TO_PHYS(l1e);
+ m1 = PHYS_TO_VM_PAGE(pa1);
+ l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1);
+
+ for (k = 0; k < Ln_ENTRIES; k++) {
+ l2e = l2[k];
+ if ((l2e & PTE_V) == 0)
+ continue;
+ pa = PTE_TO_PHYS(l2e);
+ m = PHYS_TO_VM_PAGE(pa);
+ l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
+
+ for (l = 0; l < Ln_ENTRIES; l++) {
+ l3e = l3[l];
+ if ((l3e & PTE_V) == 0)
+ continue;
+ panic("%s: l3e found (idx %d %d %d)",
+ __func__, j, k, l);
+ }
+
+ vm_page_unwire_noq(m1);
+ vm_page_unwire_noq(m);
+ pmap_resident_count_dec(pmap, 1);
+ vm_page_free(m);
+ pmap_clear(&l2[k]);
+ }
+
+ pmap_resident_count_dec(pmap, 1);
+ vm_page_free(m1);
+ pmap_clear(&pmap->pm_top[j]);
+ }
+
+ KASSERT(pmap->sp_resident_count == 0,
+ ("Invalid resident count %jd", pmap->sp_resident_count));
+
+ PMAP_UNLOCK(pmap);
+}
+
+void
+iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap)
+{
+
+ switch (pmap->pm_mode) {
+ case PMAP_MODE_SV39:
+ iommu_pmap_remove_pages_sv39(pmap);
+ break;
+ case PMAP_MODE_SV48:
+ iommu_pmap_remove_pages_sv48(pmap);
+ break;
+ default:
+ panic("Unknown virtual memory system");
+ }
+}
diff --git a/sys/riscv/iommu/iommu_pmap.h b/sys/riscv/iommu/iommu_pmap.h
new file mode 100644
index 00000000000..25347e23911
--- /dev/null
+++ b/sys/riscv/iommu/iommu_pmap.h
@@ -0,0 +1,49 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2026 Ruslan Bukin
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _RISCV_IOMMU_IOMMU_PMAP_H_
+#define _RISCV_IOMMU_IOMMU_PMAP_H_
+
+struct riscv_iommu_pmap {
+ pd_entry_t *pm_top;
+ enum pmap_mode pm_mode;
+ uint64_t pm_satp;
+ struct mtx pm_mtx;
+#ifdef INVARIANTS
+ long sp_resident_count;
+#endif
+};
+
+int iommu_pmap_enter(struct riscv_iommu_pmap *pmap, vm_offset_t va,
+ vm_paddr_t pa, vm_prot_t prot, u_int flags);
+int iommu_pmap_remove(struct riscv_iommu_pmap *pmap, vm_offset_t va);
+void iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap);
+int iommu_pmap_pinit(struct riscv_iommu_pmap *pmap, enum pmap_mode pm_mode);
+void iommu_pmap_release(struct riscv_iommu_pmap *pmap);
+void iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap);
+
+#endif /* !_RISCV_IOMMU_IOMMU_PMAP_H_ */