diff --git a/lib/libsys/getsockopt.2 b/lib/libsys/getsockopt.2 index 3867824681d..85d94e01463 100644 --- a/lib/libsys/getsockopt.2 +++ b/lib/libsys/getsockopt.2 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 25, 2024 +.Dd April 21, 2026 .Dt GETSOCKOPT 2 .Os .Sh NAME @@ -220,6 +220,10 @@ Incoming TCP and UDP connections are distributed among the participating listening sockets based on a hash function of local port number, and foreign IP address and port number. A maximum of 256 sockets can be bound to the same load-balancing group. +.Dv PF_DIVERT +sockets may also be bound to a group; see the +.Xr divert 4 +manual page for details. .Pp .Dv SO_KEEPALIVE enables the diff --git a/share/man/man4/divert.4 b/share/man/man4/divert.4 index 647bb72ab49..6292df190eb 100644 --- a/share/man/man4/divert.4 +++ b/share/man/man4/divert.4 @@ -1,5 +1,4 @@ -.\" -.Dd January 23, 2026 +.Dd April 21, 2026 .Dt DIVERT 4 .Os .Sh NAME @@ -57,7 +56,26 @@ firewall processing at the next rule. .Pp By reading from and writing to a divert socket, matching packets can be passed through an arbitrary ``filter'' as they travel through the host machine, special routing tricks can be done, etc. +.Pp +Multiple divert sockets may be bound to the same port if the +.Dv SO_REUSEPORT_LB +socket option is set on all of them. +In this case, the kernel will attempt to load-balance packets among +the sockets. +The implementation ensures that packets from the same flow are delivered +to the same socket. +To this end it relies on the firewall to provide a flow identifier with +each diverted packet. +When using the +.Xr pf 4 +firewall, this is the associated state ID if one exists; otherwise all +packets are diverted to a single socket in the group. 
+Currently the
+.Xr ipfw 4
+firewall does not provide a flow identifier, so all packets are diverted
+to a single socket in the group.
+At most 32 sockets can be bound to the same port.
 .Sh READING PACKETS
 Packets are diverted either as they are ``incoming'' or ``outgoing.''
 Incoming packets are diverted after reception on an IP interface,
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 39bc9de6ec9..839048908f9 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -36,6 +36,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -88,7 +89,7 @@
  */
 #define	DIVHASHSIZE	(1 << 3)	/* 8 entries, one cache line. */
 #define	DIVHASH(port)	(port % DIVHASHSIZE)
-#define	DCBHASH(dcb)	((dcb)->dcb_port % DIVHASHSIZE)
+#define	DCBHASH(dcb)	(DIVHASH((dcb)->dcb_port))
 
 /*
  * Divert sockets work in conjunction with ipfw or other packet filters,
@@ -147,10 +148,22 @@ struct divcb {
 	struct epoch_context	 dcb_epochctx;
 };
 
-CK_SLIST_HEAD(divhashhead, divcb);
+struct divcblbgroup {
+	CK_SLIST_ENTRY(divcblbgroup) dl_next;	/* V_divlbhash chain linkage */
+	struct epoch_context	 dl_epochctx;	/* for the deferred free */
+	uint16_t		 dl_port;	/* port shared by the group */
+	uint16_t		 dl_count;	/* number of used dl_dcb slots */
+#define	DIVCBLBGROUP_SIZE	32
+	struct divcb		*dl_dcb[DIVCBLBGROUP_SIZE];
+};
 
-VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
+CK_SLIST_HEAD(divhashhead, divcb);
+CK_SLIST_HEAD(divlbgrouphashhead, divcblbgroup);
+
+VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]);
 #define	V_divhash	VNET(divhash)
+VNET_DEFINE_STATIC(struct divlbgrouphashhead, divlbhash[DIVHASHSIZE]);
+#define	V_divlbhash	VNET(divlbhash)
 VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
 #define	V_dcb_count	VNET(dcb_count)
 VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
@@ -163,10 +176,15 @@ MTX_SYSINIT(divert, &divert_mtx, "divert(4) socket pcb lists", MTX_DEF);
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
+ *
+ * 'id' is an opaque identifier for the flow and is used to load-balance packets
+ * across multiple divert sockets bound to the same port.  Packets with the same
+ * identifier will be delivered to the same socket.
  */
 static void
-divert_packet(struct mbuf *m, bool incoming)
+divert_packet(struct mbuf *m, uint64_t id, bool incoming)
 {
+	struct divcblbgroup *dlb;
 	struct divcb *dcb;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
@@ -272,10 +290,27 @@ divert_packet(struct mbuf *m, bool incoming)
 		    sizeof(divsrc.sin_zero));
 	}
 
-	/* Put packet on socket queue, if any */
-	CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
-		if (dcb->dcb_port == nport)
+	/*
+	 * Look for a matching divert socket or socket group, and enqueue the
+	 * packet.
+	 */
+	CK_SLIST_FOREACH(dlb, &V_divlbhash[DIVHASH(nport)], dl_next) {
+		uint16_t count;
+
+		count = atomic_load_acq_16(&dlb->dl_count);
+		if (dlb->dl_port == nport && count > 0) {
+			uint32_t hash;
+
+			hash = jenkins_hash(&id, sizeof(uint64_t), 0);
+			dcb = dlb->dl_dcb[hash % count];
 			break;
+		}
+	}
+	if (dlb == NULL) {
+		CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
+			if (dcb->dcb_port == nport)
+				break;
+	}
 	if (dcb != NULL) {
 		struct socket *sa = dcb->dcb_socket;
 
@@ -596,6 +631,53 @@ div_free(epoch_context_t ctx)
 	free(dcb, M_PCB);
 }
 
+static void
+divlbgroup_free(epoch_context_t ctx)
+{
+	struct divcblbgroup *dlb = __containerof(ctx, struct divcblbgroup,
+	    dl_epochctx);
+
+	free(dlb, M_PCB);
+}
+
+static void
+div_lbgroup_detach(struct divcb *dcb)
+{
+	struct divcblbgroup *dlb;
+
+	CK_SLIST_FOREACH(dlb, &V_divlbhash[DCBHASH(dcb)], dl_next) {
+		if (dlb->dl_port != dcb->dcb_port)
+			continue;
+
+		/*
+		 * Delicately remove the socket from its group, taking
+		 * care to synchronize with lookups, which do not handle
+		 * NULL slots in the group table.
+		 *
+		 * Note that the hash is not stable across different
+		 * group sizes.
+		 */
+		for (int i = 0; i < dlb->dl_count; i++) {
+			unsigned int count;
+
+			if (dlb->dl_dcb[i] != dcb)
+				continue;
+
+			count = dlb->dl_count;
+			if (i != count - 1)
+				dlb->dl_dcb[i] = dlb->dl_dcb[count - 1];
+			atomic_store_rel_16(&dlb->dl_count, count - 1);
+			if (count == 1) {
+				CK_SLIST_REMOVE(&V_divlbhash[DCBHASH(dcb)], dlb,
+				    divcblbgroup, dl_next);
+				NET_EPOCH_CALL(divlbgroup_free,
+				    &dlb->dl_epochctx);
+			}
+			return;
+		}
+	}
+}
+
 static void
 div_detach(struct socket *so)
 {
@@ -603,8 +685,10 @@ div_detach(struct socket *so)
 
 	so->so_pcb = NULL;
 	DIVERT_LOCK();
-	if (dcb->dcb_bound != DCB_UNBOUND)
+	if (dcb->dcb_bound != DCB_UNBOUND) {
 		CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
+		div_lbgroup_detach(dcb);
+	}
 	V_dcb_count--;
 	V_dcb_gencnt++;
 	DIVERT_UNLOCK();
@@ -614,28 +698,93 @@ div_detach(struct socket *so)
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
-	struct divcb *dcb;
+	struct divcblbgroup *dlb, *grp;
+	struct divcb *dcb, *odcb;
+	int error;
 	uint16_t port;
 
 	if (nam->sa_family != AF_INET)
 		return EAFNOSUPPORT;
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return EINVAL;
+
+	/*
+	 * A load-balancing bind may need a new group.  Allocate it before the
+	 * divert lock is taken; it is released below if it goes unused.
+	 */
+	if ((so->so_options & SO_REUSEPORT_LB) != 0)
+		dlb = malloc(sizeof(*dlb), M_PCB, M_WAITOK | M_ZERO);
+	else
+		dlb = NULL;
+
+	error = 0;
 	port = ((struct sockaddr_in *)nam)->sin_port;
+	dcb = so->so_pcb;
 	DIVERT_LOCK();
-	CK_SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
-		if (dcb->dcb_port == port) {
-			DIVERT_UNLOCK();
-			return (EADDRINUSE);
-		}
-	dcb = so->so_pcb;
-	if (dcb->dcb_bound != DCB_UNBOUND)
-		CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
-	dcb->dcb_port = port;
-	CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+	CK_SLIST_FOREACH(grp, &V_divlbhash[DIVHASH(port)], dl_next) {
+		if (grp->dl_port == port)
+			break;
+	}
+	if (dlb != NULL && grp != NULL) {
+		/*
+		 * Joining an existing group.  Sockets bound to a port that has
+		 * a group are all members of it, so a socket that is already
+		 * bound to this port has nothing left to do.
+		 */
+		if (dcb->dcb_bound != DCB_UNBOUND && dcb->dcb_port == port)
+			goto out;
+		if (grp->dl_count >= DIVCBLBGROUP_SIZE)
+			error = ENOSPC;
+	} else {
+		/*
+		 * Otherwise the port must not be in use by another socket: one
+		 * bound without SO_REUSEPORT_LB would silently lose its packets
+		 * to the new group.  A load-balancing bind may ignore the
+		 * socket's own binding, which is converted into a group below.
+		 */
+		CK_SLIST_FOREACH(odcb, &V_divhash[DIVHASH(port)], dcb_next) {
+			if (odcb->dcb_port == port &&
+			    (dlb == NULL || odcb != dcb)) {
+				error = EADDRINUSE;
+				break;
+			}
+		}
+	}
+	if (error != 0)
+		goto out;
+
+	/*
+	 * Leave the old port and group first, so that the detach can never
+	 * tear down the group which is about to be joined or created.
+	 */
+	if (dcb->dcb_bound != DCB_UNBOUND) {
+		CK_SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb,
+		    dcb_next);
+		div_lbgroup_detach(dcb);
+	}
+	dcb->dcb_port = port;
+	if (dlb != NULL && grp == NULL) {
+		dlb->dl_port = port;
+		dlb->dl_count = 1;
+		dlb->dl_dcb[0] = dcb;
+		CK_SLIST_INSERT_HEAD(&V_divlbhash[DIVHASH(port)], dlb,
+		    dl_next);
+		dlb = NULL;
+	} else if (dlb != NULL) {
+		KASSERT(grp->dl_count > 0,
+		    ("div_bind: lbgroup %p has count 0", grp));
+
+		/* Fill the slot before lookups can see the new count. */
+		grp->dl_dcb[grp->dl_count] = dcb;
+		atomic_store_rel_16(&grp->dl_count, grp->dl_count + 1);
+	}
+	CK_SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
+out:
 	DIVERT_UNLOCK();
+	if (dlb != NULL)
+		free(dlb, M_PCB);
 
-	return (0);
+	return (error);
 }
 
 static int
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index 934ca80a083..081938ec7ae 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -324,7 +324,7 @@ VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
 #define	V_ip_fw_ctl_ptr		VNET(ip_fw_ctl_ptr)
 
 /* Divert hooks. */
-extern void	(*ip_divert_ptr)(struct mbuf *m, bool incoming);
+extern void	(*ip_divert_ptr)(struct mbuf *m, uint64_t id, bool incoming);
 /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
 extern int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
 extern int	(*ip_dn_ctl_ptr)(struct sockopt *);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 48e20df3ef9..851f70cbb0a 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -96,7 +96,7 @@ VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
 
 int	(*ip_dn_ctl_ptr)(struct sockopt *);
 int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
-void	(*ip_divert_ptr)(struct mbuf *, bool);
+void	(*ip_divert_ptr)(struct mbuf *, uint64_t, bool);
 int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
 
 #ifdef INET
diff --git a/sys/netpfil/ipfw/ip_fw_pfil.c b/sys/netpfil/ipfw/ip_fw_pfil.c
index ddd8e00316b..7e1c24a89ed 100644
--- a/sys/netpfil/ipfw/ip_fw_pfil.c
+++ b/sys/netpfil/ipfw/ip_fw_pfil.c
@@ -563,7 +563,7 @@ ipfw_divert(struct mbuf **m0, struct ip_fw_args *args, bool tee)
 	m_tag_prepend(clone, tag);
 
 	/* Do the dirty job... */
-	ip_divert_ptr(clone, args->flags & IPFW_ARGS_IN);
+	ip_divert_ptr(clone, 0, args->flags & IPFW_ARGS_IN);
 
 	return 0;
 }
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index 27d03b68893..98b5657f728 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -11966,7 +11966,7 @@ pf_test(sa_family_t af, int dir, int pflags, struct ifnet *ifp, struct mbuf **m0
 					pd.m->m_flags &= ~M_FASTFWD_OURS;
 				}
 			}
-			ip_divert_ptr(*m0, dir == PF_IN);
+			ip_divert_ptr(*m0, s != NULL ? s->id : 0, dir == PF_IN);
 			*m0 = NULL;
 			return (action);
 		} else if (mtag == NULL) {