sockets: let protocols be responsible for socket buffer mutexes

Sockets that implement their own socket buffers (marked with PR_SOCKBUF)
are now also responsible for initialization of socket buffer mutexes in
pr_attach and for destruction in pr_detach (or pr_close).

This removes a big bunch of reported LORs, as now WITNESS is able to see
that tcp(4) socket buffer mutex and netlink(4) socket buffer mutex are two
different things.  Distinct names also improve diagnostics for blocked
threads.

This also removes a hack from unix(4), where we used to mtx_destroy().
Also removes an innocent bug from unix(4) where for accept(2)-ed socket
soreserve() was called twice.  This one was innocent since first call to
soreserve() was asking for 0 bytes of space.

This slightly increased amount of pasted code in TCP's syncache_socket().
The problem is that while for sockets created with socket(2) it is
pr_attach responsible for call to soreserve() (including !PR_SOCKBUF
protocols), but for the sockets created with accept(2) it was
solisten_clone() doing soreserve(), combined with the fact that for
accept(2) TCP completely bypasses pr_attach. This all should improve once
TCP has its own socket buffers.

Reviewed by:		markj
Differential Revision:	https://reviews.freebsd.org/D54984
This commit is contained in:
Gleb Smirnoff
2026-02-03 09:09:49 -08:00
parent dbe9fa0be1
commit 64f7e3c9c1
4 changed files with 68 additions and 51 deletions
+40 -37
View File
@@ -818,8 +818,6 @@ soalloc(struct vnet *vnet)
* a feature to change class of an existing lock, so we use DUPOK.
*/
mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
so->so_rcv.sb_sel = &so->so_rdsel;
so->so_snd.sb_sel = &so->so_wrsel;
sx_init(&so->so_snd_sx, "so_snd_sx");
@@ -893,14 +891,47 @@ sodealloc(struct socket *so)
&so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
sx_destroy(&so->so_snd_sx);
sx_destroy(&so->so_rcv_sx);
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
}
crfree(so->so_cred);
mtx_destroy(&so->so_lock);
uma_zfree(socket_zone, so);
}
/*
* Shim to accomodate protocols that already do their own socket buffers
* management (marked with PR_SOCKBUF) with protocols that yet do not.
*
* Attach via socket(2) is different from attach via accept(2). In case of
* normal socket(2) syscall it is the pr_attach that calls soreserve(), even
* for protocols that don't yet do PR_SOCKBUF. In case of accepted connection
* it is our shim that calls soreserve() and the hiwat values are taken from
* the parent socket.
*/
static int
soattach(struct socket *so, int proto, struct thread *td, struct socket *head)
{
int error;
VNET_ASSERT(curvnet == so->so_vnet,
("%s: %p != %p", __func__, curvnet, so->so_vnet));
if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
}
if (head == NULL || (error = soreserve(so, head->sol_sbsnd_hiwat,
head->sol_sbrcv_hiwat)) == 0)
error = so->so_proto->pr_attach(so, proto, td);
if (error != 0 && (so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
}
return (error);
}
/*
* socreate returns a socket with a ref count of 1 and a file descriptor
* reference. The socket should be closed with soclose().
@@ -956,16 +987,8 @@ socreate(int dom, struct socket **aso, int type, int proto,
so_rdknl_assert_lock);
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
so_wrknl_assert_lock);
if ((prp->pr_flags & PR_SOCKBUF) == 0) {
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
}
/*
* Auto-sizing of socket buffers is managed by the protocols and
* the appropriate flags must be set in the pr_attach() method.
*/
CURVNET_SET(so->so_vnet);
error = prp->pr_attach(so, proto, td);
error = soattach(so, proto, td, NULL);
CURVNET_RESTORE();
if (error) {
sodealloc(so);
@@ -1192,13 +1215,6 @@ solisten_clone(struct socket *head)
so_rdknl_assert_lock);
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
so_wrknl_assert_lock);
VNET_SO_ASSERT(head);
if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
__func__, head->so_pcb);
return (NULL);
}
so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
@@ -1206,10 +1222,6 @@ solisten_clone(struct socket *head)
so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
so->so_snd.sb_flags = head->sol_sbsnd_flags &
(SB_AUTOSIZE | SB_AUTOLOWAT);
if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
}
return (so);
}
@@ -1223,7 +1235,7 @@ sonewconn(struct socket *head, int connstatus)
if ((so = solisten_clone(head)) == NULL)
return (NULL);
if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
if (soattach(so, 0, NULL, head) != 0) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
__func__, head->so_pcb);
@@ -1327,14 +1339,7 @@ sopeeloff(struct socket *head)
so_rdknl_assert_lock);
knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
so_wrknl_assert_lock);
VNET_SO_ASSERT(head);
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
__func__, head->so_pcb);
return (NULL);
}
if (so->so_proto->pr_attach(so, 0, NULL)) {
if (soattach(so, 0, NULL, head)) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
__func__, head->so_pcb);
@@ -1346,10 +1351,6 @@ sopeeloff(struct socket *head)
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
}
soref(so);
@@ -1906,6 +1907,8 @@ sofree(struct socket *so)
SOCK_SENDBUF_UNLOCK(so);
SOCK_RECVBUF_UNLOCK(so);
#endif
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
}
seldrain(&so->so_rdsel);
seldrain(&so->so_wrsel);
+8 -9
View File
@@ -479,7 +479,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
{
u_long sendspace, recvspace;
struct unpcb *unp;
int error;
int error, rcvmtxopts;
bool locked;
KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
@@ -494,6 +494,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
* limits to unpdg_recvspace.
*/
sendspace = recvspace = unpdg_recvspace;
rcvmtxopts = 0;
break;
case SOCK_STREAM:
@@ -505,14 +506,7 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
sendspace = unpsp_sendspace;
recvspace = unpsp_recvspace;
common:
/*
* XXXGL: we need to initialize the mutex with MTX_DUPOK.
* Ideally, protocols that have PR_SOCKBUF should be
* responsible for mutex initialization officially, and then
* this uglyness with mtx_destroy(); mtx_init(); would go away.
*/
mtx_destroy(&so->so_rcv_mtx);
mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF | MTX_DUPOK);
rcvmtxopts = MTX_DUPOK;
knlist_init(&so->so_wrsel.si_note, so, uipc_wrknl_lock,
uipc_wrknl_unlock, uipc_wrknl_assert_lock);
STAILQ_INIT(&so->so_rcv.uxst_mbq);
@@ -520,6 +514,8 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
default:
panic("uipc_attach");
}
mtx_init(&so->so_rcv_mtx, "unix so_rcv", NULL, MTX_DEF | rcvmtxopts);
mtx_init(&so->so_snd_mtx, "unix so_snd", NULL, MTX_DEF);
error = soreserve(so, sendspace, recvspace);
if (error)
return (error);
@@ -888,6 +884,9 @@ uipc_detach(struct socket *so)
MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
}
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
}
static int
+12 -3
View File
@@ -772,12 +772,21 @@ syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
NET_EPOCH_ASSERT();
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived.
* Creation of a socket via solisten_clone() bypasses call to pr_attach.
* That's why there is some pasted code from soattach() and from
* tcp_usr_attach() here. This should improve once TCP is PR_SOCKBUF.
*/
if ((so = solisten_clone(lso)) == NULL)
goto allocfail;
mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
so->so_snd.sb_mtx = &so->so_snd_mtx;
so->so_rcv.sb_mtx = &so->so_rcv_mtx;
error = soreserve(so, lso->sol_sbsnd_hiwat, lso->sol_sbrcv_hiwat);
if (error) {
sodealloc(so);
goto allocfail;
}
#ifdef MAC
mac_socketpeer_set_from_mbuf(m, so);
#endif
+8 -2
View File
@@ -330,14 +330,17 @@ nl_attach(struct socket *so, int proto, struct thread *td)
so, is_linux ? "(linux) " : "", curproc->p_pid,
nl_get_proto_name(proto));
nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
mtx_init(&so->so_snd_mtx, "netlink so_snd", NULL, MTX_DEF);
mtx_init(&so->so_rcv_mtx, "netlink so_rcv", NULL, MTX_DEF);
error = soreserve(so, nl_sendspace, nl_recvspace);
if (error != 0) {
free(nlp, M_PCB);
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
return (error);
}
TAILQ_INIT(&so->so_rcv.nl_queue);
TAILQ_INIT(&so->so_snd.nl_queue);
nlp = malloc(sizeof(struct nlpcb), M_PCB, M_WAITOK | M_ZERO);
so->so_pcb = nlp;
nlp->nl_socket = so;
nlp->nl_proto = proto;
@@ -517,6 +520,9 @@ nl_close(struct socket *so)
nl_buf_free(nb);
}
mtx_destroy(&so->so_snd_mtx);
mtx_destroy(&so->so_rcv_mtx);
NL_LOG(LOG_DEBUG3, "socket %p, detached", so);
/* XXX: is delayed free needed? */