nfsd: Add support for striped Flexible File layout

Without this patch, the NFSv4.1/4.2 pNFS server configuration
does not support striping.  This was mainly because the Linux
client driver did not support it either.  The Linux client
driver for the Flexible File layout now supports striping
(Linux kernel version 6.18 or newer).

As such, this patch adds striping support.  The configuration
is currently done via just two new sysctls:
vfs.nfsd.pnfsstripeunit - Size (in bytes) of a stripe
vfs.nfsd.pnfsstripecnt - # of DSs to stripe across
Setting the first sysctl to 0 and the second to 1
disables striping.  A patch that allows use of a different
striping configuration for each exported MDS file system
is planned for the future.

The pnfsdscopymr command may be broken by this patch, but
since no one has reported actually using a pNFS server
configuration, I do not believe that will be a problem at
this time.

Until the FreeBSD NFSv4.1/4.2 client is patched to handle
striped flexible file layouts, mounts to a striped pNFS
configuration must be done without the "pnfs" mount option.
(Linux systems with a kernel version of 6.18 or newer
should be able to handle a striped pNFS configuration.)

Future patches that convert the pNFS server to a
"loosely coupled" configuration (which allows the use
of non-FreeBSD servers as DSs) are anticipated.
This commit is contained in:
Rick Macklem
2026-06-03 18:26:36 -07:00
parent 67df313015
commit 72e57bc264
7 changed files with 908 additions and 314 deletions
+3
View File
@@ -199,9 +199,12 @@ struct nfsd_nfsd_args {
* NFSDEV_MAXMIRRORS - Maximum level of mirroring for a DS.
* (Most will only put files on two DSs, but this setting allows up to 4.)
* NFSDEV_MAXVERS - maximum number of NFS versions supported by Flex File.
* NFSDEV_MAXSTRIPE - sanity limit for maximum number of DSs used to stripe a
* file, which is nfsrv_maxstripecnt set by the nfsd option.
*/
#define NFSDEV_MAXMIRRORS 4
#define NFSDEV_MAXVERS 4
#define NFSDEV_MAXSTRIPE 1024
struct nfsd_pnfsd_args {
int op; /* Which pNFSd op to perform. */
+4 -4
View File
@@ -761,10 +761,10 @@ int nfsvno_testexp(struct nfsrv_descript *, struct nfsexstuff *);
uint32_t nfsrv_hashfh(fhandle_t *);
uint32_t nfsrv_hashsessionid(uint8_t *);
void nfsrv_backupstable(void);
int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, fhandle_t *,
char *);
int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *,
NFSPROC_T *, struct vnode **, fhandle_t *, char *, char *,
int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, uint64_t *, int *,
fhandle_t **, char **);
int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *, int *,
uint64_t *, NFSPROC_T *, struct vnode ***, fhandle_t **, char **, char *,
struct vnode **, struct nfsmount **, struct nfsmount *, int *, int *);
int nfsrv_dscreate(struct vnode *, struct vattr *, struct vattr *,
fhandle_t *, struct pnfsdsfile *, struct pnfsdsattr *, char *,
+13 -3
View File
@@ -282,9 +282,19 @@
#define NFSX_V4NAMEDATTRFH 3
#define NFSX_V4FILELAYOUT (4 * NFSX_UNSIGNED + NFSX_V4DEVICEID + \
NFSX_HYPER + NFSM_RNDUP(NFSX_V4PNFSFH))
#define NFSX_V4FLEXLAYOUT(m) (NFSX_HYPER + 3 * NFSX_UNSIGNED + \
((m) * (NFSX_V4DEVICEID + NFSX_STATEID + NFSM_RNDUP(NFSX_V4PNFSFH) + \
8 * NFSX_UNSIGNED)))
/*
* NFSX_V4FLEXLAYOUT() is the size in bytes of the XDR for ff_layout4, given
* m - # of mirrors
* s - # of stripes
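* NFSX_HYPER is for ffl_stripe_unit.
* "(m) + 3" counts the 32-bit words for the ffl_mirrors array count (1), the
* ffm_data_servers array count of each mirror (m), ffl_flags (1) and
* ffl_stats_collect_hint (1).
* The final section is the size of one ff_data_server4, multiplied by the
* m * s data servers in the layout.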
*/
#define NFSX_V4FLEXLAYOUT(m, s) (NFSX_HYPER + ((m) + 3) * NFSX_UNSIGNED + \
(((m) * (s)) * (NFSX_V4DEVICEID + NFSX_STATEID + \
NFSM_RNDUP(NFSX_V4PNFSFH) + 7 * NFSX_UNSIGNED)))
/* sizes common to multiple NFS versions */
#define NFSX_FHMAX (NFSX_V4FHMAX)
+13
View File
@@ -403,7 +403,20 @@ struct nfsdontlist {
* in the metadata file's extended attribute called pnfsd.dsfile.
*/
#define PNFS_FILENAME_LEN (2 * sizeof(fhandle_t))
/*
 * Same members as struct pnfsdsfile, but without the leading dsf_stripecnt
 * and dsf_stripesiz stripe fields.
 * NOTE(review): the "o" prefix suggests this is the previous (pre-striping)
 * layout, kept so already stored pnfsd.dsfile extended attributes can still
 * be interpreted -- confirm against the code that reads the attribute.
 * Do not reorder or resize members: the record is stored in an extended
 * attribute, so its layout presumably must stay stable.
 */
struct opnfsdsfile {
fhandle_t dsf_fh;
uint32_t dsf_dir;
/* Socket address, either IPv4 (sin) or IPv6 (sin6). */
union {
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
} dsf_nam;
/* PNFS_FILENAME_LEN (2 * sizeof(fhandle_t)) bytes plus one extra byte. */
char dsf_filename[PNFS_FILENAME_LEN + 1];
};
/* New structure with stripe fields. */
struct pnfsdsfile {
uint32_t dsf_stripecnt;
uint64_t dsf_stripesiz;
fhandle_t dsf_fh;
uint32_t dsf_dir;
union {
File diff suppressed because it is too large Load Diff
+11 -5
View File
@@ -5133,11 +5133,15 @@ nfsrvd_layoutget(struct nfsrv_descript *nd, __unused int isdgram,
}
layp = NULL;
#ifdef notnow
if (layouttype == NFSLAYOUT_NFSV4_1_FILES && nfsrv_maxpnfsmirror == 1)
layp = malloc(NFSX_V4FILELAYOUT, M_TEMP, M_WAITOK);
else if (layouttype == NFSLAYOUT_FLEXFILE)
layp = malloc(NFSX_V4FLEXLAYOUT(nfsrv_maxpnfsmirror), M_TEMP,
M_WAITOK);
#else
if (layouttype == NFSLAYOUT_FLEXFILE)
#endif
layp = malloc(NFSX_V4FLEXLAYOUT(NFSDEV_MAXMIRRORS,
NFSDEV_MAXSTRIPE), M_TEMP, M_WAITOK);
else
nd->nd_repstat = NFSERR_UNKNLAYOUTTYPE;
if (layp != NULL)
@@ -5692,7 +5696,7 @@ nfsrvd_allocate(struct nfsrv_descript *nd, __unused int isdgram,
nfsquad_t clientid;
nfsattrbit_t attrbits;
if (!nfsrv_doallocate) {
if (!nfsrv_doallocate || nfsrv_devidcnt > 0) {
/*
* If any exported file system, such as a ZFS one, cannot
* do VOP_ALLOCATE(), this operation cannot be supported
@@ -5824,9 +5828,9 @@ nfsrvd_deallocate(struct nfsrv_descript *nd, __unused int isdgram,
}
stp->ls_stateid.other[2] = *tl++;
/*
* Don't allow this to be done for a DS.
* Don't allow this to be done for a DS or MDS.
*/
if ((nd->nd_flag & ND_DSSERVER) != 0)
if ((nd->nd_flag & ND_DSSERVER) != 0 || nfsrv_devidcnt > 0)
nd->nd_repstat = NFSERR_NOTSUPP;
/* However, allow the proxy stateid. */
if (stp->ls_stateid.seqid == 0xffffffff &&
@@ -6361,6 +6365,8 @@ nfsrvd_seek(struct nfsrv_descript *nd, __unused int isdgram,
nd->nd_repstat = NFSERR_WRONGTYPE;
if (nd->nd_repstat == 0 && off < 0)
nd->nd_repstat = NFSERR_NXIO;
if (nd->nd_repstat == 0 && nfsrv_devidcnt > 0)
nd->nd_repstat = NFSERR_NOTSUPP;
if (nd->nd_repstat == 0) {
/* Check permissions for the input file. */
NFSZERO_ATTRBIT(&attrbits);
+47 -44
View File
@@ -236,7 +236,8 @@ static int nfsrv_fndclid(nfsquad_t *clidvec, nfsquad_t clid, int clidcnt);
static struct nfslayout *nfsrv_filelayout(struct nfsrv_descript *nd, int iomode,
fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs);
static struct nfslayout *nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode,
int mirrorcnt, fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs);
int mirrorcnt, uint64_t stripesiz, int stripecnt, fhandle_t *fhp,
fhandle_t *dsfhp, char *devid, fsid_t fs);
static int nfsrv_dontlayout(fhandle_t *fhp);
static int nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf,
vnode_t dvp, struct nfsdevice *ds, struct ucred *cred, NFSPROC_T *p,
@@ -6626,7 +6627,8 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp,
struct nfslayout *lyp;
char *devid;
fhandle_t fh, *dsfhp;
int error, mirrorcnt;
int error, mirrorcnt, stripecnt;
uint64_t stripesiz;
if (nfsrv_devidcnt == 0)
return (NFSERR_UNKNLAYOUTTYPE);
@@ -6723,9 +6725,8 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp,
NFSUNLOCKLAYOUT(lhyp);
/* Find the device id and file handle. */
dsfhp = malloc(sizeof(fhandle_t) * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK);
devid = malloc(NFSX_V4DEVICEID * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK);
error = nfsrv_dsgetdevandfh(vp, p, &mirrorcnt, dsfhp, devid);
error = nfsrv_dsgetdevandfh(vp, p, &mirrorcnt, &stripesiz, &stripecnt,
&dsfhp, &devid);
NFSD_DEBUG(4, "layoutget devandfh=%d\n", error);
if (error == 0) {
if (layouttype == NFSLAYOUT_NFSV4_1_FILES) {
@@ -6735,11 +6736,11 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp,
lyp = nfsrv_filelayout(nd, *iomode, &fh, dsfhp,
devid, vp->v_mount->mnt_stat.f_fsid);
} else {
if (NFSX_V4FLEXLAYOUT(mirrorcnt) > maxcnt)
if (NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt) > maxcnt)
error = NFSERR_TOOSMALL;
else
lyp = nfsrv_flexlayout(nd, *iomode, mirrorcnt,
&fh, dsfhp, devid,
stripesiz, stripecnt, &fh, dsfhp, devid,
vp->v_mount->mnt_stat.f_fsid);
}
}
@@ -6814,15 +6815,16 @@ nfsrv_filelayout(struct nfsrv_descript *nd, int iomode, fhandle_t *fhp,
*/
static struct nfslayout *
nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, int mirrorcnt,
fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs)
uint64_t stripesiz, int stripecnt, fhandle_t *fhp, fhandle_t *dsfhp,
char *devid, fsid_t fs)
{
uint32_t *tl;
struct nfslayout *lyp;
uint64_t lenval;
int i;
int i, j;
lyp = malloc(sizeof(struct nfslayout) + NFSX_V4FLEXLAYOUT(mirrorcnt),
M_NFSDSTATE, M_WAITOK | M_ZERO);
lyp = malloc(sizeof(struct nfslayout) +
NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt), M_NFSDSTATE,
M_WAITOK | M_ZERO);
lyp->lay_type = NFSLAYOUT_FLEXFILE;
if (iomode == NFSLAYOUTIOMODE_RW)
lyp->lay_flags = NFSLAY_RW;
@@ -6836,41 +6838,42 @@ nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, int mirrorcnt,
/* Fill in the xdr for the files layout. */
tl = (uint32_t *)lyp->lay_xdr;
lenval = 0;
txdr_hyper(lenval, tl); tl += 2; /* Stripe unit. */
txdr_hyper(stripesiz, tl); tl += 2; /* Stripe unit. */
*tl++ = txdr_unsigned(mirrorcnt); /* # of mirrors. */
for (i = 0; i < mirrorcnt; i++) {
*tl++ = txdr_unsigned(1); /* One stripe. */
NFSBCOPY(devid, tl, NFSX_V4DEVICEID); /* Device ID. */
tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
devid += NFSX_V4DEVICEID;
*tl++ = txdr_unsigned(1); /* Efficiency. */
*tl++ = 0; /* Proxy Stateid. */
*tl++ = 0x55555555;
*tl++ = 0x55555555;
*tl++ = 0x55555555;
*tl++ = txdr_unsigned(1); /* 1 file handle. */
*tl++ = txdr_unsigned(NFSX_V4PNFSFH);
NFSBCOPY(dsfhp, tl, sizeof(*dsfhp));
tl += (NFSM_RNDUP(NFSX_V4PNFSFH) / NFSX_UNSIGNED);
dsfhp++;
if (nfsrv_flexlinuxhack != 0) {
*tl++ = txdr_unsigned(strlen(FLEX_UID0));
*tl = 0; /* 0 pad string. */
NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
*tl++ = txdr_unsigned(strlen(FLEX_UID0));
*tl = 0; /* 0 pad string. */
NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
} else {
*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
*tl++ = txdr_unsigned(stripecnt); /* Stripe cnt. */
for (j = 0; j < stripecnt; j++) {
NFSBCOPY(devid, tl, NFSX_V4DEVICEID); /* Device ID. */
tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
devid += NFSX_V4DEVICEID;
*tl++ = txdr_unsigned(1); /* Efficiency. */
*tl++ = 0; /* Proxy Stateid. */
*tl++ = 0x55555555;
*tl++ = 0x55555555;
*tl++ = 0x55555555;
*tl++ = txdr_unsigned(1); /* 1 file handle. */
*tl++ = txdr_unsigned(NFSX_V4PNFSFH);
NFSBCOPY(dsfhp, tl, sizeof(*dsfhp));
tl += (NFSM_RNDUP(NFSX_V4PNFSFH) / NFSX_UNSIGNED);
dsfhp++;
if (nfsrv_flexlinuxhack != 0) {
*tl++ = txdr_unsigned(strlen(FLEX_UID0));
*tl = 0; /* 0 pad string. */
NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
*tl++ = txdr_unsigned(strlen(FLEX_UID0));
*tl = 0; /* 0 pad string. */
NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
} else {
*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
}
}
}
*tl++ = txdr_unsigned(0); /* ff_flags. */
*tl = txdr_unsigned(60); /* Status interval hint. */
lyp->lay_layoutlen = NFSX_V4FLEXLAYOUT(mirrorcnt);
lyp->lay_layoutlen = NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt);
return (lyp);
}
@@ -7126,7 +7129,7 @@ nfsrv_layoutreturn(struct nfsrv_descript *nd, vnode_t vp,
error = nfsvno_getfh(vp, &fh, p);
if (error == 0) {
error = nfsrv_updatemdsattr(vp, &na, p);
if (error != 0)
if (error != 0 && error != ESTALE)
printf("nfsrv_layoutreturn: updatemdsattr"
" failed=%d\n", error);
}
@@ -8454,7 +8457,7 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
struct vnode *vp, *curvp;
struct pnfsdsfile *pf;
struct nfsmount *nmp, *curnmp;
int dsdir, error, mirrorcnt, ippos;
int dsdir, error, ippos;
vp = NULL;
curvp = NULL;
@@ -8591,7 +8594,7 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
* on the MDS file (as checked via the nmp argument),
* nfsrv_dsgetsockmnt() returns EEXIST, so no copying will occur.
*/
error = nfsrv_dsgetsockmnt(vp, 0, buf, buflenp, &mirrorcnt, p,
error = nfsrv_dsgetsockmnt(vp, 0, buf, buflenp, NULL, NULL, NULL, p,
NULL, NULL, NULL, fname, nvpp, &nmp, curnmp, &ippos, &dsdir);
if (curvp != NULL)
vput(curvp);