diff --git a/sys/fs/nfs/nfs.h b/sys/fs/nfs/nfs.h index b30e4f17b7c..1056595e8cd 100644 --- a/sys/fs/nfs/nfs.h +++ b/sys/fs/nfs/nfs.h @@ -199,9 +199,12 @@ struct nfsd_nfsd_args { * NFSDEV_MAXMIRRORS - Maximum level of mirroring for a DS. * (Most will only put files on two DSs, but this setting allows up to 4.) * NFSDEV_MAXVERS - maximum number of NFS versions supported by Flex File. + * NFSDEV_MAXSTRIPE - sanity limit for maximum number of DSs used to stripe a + * file, which is nfsrv_maxstripecnt set by the nfsd option. */ #define NFSDEV_MAXMIRRORS 4 #define NFSDEV_MAXVERS 4 +#define NFSDEV_MAXSTRIPE 1024 struct nfsd_pnfsd_args { int op; /* Which pNFSd op to perform. */ diff --git a/sys/fs/nfs/nfs_var.h b/sys/fs/nfs/nfs_var.h index 8d8de381cba..00ed32ae705 100644 --- a/sys/fs/nfs/nfs_var.h +++ b/sys/fs/nfs/nfs_var.h @@ -761,10 +761,10 @@ int nfsvno_testexp(struct nfsrv_descript *, struct nfsexstuff *); uint32_t nfsrv_hashfh(fhandle_t *); uint32_t nfsrv_hashsessionid(uint8_t *); void nfsrv_backupstable(void); -int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, fhandle_t *, - char *); -int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *, - NFSPROC_T *, struct vnode **, fhandle_t *, char *, char *, +int nfsrv_dsgetdevandfh(struct vnode *, NFSPROC_T *, int *, uint64_t *, int *, + fhandle_t **, char **); +int nfsrv_dsgetsockmnt(struct vnode *, int, char *, int *, int *, int *, + uint64_t *, NFSPROC_T *, struct vnode ***, fhandle_t **, char **, char *, struct vnode **, struct nfsmount **, struct nfsmount *, int *, int *); int nfsrv_dscreate(struct vnode *, struct vattr *, struct vattr *, fhandle_t *, struct pnfsdsfile *, struct pnfsdsattr *, char *, diff --git a/sys/fs/nfs/nfsproto.h b/sys/fs/nfs/nfsproto.h index 1054c85d8e8..467018c3953 100644 --- a/sys/fs/nfs/nfsproto.h +++ b/sys/fs/nfs/nfsproto.h @@ -282,9 +282,19 @@ #define NFSX_V4NAMEDATTRFH 3 #define NFSX_V4FILELAYOUT (4 * NFSX_UNSIGNED + NFSX_V4DEVICEID + \ NFSX_HYPER + NFSM_RNDUP(NFSX_V4PNFSFH)) -#define NFSX_V4FLEXLAYOUT(m) (NFSX_HYPER + 3 * NFSX_UNSIGNED + \ - ((m) * (NFSX_V4DEVICEID + NFSX_STATEID + NFSM_RNDUP(NFSX_V4PNFSFH) + \ - 8 * NFSX_UNSIGNED))) + +/* + * NFSX_V4FLEXLAYOUT() is the size in bytes of the XDR for ff_layout4, given + * m - # of mirrors + * s - # of stripes + * NFSX_HYPER is for ffl_stripe_unit + * "m + 3" calculates the counts of ffl_mirrors and ffm_data_servers plus + * ffl_flags and ffl_stats_collect_hint. + * The final section calculates the size of ff_data_server4. + */ +#define NFSX_V4FLEXLAYOUT(m, s) (NFSX_HYPER + ((m) + 3) * NFSX_UNSIGNED + \ + (((m) * (s)) * (NFSX_V4DEVICEID + NFSX_STATEID + \ + NFSM_RNDUP(NFSX_V4PNFSFH) + 7 * NFSX_UNSIGNED))) /* sizes common to multiple NFS versions */ #define NFSX_FHMAX (NFSX_V4FHMAX) diff --git a/sys/fs/nfs/nfsrvstate.h b/sys/fs/nfs/nfsrvstate.h index 0e93f87234b..91365f66bde 100644 --- a/sys/fs/nfs/nfsrvstate.h +++ b/sys/fs/nfs/nfsrvstate.h @@ -403,7 +403,20 @@ struct nfsdontlist { * in the metadata file's extended attribute called pnfsd.dsfile. */ #define PNFS_FILENAME_LEN (2 * sizeof(fhandle_t)) +struct opnfsdsfile { + fhandle_t dsf_fh; + uint32_t dsf_dir; + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } dsf_nam; + char dsf_filename[PNFS_FILENAME_LEN + 1]; +}; + +/* New structure with stripe fields. */ struct pnfsdsfile { + uint32_t dsf_stripecnt; + uint64_t dsf_stripesiz; fhandle_t dsf_fh; uint32_t dsf_dir; union { diff --git a/sys/fs/nfsserver/nfs_nfsdport.c b/sys/fs/nfsserver/nfs_nfsdport.c index 1e6936ed351..7fc8c426ba6 100644 --- a/sys/fs/nfsserver/nfs_nfsdport.c +++ b/sys/fs/nfsserver/nfs_nfsdport.c @@ -71,6 +71,9 @@ extern int nfs_bufpackets; extern u_long sb_max_adj; extern struct nfsv4lock nfsv4rootfs_lock; +static uint64_t nfsrv_stripesiz = 0; +static int nfsrv_maxstripecnt = 1; + VNET_DECLARE(int, nfsrv_numnfsd); VNET_DECLARE(struct nfsrv_stablefirst, nfsrv_stablefirst); VNET_DECLARE(SVCPOOL *, nfsrvd_pool); @@ -132,7 +135,7 @@ static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **, static void nfs_dtypetovtype(struct nfsvattr *, struct vnode *, uint8_t); static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *, NFSPROC_T *); -static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **, +static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode ***, int *, char *, fhandle_t *); static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *, NFSPROC_T *); @@ -141,27 +144,36 @@ static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *, struct nfsvattr *, struct acl *, off_t *, int, bool *); static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *); static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *, - NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **); + NFSPROC_T *, struct nfsmount **, int, int, uint64_t, struct mbuf **, + struct mbuf **, int *); static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *, - NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **, - char *, int *); + NFSPROC_T *, struct vnode *, struct nfsmount **, int, int, uint64_t, + struct mbuf **, char *, int *); +#ifdef notnow static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_deallocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *); static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct acl *, int *); -static int nfsrv_setattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, - struct vnode *, struct nfsmount **, int, struct nfsvattr *, int *); -static int nfsrv_getattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *, - struct vnode *, struct nfsmount *, struct nfsvattr *); +#endif +static int nfsrv_setattrdsrpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount **, int, int, struct nfsvattr *, int *); +static int nfsrv_setattrdsdorpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount *, struct nfsvattr *, struct nfsvattr *); +static int nfsrv_getattrdsrpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount **, int, struct nfsvattr *, int *); +static int nfsrv_getattrdsdorpc(fhandle_t *, struct vnode *, struct ucred *, + NFSPROC_T *, struct nfsmount *, struct nfsvattr *); +#ifdef notnow static int nfsrv_seekdsrpc(fhandle_t *, off_t *, int, bool *, struct ucred *, NFSPROC_T *, struct nfsmount *); +#endif static int nfsrv_putfhname(fhandle_t *, char *); static int nfsrv_pnfslookupds(struct vnode *, struct vnode *, struct pnfsdsfile *, struct vnode **, NFSPROC_T *); -static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char *, char *, - struct vnode *, NFSPROC_T *); +static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char **, + char *, struct vnode *, NFSPROC_T *); static int nfsrv_dsremove(struct vnode *, char *, struct ucred *, NFSPROC_T *); static int nfsrv_dssetacl(struct vnode *, struct acl *, struct ucred *, NFSPROC_T *); @@ -330,6 +342,55 @@ SYSCTL_PROC(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_dolocallocks, "IU", "Enable nfsd to acquire local locks on files"); +static int +sysctl_stripecnt(SYSCTL_HANDLER_ARGS) +{ + int error, newmaxstripecnt; + + newmaxstripecnt = nfsrv_maxstripecnt; + error = sysctl_handle_int(oidp, &newmaxstripecnt, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (newmaxstripecnt == nfsrv_maxstripecnt) + return (0); + if (newnfs_numnfsd > 0) + return (EPERM); + if (jailed(curthread->td_ucred)) + return (EINVAL); + if (newmaxstripecnt <= 0 || newmaxstripecnt > NFSDEV_MAXSTRIPE) + return (EINVAL); + nfsrv_maxstripecnt = newmaxstripecnt; + return (0); +} +SYSCTL_PROC(_vfs_nfsd, OID_AUTO, pnfsstripecnt, + CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_stripecnt, "IU", "Set the #stripes for a pNFS server"); + +static int +sysctl_stripeunit(SYSCTL_HANDLER_ARGS) +{ + uint64_t newstripesiz; + int error; + + newstripesiz = nfsrv_stripesiz; + error = sysctl_handle_64(oidp, &newstripesiz, 0, req); + if (error != 0 || req->newptr == NULL) + return (error); + if (newstripesiz == nfsrv_stripesiz) + return (0); + if (newnfs_numnfsd > 0) + return (EPERM); + if (jailed(curthread->td_ucred)) + return (EINVAL); + nfsrv_stripesiz = newstripesiz; + if (newstripesiz == 0) + nfsrv_maxstripecnt = 1; + return (0); +} +SYSCTL_PROC(_vfs_nfsd, OID_AUTO, pnfsstripeunit, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_stripeunit, "QU", "Set the stripe unit length for a pNFS server"); + #define MAX_REORDERED_RPC 16 #define NUM_HEURISTIC 1031 #define NHUSE_INIT 64 @@ -1492,14 +1553,14 @@ int nfsvno_removesub(struct nameidata *ndp, bool is_v4, struct nfsrv_descript *nd, struct thread *p, struct nfsexstuff *exp) { - struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct vnode *vp, **dsdvpp, *newvp; struct mount *mp; - int error = 0, mirrorcnt, ret; + int error = 0, dsfilecnt, ret; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh; vp = ndp->ni_vp; - dsdvp[0] = NULL; + dsdvpp = NULL; if (vp->v_type == VDIR) { error = NFSERR_ISDIR; } else if (is_v4) { @@ -1510,12 +1571,18 @@ nfsvno_removesub(struct nameidata *ndp, bool is_v4, struct nfsrv_descript *nd, error = nfsrv_checkremove(vp, 1, NULL, nd->nd_clientid, p); } - if (error == 0) - nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh); - if (!error) + if (error == 0) { + nfsrv_pnfsremovesetup(vp, p, &dsdvpp, &dsfilecnt, fname, &fh); + NFSD_DEBUG(4, "nfsrv_pnfsremovesetup err=%d dsfilecnt=%d\n", + error, dsfilecnt); error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd); - if (error == 0 && dsdvp[0] != NULL) - nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); + } + if (error == 0 && dsdvpp != NULL) { + nfsrv_pnfsremove(dsdvpp, dsfilecnt, fname, &fh, p); + NFSD_DEBUG(4, "aft nfsrv_pnfsremove dsfilecnt=%d fname=%s\n", + dsfilecnt, fname); + } + free(dsdvpp, M_TEMP); if (is_v4 && (nd->nd_flag & ND_NFSV41) != 0 && error == 0) error = nfsvno_getfh(vp, &fh, p); if (ndp->ni_dvp == vp) @@ -1596,13 +1663,13 @@ int nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, struct nfsrv_descript *nd, struct thread *p) { - struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS], *newvp; + struct vnode *fvp, *tvp, *tdvp, **dsdvpp, *newvp; struct mount *mp; - int error = 0, mirrorcnt, ret; + int error = 0, dsfilecnt, ret; char fname[PNFS_FILENAME_LEN + 1]; fhandle_t fh, fh2; - dsdvp[0] = NULL; + dsdvpp = NULL; fvp = fromndp->ni_vp; if (nd->nd_repstat != 0) { vrele(fromndp->ni_dvp); @@ -1693,10 +1760,10 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, if ((nd->nd_flag & ND_NFSV41) != 0) error = nfsvno_getfh(tvp, &fh2, p); if (error == 0) - nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, - &fh); + nfsrv_pnfsremovesetup(tvp, p, &dsdvpp, &dsfilecnt, + fname, &fh); NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup" - " dsdvp=%p\n", dsdvp[0]); + " dsdvpp=%p\n", dsdvpp); } out: mp = NULL; @@ -1738,13 +1805,14 @@ nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp, } /* - * If dsdvp[0] != NULL, it was set up by nfsrv_pnfsremovesetup() and + * If dsdvpp != NULL, it was set up by nfsrv_pnfsremovesetup() and * if the rename succeeded, the DS file for the tvp needs to be * removed. */ - if (error == 0 && dsdvp[0] != NULL) { - nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p); + if (error == 0 && dsdvpp != NULL) { + nfsrv_pnfsremove(dsdvpp, dsfilecnt, fname, &fh, p); NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n"); + free(dsdvpp, M_TEMP); } /* Use ret to determine if the file still exists. */ @@ -4678,26 +4746,31 @@ static void nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, NFSPROC_T *p) { - struct nfsrvdscreate *dsc, *tdsc = NULL; + struct nfsrvdscreate *dsc, *tdsc; struct nfsdevice *ds, *tds, *fds; struct mount *mp; struct pnfsdsfile *pf, *tpf; struct pnfsdsattr dsattr; struct vattr va; - struct vnode *dvp[NFSDEV_MAXMIRRORS]; + struct vnode **dvp; struct nfsmount *nmp; fhandle_t fh; uid_t vauid; gid_t vagid; u_short vamode; struct ucred *tcred; - int dsdir[NFSDEV_MAXMIRRORS], error, i, mirrorcnt, ret; + int *dsdir, error, i, j, mirrorcnt, ret, stripecnt; int failpos, timo; /* Get a DS server directory in a round-robin order. */ mirrorcnt = 1; mp = vp->v_mount; ds = fds = NULL; + i = j = 0; + dvp = malloc(sizeof(*dvp) * nfsrv_maxpnfsmirror * nfsrv_maxstripecnt, + M_TEMP, M_WAITOK); + dsdir = malloc(sizeof(*dsdir) * nfsrv_maxpnfsmirror * + nfsrv_maxstripecnt, M_TEMP, M_WAITOK); NFSDDSLOCK(); /* * Search for the first entry that handles this MDS fs, but use the @@ -4705,49 +4778,85 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, */ TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL) { + i++; if (tds->nfsdev_mdsisset == 0 && ds == NULL) ds = tds; else if (tds->nfsdev_mdsisset != 0 && fsidcmp( &mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0) { - ds = fds = tds; - break; + if (j == 0) + ds = fds = tds; + if (nfsrv_stripesiz == 0) + break; + j++; } } } if (ds == NULL) { NFSDDSUNLOCK(); + free(dvp, M_TEMP); + free(dsdir, M_TEMP); NFSD_DEBUG(4, "nfsrv_pnfscreate: no srv\n"); return; } + + /* + * i or j is the count of devices. The stripecnt is that number + * of devices devided by the number of mirrors. + */ + stripecnt = 0; + if (nfsrv_stripesiz > 0) { + if (j > 0) + stripecnt = j / nfsrv_maxpnfsmirror; + else + stripecnt = i / nfsrv_maxpnfsmirror; + if (stripecnt > nfsrv_maxstripecnt) + stripecnt = nfsrv_maxstripecnt; + } + if (stripecnt == 0) + stripecnt = 1; + + /* Set the first device as found above. */ i = dsdir[0] = ds->nfsdev_nextdir; ds->nfsdev_nextdir = (ds->nfsdev_nextdir + 1) % nfsrv_dsdirsize; dvp[0] = ds->nfsdev_dsdir[i]; tds = TAILQ_NEXT(ds, nfsdev_list); - if (nfsrv_maxpnfsmirror > 1 && tds != NULL) { + if ((nfsrv_maxpnfsmirror > 1 || stripecnt > 1) && tds != NULL) { + j = 1; /* Stripe number */ + mirrorcnt = 0; TAILQ_FOREACH_FROM(tds, &nfsrv_devidhead, nfsdev_list) { if (tds->nfsdev_nmp != NULL && ((tds->nfsdev_mdsisset == 0 && fds == NULL) || (tds->nfsdev_mdsisset != 0 && fds != NULL && fsidcmp(&mp->mnt_stat.f_fsid, &tds->nfsdev_mdsfsid) == 0))) { - dsdir[mirrorcnt] = i; - dvp[mirrorcnt] = tds->nfsdev_dsdir[i]; - mirrorcnt++; - if (mirrorcnt >= nfsrv_maxpnfsmirror) - break; + dsdir[mirrorcnt * stripecnt + j] = i; + dvp[mirrorcnt * stripecnt + j] = + tds->nfsdev_dsdir[i]; + j++; + if (j >= stripecnt) { + mirrorcnt++; + if (mirrorcnt >= nfsrv_maxpnfsmirror) + break; + j = 0; + } } } - } + if (mirrorcnt == 0) { + mirrorcnt = 1; + stripecnt = j; + } + } else + stripecnt = 1; /* Put at end of list to implement round-robin usage. */ TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list); TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list); NFSDDSUNLOCK(); dsc = NULL; - if (mirrorcnt > 1) - tdsc = dsc = malloc(sizeof(*dsc) * (mirrorcnt - 1), M_TEMP, + j = mirrorcnt * stripecnt; + if (j > 1) + tdsc = dsc = malloc(sizeof(*dsc) * (j - 1), M_TEMP, M_WAITOK | M_ZERO); - tpf = pf = malloc(sizeof(*pf) * nfsrv_maxpnfsmirror, M_TEMP, M_WAITOK | - M_ZERO); + tpf = pf = malloc(sizeof(*pf) * j, M_TEMP, M_WAITOK | M_ZERO); error = nfsvno_getfh(vp, &fh, p); if (error == 0) @@ -4775,7 +4884,9 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, * additional mirrors. */ failpos = -1; - for (i = 0; i < mirrorcnt - 1 && error == 0; i++, tpf++, tdsc++) { + for (i = 0; i < j - 1 && error == 0; i++, tpf++, tdsc++) { + tpf->dsf_stripecnt = stripecnt; + tpf->dsf_stripesiz = nfsrv_stripesiz; tpf->dsf_dir = dsdir[i]; tdsc->tcred = tcred; tdsc->p = p; @@ -4806,11 +4917,13 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, } } if (error == 0) { - tpf->dsf_dir = dsdir[mirrorcnt - 1]; - error = nfsrv_dscreate(dvp[mirrorcnt - 1], vap, &va, &fh, tpf, + tpf->dsf_stripecnt = stripecnt; + tpf->dsf_stripesiz = nfsrv_stripesiz; + tpf->dsf_dir = dsdir[j - 1]; + error = nfsrv_dscreate(dvp[j - 1], vap, &va, &fh, tpf, &dsattr, NULL, tcred, p, NULL); - if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(error)) { - failpos = mirrorcnt - 1; + if (failpos == -1 && j > 1 && nfsds_failerr(error)) { + failpos = j - 1; error = 0; } } @@ -4818,7 +4931,7 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ - for (tdsc = dsc, i = 0; i < mirrorcnt - 1; i++, tdsc++) { + for (tdsc = dsc, i = 0; i < j - 1; i++, tdsc++) { while (tdsc->inprog != 0 && tdsc->done == 0) tsleep(&tdsc->tsk, PVFS, "srvdcr", timo); if (tdsc->err != 0) { @@ -4830,7 +4943,7 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, } /* - * If failpos has been set, that mirror has failed, so it needs + * If failpos has been set, that DS has failed, so it needs * to be disabled. */ if (failpos >= 0) { @@ -4855,25 +4968,12 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, NFSFREECRED(tcred); if (error == 0) { ASSERT_VOP_ELOCKED(vp, "nfsrv_pnfscreate vp"); - - NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d maxmirror=%d\n", - mirrorcnt, nfsrv_maxpnfsmirror); - /* - * For all mirrors that couldn't be created, fill in the - * *pf structure, but with an IP address == 0.0.0.0. - */ - tpf = pf + mirrorcnt; - for (i = mirrorcnt; i < nfsrv_maxpnfsmirror; i++, tpf++) { - *tpf = *pf; - tpf->dsf_sin.sin_family = AF_INET; - tpf->dsf_sin.sin_len = sizeof(struct sockaddr_in); - tpf->dsf_sin.sin_addr.s_addr = 0; - tpf->dsf_sin.sin_port = 0; - } + NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d stripecnt=%d\n", + mirrorcnt, stripecnt); error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", - sizeof(*pf) * nfsrv_maxpnfsmirror, (char *)pf, p); + sizeof(*pf) * mirrorcnt * stripecnt, (char *)pf, p); if (error == 0) error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", @@ -4883,25 +4983,28 @@ nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred, error); } else printf("pNFS: pnfscreate=%d\n", error); + free(dvp, M_TEMP); + free(dsdir, M_TEMP); free(pf, M_TEMP); free(dsc, M_TEMP); } /* * Get the information needed to remove the pNFS Data Server file from the - * Metadata file. Upon success, ddvp is set non-NULL to the locked - * DS directory vnode. The caller must unlock *ddvp when done with it. + * Metadata file. Upon success, *dvppp is set to an array of locked + * DS directory vnode(s). The caller must unlock this array of *dvp when done + * with it. */ static void -nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp, - int *mirrorcntp, char *fname, fhandle_t *fhp) +nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode ***dvppp, + int *dsfilecntp, char *fname, fhandle_t *fhp) { struct vattr va; struct ucred *tcred; char *buf; int buflen, error; - dvpp[0] = NULL; + *dvppp = NULL; /* If not an exported regular file or not a pNFS server, just return. */ if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 || nfsrv_devidcnt == 0) @@ -4924,11 +5027,12 @@ nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp, return; } - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; buf = malloc(buflen, M_TEMP, M_WAITOK); /* Get the directory vnode for the DS mount and the file handle. */ - error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, dvpp, - NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL); + error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, dsfilecntp, NULL, NULL, + p, dvppp, NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL); free(buf, M_TEMP); if (error != 0) printf("pNFS: nfsrv_pnfsremovesetup getsockmnt=%d\n", error); @@ -5006,28 +5110,31 @@ start_dsremove(void *arg, int pending) * removed to set up the dvp and fill in the FH. */ static void -nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, - NFSPROC_T *p) +nfsrv_pnfsremove(struct vnode **dvpp, int dsfilecnt, char *fname, + fhandle_t *fhp, NFSPROC_T *p) { struct ucred *tcred; struct nfsrvdsremove *dsrm, *tdsrm; struct nfsdevice *ds; struct nfsmount *nmp; + struct vnode **tdvpp; int failpos, i, ret, timo; tcred = newnfs_getcred(); dsrm = NULL; - if (mirrorcnt > 1) - dsrm = malloc(sizeof(*dsrm) * mirrorcnt - 1, M_TEMP, M_WAITOK); + if (dsfilecnt > 1) + tdsrm = dsrm = malloc(sizeof(*dsrm) * dsfilecnt - 1, M_TEMP, + M_WAITOK); /* * Remove the file on each DS mirror, using kernel process(es) for the * additional mirrors. */ failpos = -1; - for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { + tdvpp = dvpp; + for (i = 0; i < dsfilecnt - 1; i++, tdsrm++, tdvpp++) { tdsrm->tcred = tcred; tdsrm->p = p; - tdsrm->dvp = dvp[i]; + tdsrm->dvp = *tdvpp; strlcpy(tdsrm->fname, fname, PNFS_FILENAME_LEN + 1); tdsrm->inprog = 0; tdsrm->done = 0; @@ -5038,19 +5145,19 @@ nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, NFSD_DEBUG(4, "nfsrv_pnfsremove: nfs_pnfsio=%d\n", ret); } if (ret != 0) { - ret = nfsrv_dsremove(dvp[i], fname, tcred, p); + ret = nfsrv_dsremove(tdsrm->dvp, fname, tcred, p); if (failpos == -1 && nfsds_failerr(ret)) failpos = i; } } - ret = nfsrv_dsremove(dvp[mirrorcnt - 1], fname, tcred, p); - if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(ret)) - failpos = mirrorcnt - 1; + ret = nfsrv_dsremove(*tdvpp, fname, tcred, p); + if (failpos == -1 && dsfilecnt > 1 && nfsds_failerr(ret)) + failpos = dsfilecnt - 1; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; /* Wait for kernel task(s) to complete. */ - for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) { + for (tdsrm = dsrm, i = 0; i < dsfilecnt - 1; i++, tdsrm++) { while (tdsrm->inprog != 0 && tdsrm->done == 0) tsleep(&tdsrm->tsk, PVFS, "srvdsrm", timo); if (failpos == -1 && nfsds_failerr(tdsrm->err)) @@ -5062,7 +5169,8 @@ nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp, * to be disabled. */ if (failpos >= 0) { - nmp = VFSTONFS(dvp[failpos]->v_mount); + tdvpp = dvpp + failpos; + nmp = VFSTONFS((*tdvpp)->v_mount); NFSLOCKMNT(nmp); if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM | NFSMNTP_CANCELRPCS)) == 0) { @@ -5147,14 +5255,16 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp, off_t *offp, int content, bool *eofp) { - struct nfsmount *nmp[NFSDEV_MAXMIRRORS], *failnmp; - fhandle_t fh[NFSDEV_MAXMIRRORS]; - struct vnode *dvp[NFSDEV_MAXMIRRORS]; + struct nfsmount **nmp, *failnmp; + fhandle_t *fhp; + struct vnode **dvp; struct nfsdevice *ds; struct pnfsdsattr dsattr; struct opnfsdsattr odsattr; char *buf; int buflen, error, failpos, i, mirrorcnt, origmircnt, trycnt; + int stripecnt; + uint64_t stripesiz; NFSD_DEBUG(4, "in nfsrv_proxyds\n"); /* @@ -5165,7 +5275,8 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, nfsrv_devidcnt == 0) return (ENOENT); - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; buf = malloc(buflen, M_TEMP, M_WAITOK); error = 0; @@ -5228,23 +5339,34 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, origmircnt = -1; trycnt = 0; tryagain: + nmp = NULL; + dvp = NULL; + fhp = NULL; if (error == 0) { - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; if (ioproc == NFSPROC_READDS && NFSVOPISLOCKED(vp) == LK_EXCLUSIVE) printf("nfsrv_proxyds: Readds vp exclusively locked\n"); + if (ioproc == NFSPROC_WRITEDS && NFSVOPISLOCKED(vp) == + LK_SHARED) + NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY); error = nfsrv_dsgetsockmnt(vp, LK_SHARED, buf, &buflen, - &mirrorcnt, p, dvp, fh, NULL, NULL, NULL, NULL, NULL, - NULL, NULL); + &mirrorcnt, &stripecnt, &stripesiz, p, &dvp, &fhp, NULL, + NULL, NULL, NULL, NULL, NULL, NULL); if (error == 0) { - for (i = 0; i < mirrorcnt; i++) - nmp[i] = VFSTONFS(dvp[i]->v_mount); + nmp = malloc(sizeof(*nmp) * mirrorcnt * stripecnt, + M_TEMP, M_WAITOK); + for (i = 0; i < mirrorcnt * stripecnt; i++) + if (dvp[i] != NULL) + nmp[i] = VFSTONFS(dvp[i]->v_mount); + else + nmp[i] = NULL; } else printf("pNFS: proxy getextattr sockaddr=%d\n", error); } else printf("pNFS: nfsrv_dsgetsockmnt=%d\n", error); if (error == 0) { - failpos = -1; if (origmircnt == -1) origmircnt = mirrorcnt; /* @@ -5261,24 +5383,26 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, * failed mirror#. */ if (ioproc == NFSPROC_READDS) { - error = nfsrv_readdsrpc(fh, off, cnt, cred, p, nmp[0], - mpp, mpp2); - if (nfsds_failerr(error) && mirrorcnt > 1) { + error = nfsrv_readdsrpc(fhp, off, cnt, cred, p, nmp, + mirrorcnt, stripecnt, stripesiz, mpp, mpp2, + &failpos); + if (failpos >= 0 && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * read is required. */ - failpos = 0; error = 0; trycnt++; } } else if (ioproc == NFSPROC_WRITEDS) - error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp, - &nmp[0], mirrorcnt, mpp, cp, &failpos); + error = nfsrv_writedsrpc(fhp, off, cnt, cred, p, vp, + nmp, mirrorcnt, stripecnt, stripesiz, mpp, cp, + &failpos); else if (ioproc == NFSPROC_SETATTR) - error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0], - mirrorcnt, nap, &failpos); + error = nfsrv_setattrdsrpc(fhp, vp, cred, p, nmp, + mirrorcnt, stripecnt, nap, &failpos); +#ifdef notnow else if (ioproc == NFSPROC_SETACL) error = nfsrv_setacldsrpc(fh, cred, p, vp, &nmp[0], mirrorcnt, aclp, &failpos); @@ -5301,16 +5425,16 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, else if (ioproc == NFSPROC_DEALLOCATE) error = nfsrv_deallocatedsrpc(fh, off, *offp, cred, p, vp, &nmp[0], mirrorcnt, &failpos); +#endif else { - error = nfsrv_getattrdsrpc(&fh[mirrorcnt - 1], cred, p, - vp, nmp[mirrorcnt - 1], nap); - if (nfsds_failerr(error) && mirrorcnt > 1) { + error = nfsrv_getattrdsrpc(fhp, vp, cred, p, + nmp, stripecnt, nap, &failpos); + if (failpos >= 0 && mirrorcnt > 1) { /* * Setting failpos will cause the mirror * to be disabled and then a retry of this * getattr is required. */ - failpos = mirrorcnt - 1; error = 0; trycnt++; } @@ -5335,8 +5459,12 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, } NFSUNLOCKMNT(failnmp); } - for (i = 0; i < mirrorcnt; i++) - NFSVOPUNLOCK(dvp[i]); + for (i = 0; i < mirrorcnt * stripecnt; i++) + if (dvp[i] != NULL) + NFSVOPUNLOCK(dvp[i]); + free(dvp, M_TEMP); + free(nmp, M_TEMP); + free(fhp, M_TEMP); NFSD_DEBUG(4, "nfsrv_proxyds: aft RPC=%d trya=%d\n", error, trycnt); /* Try the Read/Getattr again if a mirror was deleted. */ @@ -5361,46 +5489,111 @@ nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred, */ int nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, - int *mirrorcntp, NFSPROC_T *p, struct vnode **dvpp, fhandle_t *fhp, - char *devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp, + int *mirrorcntp, int *stripecntp, uint64_t *stripesizp, NFSPROC_T *p, + struct vnode ***dvppp, fhandle_t **fhpp, + char **devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp, struct nfsmount *curnmp, int *ippos, int *dsdirp) { struct vnode *dvp, *nvp = NULL, **tdvpp; struct mount *mp; struct nfsmount *nmp, *newnmp; + fhandle_t *tfhp; struct sockaddr *sad; struct sockaddr_in *sin; struct nfsdevice *ds, *tds, *fndds; struct pnfsdsfile *pf; + struct opnfsdsfile *opf; uint32_t dsdir; - int error, fhiszero, fnd, gotone, i, mirrorcnt; + int error, fhiszero, fnd, gotmirror, gotone, i, j, k, l, m, mirrorcnt; + char *tdevid; + bool dvplocked; ASSERT_VOP_LOCKED(vp, "nfsrv_dsgetsockmnt vp"); - *mirrorcntp = 1; - tdvpp = dvpp; + dvplocked = false; + if (mirrorcntp != NULL) + *mirrorcntp = 1; + if (stripecntp != NULL) + *stripecntp = 1; + if (stripesizp != NULL) + *stripesizp = 0; if (nvpp != NULL) *nvpp = NULL; - if (dvpp != NULL) - *dvpp = NULL; + if (dvppp != NULL) + *dvppp = NULL; + if (fhpp != NULL) + *fhpp = NULL; + if (devid != NULL) + *devid = NULL; + tdevid = NULL; if (ippos != NULL) *ippos = -1; if (newnmpp != NULL) newnmp = *newnmpp; else newnmp = NULL; + if (fnamep != NULL) + fnamep[0] = '\0'; mp = vp->v_mount; error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", buflenp, buf, p); - mirrorcnt = *buflenp / sizeof(*pf); - if (error == 0 && (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS || - *buflenp != sizeof(*pf) * mirrorcnt)) + if (error == 0 && *buflenp > 0) { + j = *buflenp / sizeof(*pf); + if (*buflenp != sizeof(*pf) * j) { + /* Try opnfsdsfile. */ + j = *buflenp / sizeof(*opf); + if (j >= 1 && *buflenp == sizeof(*opf) * j) { + char *tbuf; + + tbuf = malloc(*buflenp, M_TEMP, M_WAITOK); + memcpy(tbuf, buf, *buflenp); + pf = (struct pnfsdsfile *)buf; + opf = (struct opnfsdsfile *)tbuf; + for (k = 0; k < j; k++, pf++, opf++) { + memcpy(&pf->dsf_fh, opf, sizeof(*opf)); + pf->dsf_stripecnt = 1; + pf->dsf_stripesiz = 0; + } + free(tbuf, M_TEMP); + } else + error = ENOATTR; + } else if (j < 1) + error = ENOATTR; + } else if (error == 0) error = ENOATTR; + if (error != 0) + return (error); pf = (struct pnfsdsfile *)buf; + if (pf->dsf_stripesiz > 0) { + mirrorcnt = j / pf->dsf_stripecnt; + k = pf->dsf_stripecnt; + } else { + mirrorcnt = j; + k = 1; + } + if (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS || + k < 1 || k > NFSDEV_MAXSTRIPE || j != mirrorcnt * k) + return (ENOATTR); + if (stripecntp != NULL) + *stripecntp = k; + if (stripesizp != NULL) + *stripesizp = pf->dsf_stripesiz; + + /* Allocate a large enough array for dvppp, if required. */ + if (dvppp != NULL) + tdvpp = *dvppp = malloc(sizeof(*tdvpp) * mirrorcnt * k, M_TEMP, + M_WAITOK | M_ZERO); + if (fhpp != NULL) + tfhp = *fhpp = malloc(sizeof(*tfhp) * mirrorcnt * k, M_TEMP, + M_WAITOK); + if (devid != NULL) + tdevid = *devid = malloc(NFSX_V4DEVICEID * mirrorcnt * k, + M_TEMP, M_WAITOK); + /* If curnmp != NULL, check for a match in the mirror list. */ - if (curnmp != NULL && error == 0) { + if (curnmp != NULL) { fnd = 0; - for (i = 0; i < mirrorcnt; i++, pf++) { + for (i = 0; i < j; i += k, pf += k) { sad = (struct sockaddr *)&pf->dsf_sin; if (nfsaddr2_match(sad, curnmp->nm_nam)) { if (ippos != NULL) @@ -5413,11 +5606,12 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, error = ENXIO; } - gotone = 0; + gotmirror = gotone = 0; + l = 0; /* Index for tdvpp and tfhp. */ pf = (struct pnfsdsfile *)buf; - NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d err=%d\n", mirrorcnt, - error); - for (i = 0; i < mirrorcnt && error == 0; i++, pf++) { + NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d stripecnt=%d err=%d\n", + mirrorcnt, k, error); + for (i = 0; i < j && error == 0; i++, pf++) { fhiszero = 0; sad = (struct sockaddr *)&pf->dsf_sin; sin = &pf->dsf_sin; @@ -5486,6 +5680,7 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, dvp = fndds->nfsdev_dsdir[dsdir]; if (lktype != 0 || fhiszero != 0 || (nvpp != NULL && *nvpp == NULL)) { + dvplocked = true; if (fhiszero != 0) error = vn_lock(dvp, LK_EXCLUSIVE); @@ -5525,16 +5720,21 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, if (error == 0) { gotone++; NFSD_DEBUG(4, "gotone=%d\n", gotone); - if (devid != NULL) { + if (tdevid != NULL) { NFSBCOPY(fndds->nfsdev_deviceid, - devid, NFSX_V4DEVICEID); - devid += NFSX_V4DEVICEID; + tdevid, NFSX_V4DEVICEID); + tdevid += NFSX_V4DEVICEID; } - if (dvpp != NULL) - *tdvpp++ = dvp; - if (fhp != NULL) - NFSBCOPY(&pf->dsf_fh, fhp++, + if (dvppp != NULL) + tdvpp[l] = dvp; + if (fhpp != NULL) + NFSBCOPY(&pf->dsf_fh, &tfhp[l], NFSX_MYFH); + if (dvppp != NULL || fhpp != NULL) { + l++; + if (l % k == 0) + gotmirror++; + } if (fnamep != NULL && gotone == 1) strlcpy(fnamep, pf->dsf_filename, @@ -5542,27 +5742,63 @@ nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp, } else NFSD_DEBUG(4, "nfsrv_dsgetsockmnt " "err=%d\n", error); + } else if (fnamep == NULL) { + /* + * fnamep is NULL for ReadDS, WriteDS, + * SetattrDS and GetattrDS. For these cases, + * do not use a partial stripe set as a + * mirror. + */ + for (m = l / k * k; m < l; m++) { + if (dvplocked) + NFSVOPUNLOCK(tdvpp[m]); + tdvpp[m] = NULL; + } + l = l / k * k; } } } - if (error == 0 && gotone == 0) + if (fnamep != NULL) { + /* + * If fnamep != NULL, a list of all DSs is wanted. + * For this, cheat and return the total cound of DSs in + * mirrorcnt. + */ + gotmirror = gotone; + NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: fname=%s, gotmirror=%d\n", + fnamep, gotmirror); + } + if (error == 0 && gotmirror == 0) error = ENOENT; NFSD_DEBUG(4, "eo nfsrv_dsgetsockmnt: gotone=%d err=%d\n", gotone, error); - if (error == 0) - *mirrorcntp = gotone; - else { - if (gotone > 0 && dvpp != NULL) { - /* - * If the error didn't occur on the first one and - * dvpp != NULL, the one(s) prior to the failure will - * have locked dvp's that need to be unlocked. - */ - for (i = 0; i < gotone; i++) { - NFSVOPUNLOCK(*dvpp); - *dvpp++ = NULL; + if (error == 0) { + if (mirrorcntp != NULL) + *mirrorcntp = gotmirror; + } else { + if (dvppp != NULL) { + if (l > 0 && dvplocked) { + /* + * If the error didn't occur on the first one + * and dvppp != NULL, the one(s) prior to the + * failure will have locked dvp's that need to + * be unlocked. + */ + tdvpp = *dvppp; + for (i = 0; i < l; i++) + NFSVOPUNLOCK(*tdvpp++); } + free(*dvppp, M_TEMP); + *dvppp = NULL; + } + if (fhpp != NULL) { + free(*fhpp, M_TEMP); + *fhpp = NULL; + } + if (devid != NULL) { + free(*devid, M_TEMP); + *devid = NULL; } /* * If it found the vnode to be copied from before a failure, @@ -5598,8 +5834,27 @@ nfsrv_setextattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p) return (error); } +/* + * Do a read RPC on a DS data file, using this structure for the arguments, + * so that this function can be executed by a separate kernel process. + */ +struct nfsrvreaddsdorpc { + int done; + int inprog; + struct task tsk; + fhandle_t fh; + off_t off; + int len; + struct nfsmount *nmp; + struct ucred *cred; + NFSPROC_T *p; + struct mbuf *m; + struct mbuf *mend; + int err; +}; + static int -nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, +nfsrv_readdsdorpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, NFSPROC_T *p, struct nfsmount *nmp, struct mbuf **mpp, struct mbuf **mpendp) { uint32_t *tl; @@ -5608,7 +5863,7 @@ nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, struct mbuf *m, *m2; int error = 0, retlen, tlen, trimlen; - NFSD_DEBUG(4, "in nfsrv_readdsrpc\n"); + NFSD_DEBUG(4, "in nfsrv_readdsdorpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); *mpp = NULL; /* @@ -5648,7 +5903,7 @@ nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, m = m->m_next; } if (m == NULL) { - printf("nfsrv_readdsrpc: busted mbuf list\n"); + printf("nfsrv_readdsdorpc: busted mbuf list\n"); error = ENOENT; goto nfsmout; } @@ -5682,7 +5937,7 @@ nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, m = m->m_next; } while (m != NULL); if (tlen > 0) { - printf("nfsrv_readdsrpc: busted mbuf list\n"); + printf("nfsrv_readdsdorpc: busted mbuf list\n"); error = ENOENT; goto nfsmout; } @@ -5696,7 +5951,135 @@ nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, /* If nd->nd_mrep is already NULL, this is a no-op. */ m_freem(nd->nd_mrep); free(nd, M_TEMP); - NFSD_DEBUG(4, "nfsrv_readdsrpc error=%d\n", error); + NFSD_DEBUG(4, "nfsrv_readdsdorpc error=%d\n", error); + return (error); +} + +/* + * Start up the thread that will execute nfsrv_readdsdorpc(). + */ +static void +start_readdsdorpc(void *arg, int pending) +{ + struct nfsrvreaddsdorpc *drpc; + + drpc = (struct nfsrvreaddsdorpc *)arg; + drpc->err = nfsrv_readdsdorpc(&drpc->fh, drpc->off, drpc->len, + drpc->cred, drpc->p, drpc->nmp, &drpc->m, &drpc->mend); + drpc->done = 1; + NFSD_DEBUG(4, "start_readdsdorpc: err=%d\n", drpc->err); +} + +static int +nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, + NFSPROC_T *p, struct nfsmount **nmp, int mirrorcnt, int stripecnt, + uint64_t stripesiz, struct mbuf **mpp, struct mbuf **mendp, int *failposp) +{ + struct nfsrvreaddsdorpc *drpc, *tdrpc; + struct mbuf *m, *m2, *mend; + fhandle_t *tfhp; + struct nfsmount **tnmp; + uint64_t scnt; + int error, i, j, k, l, n, ret, timo; + + NFSD_DEBUG(4, "in nfsrv_readdsrpc\n"); + drpc = NULL; + m2 = *mpp = NULL; + *failposp = -1; + if (stripecnt > 1) { + /* Set j to the upper bound of the # of stripes to read. */ + j = (len + len - 1) / stripesiz + 1; + tdrpc = drpc = malloc(sizeof(*drpc) * j, M_TEMP, M_WAITOK); + } + + /* For each stripe except last one, read the stripe. */ + for (j = 0; ; j++, tdrpc++) { + if (stripecnt > 1) { + k = (off / stripesiz) % (uint64_t)stripecnt; + scnt = stripesiz - (off % stripesiz); + l = ((uint64_t)len < scnt) ? len : (int)scnt; + } else { + k = 0; + l = len; + } + if (j == 0) + n = k; /* Save first stripe# for later. */ + tfhp = fhp + k; + tnmp = nmp + k; + NFSD_DEBUG(4, "nfsrv_readdsrpc: mcopy k=%d l=%d\n", k, l); + + /* Break out of the loop for the last stripe. */ + if (l == len) + break; + + /* + * Do the read RPC for every DS, using a separate kernel + * process for every DS, except the last one. + */ + error = 0; + tdrpc->done = 0; + NFSBCOPY(tfhp, &tdrpc->fh, sizeof(*tfhp)); + tdrpc->off = off; + tdrpc->len = l; + tdrpc->nmp = *tnmp; + tdrpc->cred = cred; + tdrpc->p = p; + tdrpc->inprog = 0; + tdrpc->err = 0; + ret = EIO; + if (nfs_pnfsiothreads != 0) { + ret = nfs_pnfsio(start_readdsdorpc, tdrpc); + NFSD_DEBUG(4, "nfsrv_readdsrpc: " + "nfs_pnfsio=%d\n", ret); + } + if (ret != 0) { + ret = nfsrv_readdsdorpc(tfhp, off, l, cred, p, + *tnmp, &tdrpc->m, &tdrpc->mend); + if (nfsds_failerr(ret) && *failposp == -1) + *failposp = k; + else if (error == 0 && ret != 0) + tdrpc->err = ret; + tdrpc->inprog = 0; + tdrpc->done = 1; + } + off += l; + len -= l; + } + ret = nfsrv_readdsdorpc(tfhp, off, l, cred, p, *tnmp, &m, &mend); + if (nfsds_failerr(ret) && *failposp == -1) + *failposp = k; + if (error == 0 && ret != 0) + error = ret; + NFSD_DEBUG(4, "nfsrv_readdsrpc: aft stripes=%d\n", error); + tdrpc = drpc; + timo = hz / 50; /* Wait for 20msec. */ + if (timo < 1) + timo = 1; + k = n; /* Keep track of stripe#. */ + for (i = 0; i < j - 1; i++, tdrpc++) { + /* Wait for RPCs on separate threads to complete. */ + while (tdrpc->inprog != 0 && tdrpc->done == 0) + tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo); + if (nfsds_failerr(tdrpc->err) && *failposp == -1) + *failposp = k; + else if (error == 0 && tdrpc->err != 0) + error = tdrpc->err; + else { + if (m2 != NULL) + m2->m_next = tdrpc->m; + else + *mpp = tdrpc->m; + m2 = tdrpc->mend; + } + k = (k + 1) % stripecnt; + } + if (m2 != NULL) + m2->m_next = m; + else + *mpp = m; + *mendp = mend; + + free(drpc, M_TEMP); return (error); } @@ -5847,62 +6230,100 @@ start_writedsdorpc(void *arg, int pending) static int nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, - NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, - struct mbuf **mpp, char *cp, int *failposp) + NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmp, int mirrorcnt, + int stripecnt, uint64_t stripesiz, struct mbuf **mpp, char *cp, + int *failposp) { - struct nfsrvwritedsdorpc *drpc, *tdrpc = NULL; + struct nfsrvwritedsdorpc *drpc, *tdrpc; struct nfsvattr na; struct mbuf *m; - int error, i, offs, ret, timo; + fhandle_t *tfhp; + struct nfsmount **tnmp; + uint64_t scnt; + int error, i, j, k, l, n, o, offs, ret, timo; NFSD_DEBUG(4, "in nfsrv_writedsrpc\n"); KASSERT(*mpp != NULL, ("nfsrv_writedsrpc: NULL mbuf chain")); drpc = NULL; - if (mirrorcnt > 1) - tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, - M_WAITOK); + *failposp = -1; + if (mirrorcnt > 1 || stripecnt > 1) { + /* Set j to the upper bound of the # of DSs to read. */ + if (stripecnt > 1) + j = (len + len - 1) / stripesiz + 1; + else + j = 1; + j *= mirrorcnt; + tdrpc = drpc = malloc(sizeof(*drpc) * j, M_TEMP, M_WAITOK); + } /* Calculate offset in mbuf chain that data starts. */ offs = cp - mtod(*mpp, char *); - NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d len=%d\n", offs, len); + /* For each stripe, write to all the mirrors. */ + for (j = 0; ; j++) { + if (stripecnt > 1) { + k = (off / stripesiz) % (uint64_t)stripecnt; + scnt = stripesiz - (off % stripesiz); + l = ((uint64_t)len < scnt) ? len : (int)scnt; + } else { + k = 0; + l = len; + } + if (j == 0) + o = k; /* Save first stripe# for later. */ + tfhp = fhp + k; + tnmp = nmp + k; + NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d k=%d " + "l=%d\n", offs, k, l); - /* - * Do the write RPC for every DS, using a separate kernel process - * for every DS except the last one. - */ - error = 0; - for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { - tdrpc->done = 0; - NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); - tdrpc->off = off; - tdrpc->len = len; - tdrpc->nmp = *nmpp; - tdrpc->cred = cred; - tdrpc->p = p; - tdrpc->inprog = 0; - tdrpc->err = 0; - tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK); - ret = EIO; - if (nfs_pnfsiothreads != 0) { - ret = nfs_pnfsio(start_writedsdorpc, tdrpc); - NFSD_DEBUG(4, "nfsrv_writedsrpc: nfs_pnfsio=%d\n", - ret); + /* + * Do the write RPC for every DS, using a separate kernel + * process for every DS, except the last one. + */ + error = 0; + n = mirrorcnt; + if (l == len) + n--; + for (i = 0; i < n; i++, tdrpc++) { + tdrpc->done = 0; + NFSBCOPY(tfhp, &tdrpc->fh, sizeof(*tfhp)); + tdrpc->off = off; + tdrpc->len = l; + tdrpc->nmp = *tnmp; + tdrpc->cred = cred; + tdrpc->p = p; + tdrpc->inprog = 0; + tdrpc->err = 0; + tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(l), + M_WAITOK); + ret = EIO; + if (nfs_pnfsiothreads != 0) { + ret = nfs_pnfsio(start_writedsdorpc, tdrpc); + NFSD_DEBUG(4, "nfsrv_writedsrpc: " + "nfs_pnfsio=%d\n", ret); + } + if (ret != 0) { + ret = nfsrv_writedsdorpc(*tnmp, tfhp, off, l, + NULL, tdrpc->m, cred, p); + if (nfsds_failerr(ret) && *failposp == -1) + *failposp = k; + else if (error == 0 && ret != 0) + tdrpc->err = ret; + tdrpc->inprog = 0; + tdrpc->done = 1; + } + tnmp += stripecnt; + tfhp += stripecnt; } - if (ret != 0) { - ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, NULL, - tdrpc->m, cred, p); - if (nfsds_failerr(ret) && *failposp == -1) - *failposp = i; - else if (error == 0 && ret != 0) - error = ret; - } - nmpp++; - fhp++; + if (l == len) + break; + offs += l; + off += l; + len -= l; } - m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK); - ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, &na, m, cred, p); + m = m_copym(*mpp, offs, NFSM_RNDUP(l), M_WAITOK); + ret = nfsrv_writedsdorpc(*tnmp, tfhp, off, l, &na, m, cred, p); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) - *failposp = mirrorcnt - 1; + *failposp = k; else if (error == 0 && ret != 0) error = ret; if (error == 0) @@ -5912,19 +6333,22 @@ nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred, timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; - for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { + k = o; + for (i = 0; i < j * mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) - *failposp = i; + *failposp = k; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; + k = (k + 1) % stripecnt; } free(drpc, M_TEMP); return (error); } +#ifdef notnow /* * Do a allocate RPC on a DS data file, using this structure for the arguments, * so that this function can be executed by a separate kernel process. @@ -6267,10 +6691,11 @@ nfsrv_deallocatedsrpc(fhandle_t *fhp, off_t off, off_t len, struct ucred *cred, free(drpc, M_TEMP); return (error); } +#endif static int -nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, - struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap, +nfsrv_setattrdsdorpc(fhandle_t *fhp, struct vnode *vp, struct ucred *cred, + NFSPROC_T *p, struct nfsmount *nmp, struct nfsvattr *nap, struct nfsvattr *dsnap) { uint32_t *tl; @@ -6291,7 +6716,7 @@ nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, st.other[1] = 0x55555555; st.other[2] = 0x55555555; st.seqid = 0xffffffff; - nfscl_reqstart(nd, NFSPROC_SETATTR, nmp, (u_int8_t *)fhp, sizeof(*fhp), + nfscl_reqstart(nd, NFSPROC_SETATTR, nmp, (uint8_t *)fhp, sizeof(*fhp), NULL, NULL, 0, 0, cred); nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID); nfscl_fillsattr(nd, &nap->na_vattr, vp, NFSSATTR_FULL, 0); @@ -6377,83 +6802,98 @@ start_setattrdsdorpc(void *arg, int pending) struct nfsrvsetattrdsdorpc *drpc; drpc = (struct nfsrvsetattrdsdorpc *)arg; - drpc->err = nfsrv_setattrdsdorpc(&drpc->fh, drpc->cred, drpc->p, - drpc->vp, drpc->nmp, &drpc->na, &drpc->dsna); + drpc->err = nfsrv_setattrdsdorpc(&drpc->fh, drpc->vp, drpc->cred, + drpc->p, drpc->nmp, &drpc->na, &drpc->dsna); drpc->done = 1; } static int -nfsrv_setattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, - struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, +nfsrv_setattrdsrpc(fhandle_t *fhp, struct vnode *vp, struct ucred *cred, + NFSPROC_T *p, struct nfsmount **nmp, int mirrorcnt, int stripecnt, struct nfsvattr *nap, int *failposp) { - struct nfsrvsetattrdsdorpc *drpc, *tdrpc = NULL; + struct nfsrvsetattrdsdorpc *drpc, *tdrpc; + fhandle_t *tfhp; + struct nfsmount **tnmp; struct nfsvattr na; - int error, i, ret, timo; + int error, i, j, ret, timo; NFSD_DEBUG(4, "in nfsrv_setattrdsrpc\n"); drpc = NULL; - if (mirrorcnt > 1) - tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP, - M_WAITOK); - - /* - * Do the setattr RPC for every DS, using a separate kernel process - * for every DS except the last one. - */ + *failposp = -1; error = 0; - for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { + if (mirrorcnt > 1 || stripecnt > 1) + tdrpc = drpc = malloc(sizeof(*drpc) * stripecnt * mirrorcnt, + M_TEMP, M_WAITOK); + + /* For each stripe, write to all the mirrors. */ + tfhp = fhp; + tnmp = nmp; + for (i = 0; i < stripecnt * mirrorcnt - 1; i++, tdrpc++, tfhp++, + tnmp++) { + j = i / stripecnt; + NFSD_DEBUG(4, "nfsrv_setattrdsrpc: stripe=%d mirror=%d\n", + i, j); tdrpc->done = 0; - tdrpc->inprog = 0; - NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp)); - tdrpc->nmp = *nmpp; + NFSBCOPY(nap, &tdrpc->na, sizeof(*nap)); + NFSBCOPY(tfhp, &tdrpc->fh, sizeof(*tfhp)); tdrpc->vp = vp; + tdrpc->nmp = *tnmp; tdrpc->cred = cred; tdrpc->p = p; - tdrpc->na = *nap; + tdrpc->inprog = 0; tdrpc->err = 0; ret = EIO; if (nfs_pnfsiothreads != 0) { ret = nfs_pnfsio(start_setattrdsdorpc, tdrpc); - NFSD_DEBUG(4, "nfsrv_setattrdsrpc: nfs_pnfsio=%d\n", - ret); + NFSD_DEBUG(4, "nfsrv_setattrdsrpc: " + "nfs_pnfsio=%d\n", ret); } if (ret != 0) { - ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, - &na); + ret = nfsrv_setattrdsdorpc(tfhp, vp, cred, p, + *tnmp, &tdrpc->na, &tdrpc->dsna); if (nfsds_failerr(ret) && *failposp == -1) *failposp = i; else if (error == 0 && ret != 0) - error = ret; + tdrpc->err = ret; + tdrpc->inprog = 0; + tdrpc->done = 1; } - nmpp++; - fhp++; } - ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, &na); + ret = nfsrv_setattrdsdorpc(tfhp, vp, cred, p, *tnmp, nap, &na); if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1) - *failposp = mirrorcnt - 1; + *failposp = i; else if (error == 0 && ret != 0) error = ret; - if (error == 0) - error = nfsrv_setextattr(vp, &na, p); - NFSD_DEBUG(4, "nfsrv_setattrdsrpc: aft setextat=%d\n", error); tdrpc = drpc; timo = hz / 50; /* Wait for 20msec. */ if (timo < 1) timo = 1; - for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) { + for (i = 0; i < stripecnt * mirrorcnt - 1; i++, tdrpc++) { /* Wait for RPCs on separate threads to complete. */ while (tdrpc->inprog != 0 && tdrpc->done == 0) - tsleep(&tdrpc->tsk, PVFS, "srvsads", timo); + tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo); if (nfsds_failerr(tdrpc->err) && *failposp == -1) *failposp = i; else if (error == 0 && tdrpc->err != 0) error = tdrpc->err; } + + /* Find the reply attribute with the largest size and set that one. */ + if (error == 0 && (mirrorcnt > 1 || stripecnt > 1)) { + tdrpc = drpc; + for (i = 0; i < stripecnt * mirrorcnt - 1; i++, tdrpc++) { + if (tdrpc->dsna.na_size > na.na_size) + NFSBCOPY(&tdrpc->dsna, &na, sizeof(*nap)); + } + } + if (error == 0) + error = nfsrv_setextattr(vp, &na, p); free(drpc, M_TEMP); return (error); } +#ifdef notnow /* * Do a Setattr of an NFSv4 ACL on the DS file. */ @@ -6599,19 +7039,143 @@ nfsrv_setacldsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, free(drpc, M_TEMP); return (error); } +#endif + +struct nfsrvgetattrdsdorpc { + int done; + int inprog; + struct task tsk; + fhandle_t fh; + struct vnode *vp; + struct nfsvattr na; + struct nfsmount *nmp; + struct ucred *cred; + NFSPROC_T *p; + int err; +}; + +/* + * Start up the thread that will execute nfsrv_getattrdsdorpc(). + */ +static void +start_getattrdsdorpc(void *arg, int pending) +{ + struct nfsrvgetattrdsdorpc *drpc; + + drpc = (struct nfsrvgetattrdsdorpc *)arg; + drpc->err = nfsrv_getattrdsdorpc(&drpc->fh, drpc->vp, drpc->cred, + drpc->p, drpc->nmp, &drpc->na); + drpc->done = 1; + NFSD_DEBUG(4, "start_getattrdsdorpc: err=%d\n", drpc->err); +} + +/* + * For a striped configuration, a getattr RPC must be done on all stripes, + * since there is no way of knowing which DS currently stores the last + * bytes of the file. + */ +static int +nfsrv_getattrdsrpc(fhandle_t *fhp, struct vnode *vp, struct ucred *cred, + NFSPROC_T *p, struct nfsmount **nmp, int stripecnt, struct nfsvattr *nap, + int *failposp) +{ + struct nfsrvgetattrdsdorpc *drpc, *tdrpc; + fhandle_t *tfhp; + struct nfsmount **tnmp; + int error, i, ret, timo; + + NFSD_DEBUG(4, "in nfsrv_getattrdsrpc\n"); + drpc = NULL; + *failposp = -1; + if (stripecnt > 1) + tdrpc = drpc = malloc(sizeof(*drpc) * stripecnt, M_TEMP, + M_WAITOK); + + /* For each stripe except last one, do a Getattr.. */ + tfhp = fhp; + tnmp = nmp; + for (i = 0; i < stripecnt - 1; i++, tdrpc++, tfhp++, tnmp++) { + NFSD_DEBUG(4, "nfsrv_getattrdsrpc: stripe=%d\n", i); + error = 0; + tdrpc->done = 0; + NFSBCOPY(tfhp, &tdrpc->fh, sizeof(*tfhp)); + tdrpc->nmp = *tnmp; + tdrpc->vp = vp; + tdrpc->cred = cred; + tdrpc->p = p; + tdrpc->inprog = 0; + tdrpc->err = 0; + ret = EIO; + if (nfs_pnfsiothreads != 0) { + ret = nfs_pnfsio(start_getattrdsdorpc, tdrpc); + NFSD_DEBUG(4, "nfsrv_getattrdsrpc: " + "nfs_pnfsio=%d\n", ret); + } + if (ret != 0) { + ret = nfsrv_getattrdsdorpc(tfhp, vp, cred, p, *tnmp, + &tdrpc->na); + if (nfsds_failerr(ret) && *failposp == -1) + *failposp = i; + else if (error == 0 && ret != 0) + tdrpc->err = ret; + tdrpc->inprog = 0; + tdrpc->done = 1; + } + } + ret = nfsrv_getattrdsdorpc(tfhp, vp, cred, p, *tnmp, nap); + if (nfsds_failerr(ret) && *failposp == -1) + *failposp = i; + if (error == 0 && ret != 0) + error = ret; + NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft stripes=%d\n", error); + timo = hz / 50; /* Wait for 20msec. */ + if (timo < 1) + timo = 1; + tdrpc = drpc; + for (i = 0; i < stripecnt - 1; i++, tdrpc++) { + /* Wait for RPCs on separate threads to complete. */ + while (tdrpc->inprog != 0 && tdrpc->done == 0) + tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo); + if (nfsds_failerr(tdrpc->err) && *failposp == -1) + *failposp = i; + else if (error == 0 && tdrpc->err != 0) + error = tdrpc->err; + } + + /* Find the attribute with the largest size and return that one. */ + if (stripecnt > 1) { + tdrpc = drpc; + for (i = 0; i < stripecnt - 1; i++, tdrpc++) { + if (tdrpc->na.na_size > nap->na_size) + NFSBCOPY(&tdrpc->na, nap, sizeof(*nap)); + } + } + /* + * We can only save the updated values in the extended + * attribute if the vp is exclusively locked. + */ + if (error == 0 && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { + error = nfsrv_setextattr(vp, nap, p); + NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft setextat=%d\n", + error); + } + + free(drpc, M_TEMP); + return (error); +} /* * Getattr call to the DS for the attributes that change due to writing. */ static int -nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, - struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap) +nfsrv_getattrdsdorpc(fhandle_t *fhp, struct vnode *vp, struct ucred *cred, + NFSPROC_T *p, struct nfsmount *nmp, struct nfsvattr *nap) { struct nfsrv_descript *nd; int error; nfsattrbit_t attrbits; - NFSD_DEBUG(4, "in nfsrv_getattrdsrpc\n"); + NFSD_DEBUG(4, "in nfsrv_getattrdsdorpc\n"); nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO); nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, (u_int8_t *)fhp, sizeof(fhandle_t), NULL, NULL, 0, 0, cred); @@ -6628,35 +7192,21 @@ nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p, free(nd, M_TEMP); return (error); } - NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft getattrrpc=%d\n", + NFSD_DEBUG(4, "nfsrv_getattrdsdorpc: aft getattrrpc=%d\n", nd->nd_repstat); - if (nd->nd_repstat == 0) { + if (nd->nd_repstat == 0) error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); - /* - * We can only save the updated values in the extended - * attribute if the vp is exclusively locked. - * This should happen when any of the following operations - * occur on the vnode: - * Close, Delegreturn, LayoutCommit, LayoutReturn - * As such, the updated extended attribute should get saved - * before nfsrv_checkdsattr() returns 0 and allows the cached - * attributes to be returned without calling this function. - */ - if (error == 0 && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { - error = nfsrv_setextattr(vp, nap, p); - NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft setextat=%d\n", - error); - } - } else + else error = nd->nd_repstat; m_freem(nd->nd_mrep); free(nd, M_TEMP); - NFSD_DEBUG(4, "nfsrv_getattrdsrpc error=%d\n", error); + NFSD_DEBUG(4, "nfsrv_getattrdsdorpc error=%d\n", error); return (error); } +#ifdef notnow /* * Seek call to a DS. */ @@ -6709,21 +7259,24 @@ nfsrv_seekdsrpc(fhandle_t *fhp, off_t *offp, int content, bool *eofp, NFSD_DEBUG(4, "nfsrv_seekdsrpc error=%d\n", error); return (error); } +#endif /* * Get the device id and file handle for a DS file. */ int nfsrv_dsgetdevandfh(struct vnode *vp, NFSPROC_T *p, int *mirrorcntp, - fhandle_t *fhp, char *devid) + uint64_t *stripesizp, int *stripecntp, fhandle_t **fhpp, char **devid) { int buflen, error; char *buf; - buflen = 1024; + buflen = sizeof(struct pnfsdsfile) * NFSDEV_MAXMIRRORS * + NFSDEV_MAXSTRIPE; buf = malloc(buflen, M_TEMP, M_WAITOK); - error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, NULL, - fhp, devid, NULL, NULL, NULL, NULL, NULL, NULL); + error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, stripecntp, + stripesizp, p, NULL, fhpp, devid, NULL, NULL, NULL, NULL, NULL, + NULL); free(buf, M_TEMP); return (error); } @@ -6766,7 +7319,7 @@ nfsrv_pnfslookupds(struct vnode *vp, struct vnode *dvp, struct pnfsdsfile *pf, * Set the file handle to the correct one. */ static void -nfsrv_pnfssetfh(struct vnode *vp, struct pnfsdsfile *pf, char *devid, +nfsrv_pnfssetfh(struct vnode *vp, struct pnfsdsfile *pf, char **devid, char *fnamep, struct vnode *nvp, NFSPROC_T *p) { struct nfsnode *np; @@ -6958,6 +7511,7 @@ nfsvno_seek(struct nfsrv_descript *nd, struct vnode *vp, u_long cmd, int error, ret; ASSERT_VOP_LOCKED(vp, "nfsvno_seek vp"); +#ifdef notnow /* * Attempt to seek on a DS file. A return of ENOENT implies * there is no DS file to seek on. @@ -6968,6 +7522,7 @@ nfsvno_seek(struct nfsrv_descript *nd, struct vnode *vp, u_long cmd, vput(vp); return (error); } +#endif /* * Do the VOP_IOCTL() call. For the case where *offp == file_size, @@ -7003,6 +7558,7 @@ nfsvno_allocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, off_t olen; ASSERT_VOP_ELOCKED(vp, "nfsvno_allocate vp"); +#ifdef notnow /* * Attempt to allocate on a DS file. A return of ENOENT implies * there is no DS file to allocate on. @@ -7011,6 +7567,7 @@ nfsvno_allocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, NULL, NULL, NULL, NULL, &len, 0, NULL); if (error != ENOENT) return (error); +#endif /* * Do the actual VOP_ALLOCATE(), looping so long as @@ -7039,6 +7596,7 @@ nfsvno_deallocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, off_t olen; ASSERT_VOP_ELOCKED(vp, "nfsvno_deallocate vp"); +#ifdef notnow /* * Attempt to deallocate on a DS file. A return of ENOENT implies * there is no DS file to deallocate on. @@ -7047,6 +7605,7 @@ nfsvno_deallocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred, NULL, NULL, NULL, NULL, &len, 0, NULL); if (error != ENOENT) return (error); +#endif /* * Do the actual VOP_DEALLOCATE(), looping so long as diff --git a/sys/fs/nfsserver/nfs_nfsdserv.c b/sys/fs/nfsserver/nfs_nfsdserv.c index ad0f495bbd6..9e5235f95ed 100644 --- a/sys/fs/nfsserver/nfs_nfsdserv.c +++ b/sys/fs/nfsserver/nfs_nfsdserv.c @@ -5133,11 +5133,15 @@ nfsrvd_layoutget(struct nfsrv_descript *nd, __unused int isdgram, } layp = NULL; +#ifdef notnow if (layouttype == NFSLAYOUT_NFSV4_1_FILES && nfsrv_maxpnfsmirror == 1) layp = malloc(NFSX_V4FILELAYOUT, M_TEMP, M_WAITOK); else if (layouttype == NFSLAYOUT_FLEXFILE) - layp = malloc(NFSX_V4FLEXLAYOUT(nfsrv_maxpnfsmirror), M_TEMP, - M_WAITOK); +#else + if (layouttype == NFSLAYOUT_FLEXFILE) +#endif + layp = malloc(NFSX_V4FLEXLAYOUT(NFSDEV_MAXMIRRORS, + NFSDEV_MAXSTRIPE), M_TEMP, M_WAITOK); else nd->nd_repstat = NFSERR_UNKNLAYOUTTYPE; if (layp != NULL) @@ -5692,7 +5696,7 @@ nfsrvd_allocate(struct nfsrv_descript *nd, __unused int isdgram, nfsquad_t clientid; nfsattrbit_t attrbits; - if (!nfsrv_doallocate) { + if (!nfsrv_doallocate || nfsrv_devidcnt > 0) { /* * If any exported file system, such as a ZFS one, cannot * do VOP_ALLOCATE(), this operation cannot be supported @@ -5824,9 +5828,9 @@ nfsrvd_deallocate(struct nfsrv_descript *nd, __unused int isdgram, } stp->ls_stateid.other[2] = *tl++; /* - * Don't allow this to be done for a DS. + * Don't allow this to be done for a DS or MDS. */ - if ((nd->nd_flag & ND_DSSERVER) != 0) + if ((nd->nd_flag & ND_DSSERVER) != 0 || nfsrv_devidcnt > 0) nd->nd_repstat = NFSERR_NOTSUPP; /* However, allow the proxy stateid. */ if (stp->ls_stateid.seqid == 0xffffffff && @@ -6361,6 +6365,8 @@ nfsrvd_seek(struct nfsrv_descript *nd, __unused int isdgram, nd->nd_repstat = NFSERR_WRONGTYPE; if (nd->nd_repstat == 0 && off < 0) nd->nd_repstat = NFSERR_NXIO; + if (nd->nd_repstat == 0 && nfsrv_devidcnt > 0) + nd->nd_repstat = NFSERR_NOTSUPP; if (nd->nd_repstat == 0) { /* Check permissions for the input file. */ NFSZERO_ATTRBIT(&attrbits); diff --git a/sys/fs/nfsserver/nfs_nfsdstate.c b/sys/fs/nfsserver/nfs_nfsdstate.c index 22e702b001c..17b31867fdc 100644 --- a/sys/fs/nfsserver/nfs_nfsdstate.c +++ b/sys/fs/nfsserver/nfs_nfsdstate.c @@ -236,7 +236,8 @@ static int nfsrv_fndclid(nfsquad_t *clidvec, nfsquad_t clid, int clidcnt); static struct nfslayout *nfsrv_filelayout(struct nfsrv_descript *nd, int iomode, fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs); static struct nfslayout *nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, - int mirrorcnt, fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs); + int mirrorcnt, uint64_t stripesiz, int stripecnt, fhandle_t *fhp, + fhandle_t *dsfhp, char *devid, fsid_t fs); static int nfsrv_dontlayout(fhandle_t *fhp); static int nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf, vnode_t dvp, struct nfsdevice *ds, struct ucred *cred, NFSPROC_T *p, @@ -6626,7 +6627,8 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp, struct nfslayout *lyp; char *devid; fhandle_t fh, *dsfhp; - int error, mirrorcnt; + int error, mirrorcnt, stripecnt; + uint64_t stripesiz; if (nfsrv_devidcnt == 0) return (NFSERR_UNKNLAYOUTTYPE); @@ -6723,9 +6725,8 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp, NFSUNLOCKLAYOUT(lhyp); /* Find the device id and file handle. */ - dsfhp = malloc(sizeof(fhandle_t) * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK); - devid = malloc(NFSX_V4DEVICEID * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK); - error = nfsrv_dsgetdevandfh(vp, p, &mirrorcnt, dsfhp, devid); + error = nfsrv_dsgetdevandfh(vp, p, &mirrorcnt, &stripesiz, &stripecnt, + &dsfhp, &devid); NFSD_DEBUG(4, "layoutget devandfh=%d\n", error); if (error == 0) { if (layouttype == NFSLAYOUT_NFSV4_1_FILES) { @@ -6735,11 +6736,11 @@ nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp, lyp = nfsrv_filelayout(nd, *iomode, &fh, dsfhp, devid, vp->v_mount->mnt_stat.f_fsid); } else { - if (NFSX_V4FLEXLAYOUT(mirrorcnt) > maxcnt) + if (NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt) > maxcnt) error = NFSERR_TOOSMALL; else lyp = nfsrv_flexlayout(nd, *iomode, mirrorcnt, - &fh, dsfhp, devid, + stripesiz, stripecnt, &fh, dsfhp, devid, vp->v_mount->mnt_stat.f_fsid); } } @@ -6814,15 +6815,16 @@ nfsrv_filelayout(struct nfsrv_descript *nd, int iomode, fhandle_t *fhp, */ static struct nfslayout * nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, int mirrorcnt, - fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs) + uint64_t stripesiz, int stripecnt, fhandle_t *fhp, fhandle_t *dsfhp, + char *devid, fsid_t fs) { uint32_t *tl; struct nfslayout *lyp; - uint64_t lenval; - int i; + int i, j; - lyp = malloc(sizeof(struct nfslayout) + NFSX_V4FLEXLAYOUT(mirrorcnt), - M_NFSDSTATE, M_WAITOK | M_ZERO); + lyp = malloc(sizeof(struct nfslayout) + + NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt), M_NFSDSTATE, + M_WAITOK | M_ZERO); lyp->lay_type = NFSLAYOUT_FLEXFILE; if (iomode == NFSLAYOUTIOMODE_RW) lyp->lay_flags = NFSLAY_RW; @@ -6836,41 +6838,42 @@ nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, int mirrorcnt, /* Fill in the xdr for the files layout. */ tl = (uint32_t *)lyp->lay_xdr; - lenval = 0; - txdr_hyper(lenval, tl); tl += 2; /* Stripe unit. */ + txdr_hyper(stripesiz, tl); tl += 2; /* Stripe unit. */ *tl++ = txdr_unsigned(mirrorcnt); /* # of mirrors. */ for (i = 0; i < mirrorcnt; i++) { - *tl++ = txdr_unsigned(1); /* One stripe. */ - NFSBCOPY(devid, tl, NFSX_V4DEVICEID); /* Device ID. */ - tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); - devid += NFSX_V4DEVICEID; - *tl++ = txdr_unsigned(1); /* Efficiency. */ - *tl++ = 0; /* Proxy Stateid. */ - *tl++ = 0x55555555; - *tl++ = 0x55555555; - *tl++ = 0x55555555; - *tl++ = txdr_unsigned(1); /* 1 file handle. */ - *tl++ = txdr_unsigned(NFSX_V4PNFSFH); - NFSBCOPY(dsfhp, tl, sizeof(*dsfhp)); - tl += (NFSM_RNDUP(NFSX_V4PNFSFH) / NFSX_UNSIGNED); - dsfhp++; - if (nfsrv_flexlinuxhack != 0) { - *tl++ = txdr_unsigned(strlen(FLEX_UID0)); - *tl = 0; /* 0 pad string. */ - NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0)); - *tl++ = txdr_unsigned(strlen(FLEX_UID0)); - *tl = 0; /* 0 pad string. */ - NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0)); - } else { - *tl++ = txdr_unsigned(strlen(FLEX_OWNERID)); - NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED); - *tl++ = txdr_unsigned(strlen(FLEX_OWNERID)); - NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED); + *tl++ = txdr_unsigned(stripecnt); /* Stripe cnt. */ + for (j = 0; j < stripecnt; j++) { + NFSBCOPY(devid, tl, NFSX_V4DEVICEID); /* Device ID. */ + tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED); + devid += NFSX_V4DEVICEID; + *tl++ = txdr_unsigned(1); /* Efficiency. */ + *tl++ = 0; /* Proxy Stateid. */ + *tl++ = 0x55555555; + *tl++ = 0x55555555; + *tl++ = 0x55555555; + *tl++ = txdr_unsigned(1); /* 1 file handle. */ + *tl++ = txdr_unsigned(NFSX_V4PNFSFH); + NFSBCOPY(dsfhp, tl, sizeof(*dsfhp)); + tl += (NFSM_RNDUP(NFSX_V4PNFSFH) / NFSX_UNSIGNED); + dsfhp++; + if (nfsrv_flexlinuxhack != 0) { + *tl++ = txdr_unsigned(strlen(FLEX_UID0)); + *tl = 0; /* 0 pad string. */ + NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0)); + *tl++ = txdr_unsigned(strlen(FLEX_UID0)); + *tl = 0; /* 0 pad string. */ + NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0)); + } else { + *tl++ = txdr_unsigned(strlen(FLEX_OWNERID)); + NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED); + *tl++ = txdr_unsigned(strlen(FLEX_OWNERID)); + NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED); + } } } *tl++ = txdr_unsigned(0); /* ff_flags. */ *tl = txdr_unsigned(60); /* Status interval hint. */ - lyp->lay_layoutlen = NFSX_V4FLEXLAYOUT(mirrorcnt); + lyp->lay_layoutlen = NFSX_V4FLEXLAYOUT(mirrorcnt, stripecnt); return (lyp); } @@ -7126,7 +7129,7 @@ nfsrv_layoutreturn(struct nfsrv_descript *nd, vnode_t vp, error = nfsvno_getfh(vp, &fh, p); if (error == 0) { error = nfsrv_updatemdsattr(vp, &na, p); - if (error != 0) + if (error != 0 && error != ESTALE) printf("nfsrv_layoutreturn: updatemdsattr" " failed=%d\n", error); } @@ -8454,7 +8457,7 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf, struct vnode *vp, *curvp; struct pnfsdsfile *pf; struct nfsmount *nmp, *curnmp; - int dsdir, error, mirrorcnt, ippos; + int dsdir, error, ippos; vp = NULL; curvp = NULL; @@ -8591,7 +8594,7 @@ nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf, * on the MDS file (as checked via the nmp argument), * nfsrv_dsgetsockmnt() returns EEXIST, so no copying will occur. */ - error = nfsrv_dsgetsockmnt(vp, 0, buf, buflenp, &mirrorcnt, p, + error = nfsrv_dsgetsockmnt(vp, 0, buf, buflenp, NULL, NULL, NULL, p, NULL, NULL, NULL, fname, nvpp, &nmp, curnmp, &ippos, &dsdir); if (curvp != NULL) vput(curvp);