vfs_syscalls.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*      $NetBSD: vfs_syscalls.c,v 1.556 2022/11/02 20:38:22 andvar Exp $        */
    2 
    3 /*-
    4  * Copyright (c) 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  * Copyright (c) 1989, 1993
   34  *      The Regents of the University of California.  All rights reserved.
   35  * (c) UNIX System Laboratories, Inc.
   36  * All or some portions of this file are derived from material licensed
   37  * to the University of California by American Telephone and Telegraph
   38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   39  * the permission of UNIX System Laboratories, Inc.
   40  *
   41  * Redistribution and use in source and binary forms, with or without
   42  * modification, are permitted provided that the following conditions
   43  * are met:
   44  * 1. Redistributions of source code must retain the above copyright
   45  *    notice, this list of conditions and the following disclaimer.
   46  * 2. Redistributions in binary form must reproduce the above copyright
   47  *    notice, this list of conditions and the following disclaimer in the
   48  *    documentation and/or other materials provided with the distribution.
   49  * 3. Neither the name of the University nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   63  * SUCH DAMAGE.
   64  *
   65  *      @(#)vfs_syscalls.c      8.42 (Berkeley) 7/31/95
   66  */
   67 
   68 /*
   69  * Virtual File System System Calls
   70  */
   71 
   72 #include <sys/cdefs.h>
   73 __KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.556 2022/11/02 20:38:22 andvar Exp $");
   74 
   75 #ifdef _KERNEL_OPT
   76 #include "opt_fileassoc.h"
   77 #include "veriexec.h"
   78 #endif
   79 
   80 #include <sys/param.h>
   81 #include <sys/systm.h>
   82 #include <sys/namei.h>
   83 #include <sys/filedesc.h>
   84 #include <sys/kernel.h>
   85 #include <sys/file.h>
   86 #include <sys/fcntl.h>
   87 #include <sys/stat.h>
   88 #include <sys/vnode.h>
   89 #include <sys/mount.h>
   90 #include <sys/fstrans.h>
   91 #include <sys/proc.h>
   92 #include <sys/uio.h>
   93 #include <sys/kmem.h>
   94 #include <sys/dirent.h>
   95 #include <sys/sysctl.h>
   96 #include <sys/syscallargs.h>
   97 #include <sys/vfs_syscalls.h>
   98 #include <sys/quota.h>
   99 #include <sys/quotactl.h>
  100 #include <sys/ktrace.h>
  101 #ifdef FILEASSOC
  102 #include <sys/fileassoc.h>
  103 #endif /* FILEASSOC */
  104 #include <sys/extattr.h>
  105 #include <sys/verified_exec.h>
  106 #include <sys/kauth.h>
  107 #include <sys/atomic.h>
  108 #include <sys/module.h>
  109 #include <sys/buf.h>
  110 #include <sys/event.h>
  111 #include <sys/compat_stub.h>
  112 
  113 #include <miscfs/genfs/genfs.h>
  114 #include <miscfs/specfs/specdev.h>
  115 
  116 #include <nfs/rpcv2.h>
  117 #include <nfs/nfsproto.h>
  118 #include <nfs/nfs.h>
  119 #include <nfs/nfs_var.h>
  120 
  121 /* XXX this shouldn't be here */
  122 #ifndef OFF_T_MAX
  123 #define OFF_T_MAX __type_max(off_t)
  124 #endif
  125 
  126 static int change_flags(struct vnode *, u_long, struct lwp *);
  127 static int change_mode(struct vnode *, int, struct lwp *);
  128 static int change_owner(struct vnode *, uid_t, gid_t, struct lwp *, int);
  129 static int do_sys_openat(lwp_t *, int, const char *, int, int, int *);
  130 static int do_sys_mkdirat(struct lwp *l, int, const char *, mode_t,
  131     enum uio_seg);
  132 static int do_sys_mkfifoat(struct lwp *, int, const char *, mode_t);
  133 static int do_sys_symlinkat(struct lwp *, const char *, int, const char *,
  134     enum uio_seg);
  135 static int do_sys_renameat(struct lwp *l, int, const char *, int, const char *,
  136     enum uio_seg, int);
  137 static int do_sys_readlinkat(struct lwp *, int, const char *, char *,
  138     size_t, register_t *);
  139 static int do_sys_unlinkat(struct lwp *, int, const char *, int, enum uio_seg);
  140 
  141 static int fd_nameiat(struct lwp *, int, struct nameidata *);
  142 static int fd_nameiat_simple_user(struct lwp *, int, const char *,
  143     namei_simple_flags_t, struct vnode **);
  144 
  145 /*
  146  * This table is used to maintain compatibility with 4.3BSD
  147  * and NetBSD 0.9 mount syscalls - and possibly other systems.
  148  * Note, the order is important!
  149  *
  150  * Do not modify this table. It should only contain filesystems
  151  * supported by NetBSD 0.9 and 4.3BSD.
  152  */
  153 const char * const mountcompatnames[] = {
  154         NULL,           /* 0 = MOUNT_NONE */
  155         MOUNT_FFS,      /* 1 = MOUNT_UFS */
  156         MOUNT_NFS,      /* 2 */
  157         MOUNT_MFS,      /* 3 */
  158         MOUNT_MSDOS,    /* 4 */
  159         MOUNT_CD9660,   /* 5 = MOUNT_ISOFS */
  160         MOUNT_FDESC,    /* 6 */
  161         MOUNT_KERNFS,   /* 7 */
  162         NULL,           /* 8 = MOUNT_DEVFS */
  163         MOUNT_AFS,      /* 9 */
  164 };
  165 
  166 const u_int nmountcompatnames = __arraycount(mountcompatnames);
  167 
  168 /*
  169  * Filter event method for EVFILT_FS.
  170  */
  171 static struct klist fs_klist;
  172 static kmutex_t fs_klist_lock;
  173 
  174 CTASSERT((NOTE_SUBMIT & VQ_MOUNT) == 0);
  175 CTASSERT((NOTE_SUBMIT & VQ_UNMOUNT) == 0);
  176 
  177 void
  178 vfs_evfilt_fs_init(void)
  179 {
  180         klist_init(&fs_klist);
  181         mutex_init(&fs_klist_lock, MUTEX_DEFAULT, IPL_NONE);
  182 }
  183 
  184 static int
  185 filt_fsattach(struct knote *kn)
  186 {
  187         mutex_enter(&fs_klist_lock);
  188         kn->kn_flags |= EV_CLEAR;
  189         klist_insert(&fs_klist, kn);
  190         mutex_exit(&fs_klist_lock);
  191 
  192         return 0;
  193 }
  194 
  195 static void
  196 filt_fsdetach(struct knote *kn)
  197 {
  198         mutex_enter(&fs_klist_lock);
  199         klist_remove(&fs_klist, kn);
  200         mutex_exit(&fs_klist_lock);
  201 }
  202 
  203 static int
  204 filt_fs(struct knote *kn, long hint)
  205 {
  206         int rv;
  207 
  208         if (hint & NOTE_SUBMIT) {
  209                 KASSERT(mutex_owned(&fs_klist_lock));
  210                 kn->kn_fflags |= hint & ~NOTE_SUBMIT;
  211         } else {
  212                 mutex_enter(&fs_klist_lock);
  213         }
  214 
  215         rv = (kn->kn_fflags != 0);
  216 
  217         if ((hint & NOTE_SUBMIT) == 0) {
  218                 mutex_exit(&fs_klist_lock);
  219         }
  220 
  221         return rv;
  222 }
  223 
  224 /* referenced in kern_event.c */
  225 const struct filterops fs_filtops = {
  226         .f_flags = FILTEROP_MPSAFE,
  227         .f_attach = filt_fsattach,
  228         .f_detach = filt_fsdetach,
  229         .f_event = filt_fs,
  230 };
  231 
  232 static int 
  233 fd_nameiat(struct lwp *l, int fdat, struct nameidata *ndp)
  234 {
  235         file_t *dfp;
  236         int error;
  237 
  238         if (fdat != AT_FDCWD) {
  239                 if ((error = fd_getvnode(fdat, &dfp)) != 0)
  240                         goto out;
  241 
  242                 NDAT(ndp, dfp->f_vnode);
  243         }
  244 
  245         error = namei(ndp);
  246 
  247         if (fdat != AT_FDCWD)
  248                 fd_putfile(fdat);
  249 out:
  250         return error;   
  251 }
  252 
  253 static int
  254 fd_nameiat_simple_user(struct lwp *l, int fdat, const char *path,
  255     namei_simple_flags_t sflags, struct vnode **vp_ret)
  256 {
  257         file_t *dfp;
  258         struct vnode *dvp;
  259         int error;
  260 
  261         if (fdat != AT_FDCWD) {
  262                 if ((error = fd_getvnode(fdat, &dfp)) != 0)
  263                         goto out;
  264 
  265                 dvp = dfp->f_vnode;
  266         } else {
  267                 dvp = NULL;
  268         }
  269 
  270         error = nameiat_simple_user(dvp, path, sflags, vp_ret);
  271 
  272         if (fdat != AT_FDCWD)
  273                 fd_putfile(fdat);
  274 out:
  275         return error;   
  276 }
  277 
  278 static int
  279 open_setfp(struct lwp *l, file_t *fp, struct vnode *vp, int indx, int flags)
  280 {
  281         int error;
  282 
  283         fp->f_flag = flags & FMASK;
  284         fp->f_type = DTYPE_VNODE;
  285         fp->f_ops = &vnops;
  286         fp->f_vnode = vp;
  287 
  288         if (flags & (O_EXLOCK | O_SHLOCK)) {
  289                 struct flock lf;
  290                 int type;
  291 
  292                 lf.l_whence = SEEK_SET;
  293                 lf.l_start = 0;
  294                 lf.l_len = 0;
  295                 if (flags & O_EXLOCK)
  296                         lf.l_type = F_WRLCK;
  297                 else
  298                         lf.l_type = F_RDLCK;
  299                 type = F_FLOCK;
  300                 if ((flags & FNONBLOCK) == 0)
  301                         type |= F_WAIT;
  302                 VOP_UNLOCK(vp);
  303                 error = VOP_ADVLOCK(vp, fp, F_SETLK, &lf, type);
  304                 if (error) {
  305                         (void) vn_close(vp, fp->f_flag, fp->f_cred);
  306                         fd_abort(l->l_proc, fp, indx);
  307                         return error;
  308                 }
  309                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  310                 atomic_or_uint(&fp->f_flag, FHASLOCK);
  311         }
  312         if (flags & O_CLOEXEC)
  313                 fd_set_exclose(l, indx, true);
  314         return 0;
  315 }
  316 
  317 static int
  318 mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
  319     void *data, size_t *data_len)
  320 {
  321         struct mount *mp;
  322         int error = 0, saved_flags;
  323 
  324         mp = vp->v_mount;
  325         saved_flags = mp->mnt_flag;
  326 
  327         /* We can operate only on VV_ROOT nodes. */
  328         if ((vp->v_vflag & VV_ROOT) == 0) {
  329                 error = EINVAL;
  330                 goto out;
  331         }
  332 
  333         /*
  334          * We only allow the filesystem to be reloaded if it
  335          * is currently mounted read-only.  Additionally, we
  336          * prevent read-write to read-only downgrades.
  337          */
  338         if ((flags & (MNT_RELOAD | MNT_RDONLY)) != 0 &&
  339             (mp->mnt_flag & MNT_RDONLY) == 0 &&
  340             (mp->mnt_iflag & IMNT_CAN_RWTORO) == 0) {
  341                 error = EOPNOTSUPP;     /* Needs translation */
  342                 goto out;
  343         }
  344 
  345         /*
  346          * Enabling MNT_UNION requires a covered mountpoint and
  347          * must not happen on the root mount.
  348          */
  349         if ((flags & MNT_UNION) != 0 && mp->mnt_vnodecovered == NULLVP) {
  350                 error = EOPNOTSUPP;
  351                 goto out;
  352         }
  353 
  354         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
  355             KAUTH_REQ_SYSTEM_MOUNT_UPDATE, mp, KAUTH_ARG(flags), data);
  356         if (error)
  357                 goto out;
  358 
  359         error = vfs_suspend(mp, 0);
  360         if (error)
  361                 goto out;
  362 
  363         mutex_enter(mp->mnt_updating);
  364 
  365         mp->mnt_flag &= ~MNT_OP_FLAGS;
  366         mp->mnt_flag |= flags & MNT_OP_FLAGS;
  367 
  368         /*
  369          * Set the mount level flags.
  370          */
  371         if ((flags & MNT_RDONLY) != (mp->mnt_flag & MNT_RDONLY)) {
  372                 if ((flags & MNT_RDONLY))
  373                         mp->mnt_iflag |= IMNT_WANTRDONLY;
  374                 else
  375                         mp->mnt_iflag |= IMNT_WANTRDWR;
  376         }
  377         mp->mnt_flag &= ~MNT_BASIC_FLAGS;
  378         mp->mnt_flag |= flags & MNT_BASIC_FLAGS;
  379         if ((mp->mnt_iflag & IMNT_WANTRDONLY))
  380                 mp->mnt_flag &= ~MNT_RDONLY;
  381 
  382         error = VFS_MOUNT(mp, path, data, data_len);
  383 
  384         if (error && data != NULL) {
  385                 int error2;
  386 
  387                 /*
  388                  * Update failed; let's try and see if it was an
  389                  * export request.  For compat with 3.0 and earlier.
  390                  */
  391                 error2 = vfs_hooks_reexport(mp, path, data);
  392 
  393                 /*
  394                  * Only update error code if the export request was
  395                  * understood but some problem occurred while
  396                  * processing it.
  397                  */
  398                 if (error2 != EJUSTRETURN)
  399                         error = error2;
  400         }
  401 
  402         if (error == 0 && (mp->mnt_iflag & IMNT_WANTRDONLY))
  403                 mp->mnt_flag |= MNT_RDONLY;
  404         if (error)
  405                 mp->mnt_flag = saved_flags;
  406         mp->mnt_flag &= ~MNT_OP_FLAGS;
  407         mp->mnt_iflag &= ~(IMNT_WANTRDONLY | IMNT_WANTRDWR);
  408         if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0) {
  409                 if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0)
  410                         vfs_syncer_add_to_worklist(mp);
  411         } else {
  412                 if ((mp->mnt_iflag & IMNT_ONWORKLIST) != 0)
  413                         vfs_syncer_remove_from_worklist(mp);
  414         }
  415         mutex_exit(mp->mnt_updating);
  416         vfs_resume(mp);
  417 
  418         if ((error == 0) && !(saved_flags & MNT_EXTATTR) && 
  419             (flags & MNT_EXTATTR)) {
  420                 if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, 
  421                                    NULL, 0, NULL) != 0) {
  422                         printf("%s: failed to start extattr, error = %d",
  423                                mp->mnt_stat.f_mntonname, error);
  424                         mp->mnt_flag &= ~MNT_EXTATTR;
  425                 }
  426         }
  427 
  428         if ((error == 0) && (saved_flags & MNT_EXTATTR) && 
  429             !(flags & MNT_EXTATTR)) {
  430                 if (VFS_EXTATTRCTL(mp, EXTATTR_CMD_STOP, 
  431                                    NULL, 0, NULL) != 0) {
  432                         printf("%s: failed to stop extattr, error = %d",
  433                                mp->mnt_stat.f_mntonname, error);
  434                         mp->mnt_flag |= MNT_RDONLY;
  435                 }
  436         }
  437  out:
  438         return (error);
  439 }
  440 
  441 static int
  442 mount_get_vfsops(const char *fstype, enum uio_seg type_seg,
  443     struct vfsops **vfsops)
  444 {
  445         char fstypename[sizeof(((struct statvfs *)NULL)->f_fstypename)];
  446         int error;
  447 
  448         if (type_seg == UIO_USERSPACE) {
  449                 /* Copy file-system type from userspace.  */
  450                 error = copyinstr(fstype, fstypename, sizeof(fstypename), NULL);
  451         } else {
  452                 error = copystr(fstype, fstypename, sizeof(fstypename), NULL);
  453                 KASSERT(error == 0);
  454         }
  455 
  456         if (error) {
  457                 /*
  458                  * Historically, filesystem types were identified by numbers.
  459                  * If we get an integer for the filesystem type instead of a
  460                  * string, we check to see if it matches one of the historic
  461                  * filesystem types.
  462                  */
  463                 u_long fsindex = (u_long)fstype;
  464                 if (fsindex >= nmountcompatnames ||
  465                     mountcompatnames[fsindex] == NULL)
  466                         return ENODEV;
  467                 strlcpy(fstypename, mountcompatnames[fsindex],
  468                     sizeof(fstypename));
  469         }
  470 
  471         /* Accept `ufs' as an alias for `ffs', for compatibility. */
  472         if (strcmp(fstypename, "ufs") == 0)
  473                 fstypename[0] = 'f';
  474 
  475         if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
  476                 return 0;
  477 
  478         /* If we can autoload a vfs module, try again */
  479         (void)module_autoload(fstypename, MODULE_CLASS_VFS);
  480 
  481         if ((*vfsops = vfs_getopsbyname(fstypename)) != NULL)
  482                 return 0;
  483 
  484         return ENODEV;
  485 }
  486 
  487 static int
  488 mount_getargs(struct lwp *l, struct vnode *vp, const char *path, int flags,
  489     void *data, size_t *data_len)
  490 {
  491         struct mount *mp;
  492         int error;
  493 
  494         /* If MNT_GETARGS is specified, it should be the only flag. */
  495         if (flags & ~MNT_GETARGS)
  496                 return EINVAL;
  497 
  498         mp = vp->v_mount;
  499 
  500         /* XXX: probably some notion of "can see" here if we want isolation. */ 
  501         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
  502             KAUTH_REQ_SYSTEM_MOUNT_GET, mp, data, NULL);
  503         if (error)
  504                 return error;
  505 
  506         if ((vp->v_vflag & VV_ROOT) == 0)
  507                 return EINVAL;
  508 
  509         if (vfs_busy(mp))
  510                 return EPERM;
  511 
  512         mutex_enter(mp->mnt_updating);
  513         mp->mnt_flag &= ~MNT_OP_FLAGS;
  514         mp->mnt_flag |= MNT_GETARGS;
  515         error = VFS_MOUNT(mp, path, data, data_len);
  516         mp->mnt_flag &= ~MNT_OP_FLAGS;
  517         mutex_exit(mp->mnt_updating);
  518 
  519         vfs_unbusy(mp);
  520         return (error);
  521 }
  522 
  523 int
  524 sys___mount50(struct lwp *l, const struct sys___mount50_args *uap, register_t *retval)
  525 {
  526         /* {
  527                 syscallarg(const char *) type;
  528                 syscallarg(const char *) path;
  529                 syscallarg(int) flags;
  530                 syscallarg(void *) data;
  531                 syscallarg(size_t) data_len;
  532         } */
  533 
  534         return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
  535             SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE,
  536             SCARG(uap, data_len), retval);
  537 }
  538 
  539 int
  540 do_sys_mount(struct lwp *l, const char *type, enum uio_seg type_seg,
  541     const char *path, int flags, void *data, enum uio_seg data_seg,
  542     size_t data_len, register_t *retval)
  543 {
  544         struct vfsops *vfsops = NULL;   /* XXX gcc4.8 */
  545         struct vnode *vp;
  546         void *data_buf = data;
  547         bool vfsopsrele = false;
  548         size_t alloc_sz = 0;
  549         int error;
  550 
  551         /*
  552          * Get vnode to be covered
  553          */
  554         error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
  555         if (error != 0) {
  556                 vp = NULL;
  557                 goto done;
  558         }
  559 
  560         if (flags & (MNT_GETARGS | MNT_UPDATE)) {
  561                 vfsops = vp->v_mount->mnt_op;
  562         } else {
  563                 /* 'type' is userspace */
  564                 error = mount_get_vfsops(type, type_seg, &vfsops);
  565                 if (error != 0)
  566                         goto done;
  567                 vfsopsrele = true;
  568         }
  569 
  570         /*
  571          * We allow data to be NULL, even for userspace. Some fs's don't need
  572          * it. The others will handle NULL.
  573          */
  574         if (data != NULL && data_seg == UIO_USERSPACE) {
  575                 if (data_len == 0) {
  576                         /* No length supplied, use default for filesystem */
  577                         data_len = vfsops->vfs_min_mount_data;
  578 
  579                         /*
  580                          * Hopefully a longer buffer won't make copyin() fail.
  581                          * For compatibility with 3.0 and earlier.
  582                          */
  583                         if (flags & MNT_UPDATE
  584                             && data_len < sizeof (struct mnt_export_args30))
  585                                 data_len = sizeof (struct mnt_export_args30);
  586                 }
  587                 if ((data_len == 0) || (data_len > VFS_MAX_MOUNT_DATA)) {
  588                         error = EINVAL;
  589                         goto done;
  590                 }
  591                 alloc_sz = data_len;
  592                 data_buf = kmem_alloc(alloc_sz, KM_SLEEP);
  593 
  594                 /* NFS needs the buffer even for mnt_getargs .... */
  595                 error = copyin(data, data_buf, data_len);
  596                 if (error != 0)
  597                         goto done;
  598         }
  599 
  600         if (flags & MNT_GETARGS) {
  601                 if (data_len == 0) {
  602                         error = EINVAL;
  603                         goto done;
  604                 }
  605                 error = mount_getargs(l, vp, path, flags, data_buf, &data_len);
  606                 if (error != 0)
  607                         goto done;
  608                 if (data_seg == UIO_USERSPACE)
  609                         error = copyout(data_buf, data, data_len);
  610                 *retval = data_len;
  611         } else if (flags & MNT_UPDATE) {
  612                 error = mount_update(l, vp, path, flags, data_buf, &data_len);
  613         } else {
  614                 /* Locking is handled internally in mount_domount(). */
  615                 KASSERT(vfsopsrele == true);
  616                 error = mount_domount(l, &vp, vfsops, path, flags, data_buf,
  617                     &data_len);
  618                 vfsopsrele = false;
  619         }
  620         if (!error) {
  621                 mutex_enter(&fs_klist_lock);
  622                 KNOTE(&fs_klist, NOTE_SUBMIT | VQ_MOUNT);
  623                 mutex_exit(&fs_klist_lock);
  624         }
  625 
  626     done:
  627         if (vfsopsrele)
  628                 vfs_delref(vfsops);
  629         if (vp != NULL) {
  630                 vrele(vp);
  631         }
  632         if (data_buf != data)
  633                 kmem_free(data_buf, alloc_sz);
  634         return (error);
  635 }
  636 
  637 /*
  638  * Unmount a file system.
  639  *
  640  * Note: unmount takes a path to the vnode mounted on as argument,
  641  * not special file (as before).
  642  */
  643 /* ARGSUSED */
  644 int
  645 sys_unmount(struct lwp *l, const struct sys_unmount_args *uap, register_t *retval)
  646 {
  647         /* {
  648                 syscallarg(const char *) path;
  649                 syscallarg(int) flags;
  650         } */
  651         struct vnode *vp;
  652         struct mount *mp;
  653         int error;
  654         struct pathbuf *pb;
  655         struct nameidata nd;
  656 
  657         error = pathbuf_copyin(SCARG(uap, path), &pb);
  658         if (error) {
  659                 return error;
  660         }
  661 
  662         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | TRYEMULROOT, pb);
  663         if ((error = namei(&nd)) != 0) {
  664                 pathbuf_destroy(pb);
  665                 return error;
  666         }
  667         vp = nd.ni_vp;
  668         pathbuf_destroy(pb);
  669 
  670         mp = vp->v_mount;
  671         vfs_ref(mp);
  672         VOP_UNLOCK(vp);
  673 
  674         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
  675             KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT, mp, NULL, NULL);
  676         if (error) {
  677                 vrele(vp);
  678                 vfs_rele(mp);
  679                 return (error);
  680         }
  681 
  682         /*
  683          * Don't allow unmounting the root file system.
  684          */
  685         if (mp->mnt_flag & MNT_ROOTFS) {
  686                 vrele(vp);
  687                 vfs_rele(mp);
  688                 return (EINVAL);
  689         }
  690 
  691         /*
  692          * Must be the root of the filesystem
  693          */
  694         if ((vp->v_vflag & VV_ROOT) == 0) {
  695                 vrele(vp);
  696                 vfs_rele(mp);
  697                 return (EINVAL);
  698         }
  699 
  700         vrele(vp);
  701         error = dounmount(mp, SCARG(uap, flags), l);
  702         vfs_rele(mp);
  703         if (!error) {
  704                 mutex_enter(&fs_klist_lock);
  705                 KNOTE(&fs_klist, NOTE_SUBMIT | VQ_UNMOUNT);
  706                 mutex_exit(&fs_klist_lock);
  707         }
  708         return error;
  709 }
  710 
  711 /*
  712  * Sync each mounted filesystem.
  713  */
  714 #ifdef DEBUG
  715 int syncprt = 0;
  716 struct ctldebug debug0 = { "syncprt", &syncprt };
  717 #endif
  718 
  719 void
  720 do_sys_sync(struct lwp *l)
  721 {
  722         mount_iterator_t *iter;
  723         struct mount *mp;
  724         int asyncflag;
  725 
  726         mountlist_iterator_init(&iter);
  727         while ((mp = mountlist_iterator_next(iter)) != NULL) {
  728                 mutex_enter(mp->mnt_updating);
  729                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
  730                         asyncflag = mp->mnt_flag & MNT_ASYNC;
  731                         mp->mnt_flag &= ~MNT_ASYNC;
  732                         VFS_SYNC(mp, MNT_NOWAIT, l->l_cred);
  733                         if (asyncflag)
  734                                  mp->mnt_flag |= MNT_ASYNC;
  735                 }
  736                 mutex_exit(mp->mnt_updating);
  737         }
  738         mountlist_iterator_destroy(iter);
  739 #ifdef DEBUG
  740         if (syncprt)
  741                 vfs_bufstats();
  742 #endif /* DEBUG */
  743 }
  744 
  745 static bool
  746 sync_vnode_filter(void *cookie, vnode_t *vp)
  747 {
  748 
  749         if (vp->v_numoutput > 0) {
  750                 ++*(int *)cookie;
  751         }
  752         return false;
  753 }
  754 
  755 int
  756 vfs_syncwait(void)
  757 {
  758         int nbusy, nbusy_prev, iter;
  759         struct vnode_iterator *vniter;
  760         mount_iterator_t *mpiter;
  761         struct mount *mp;
  762 
  763         for (nbusy_prev = 0, iter = 0; iter < 20;) {
  764                 nbusy = 0;
  765                 mountlist_iterator_init(&mpiter);
  766                 while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
  767                         vnode_t *vp __diagused;
  768                         vfs_vnode_iterator_init(mp, &vniter);
  769                         vp = vfs_vnode_iterator_next(vniter,
  770                             sync_vnode_filter, &nbusy);
  771                         KASSERT(vp == NULL);
  772                         vfs_vnode_iterator_destroy(vniter);
  773                 }
  774                 mountlist_iterator_destroy(mpiter);
  775 
  776                 if (nbusy == 0)
  777                         break;
  778                 if (nbusy_prev == 0)
  779                         nbusy_prev = nbusy;
  780                 printf("%d ", nbusy);
  781                 kpause("syncwait", false, MAX(1, hz / 25 * iter), NULL);
  782                 if (nbusy >= nbusy_prev) /* we didn't flush anything */
  783                         iter++;
  784                 else
  785                         nbusy_prev = nbusy;
  786         }
  787 
  788         if (nbusy) {
  789 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY)
  790                 printf("giving up\nPrinting vnodes for busy buffers\n");
  791                 mountlist_iterator_init(&mpiter);
  792                 while ((mp = mountlist_iterator_next(mpiter)) != NULL) {
  793                         vnode_t *vp;
  794                         vfs_vnode_iterator_init(mp, &vniter);
  795                         vp = vfs_vnode_iterator_next(vniter,
  796                             NULL, NULL);
  797                         mutex_enter(vp->v_interlock);
  798                         if (vp->v_numoutput > 0)
  799                                 vprint(NULL, vp);
  800                         mutex_exit(vp->v_interlock);
  801                         vrele(vp);
  802                         vfs_vnode_iterator_destroy(vniter);
  803                 }
  804                 mountlist_iterator_destroy(mpiter);
  805 #endif
  806         }
  807 
  808         return nbusy;
  809 }
  810 
  811 /* ARGSUSED */
  812 int
  813 sys_sync(struct lwp *l, const void *v, register_t *retval)
  814 {
  815         do_sys_sync(l);
  816         return (0);
  817 }
  818 
  819 
  820 /*
  821  * Access or change filesystem quotas.
  822  *
  823  * (this is really 14 different calls bundled into one)
  824  */
  825 
  826 static int
  827 do_sys_quotactl_stat(struct mount *mp, struct quotastat *info_u)
  828 {
  829         struct quotastat info_k;
  830         int error;
  831 
  832         /* ensure any padding bytes are cleared */
  833         memset(&info_k, 0, sizeof(info_k));
  834 
  835         error = vfs_quotactl_stat(mp, &info_k);
  836         if (error) {
  837                 return error;
  838         }
  839 
  840         return copyout(&info_k, info_u, sizeof(info_k));
  841 }
  842 
  843 static int
  844 do_sys_quotactl_idtypestat(struct mount *mp, int idtype,
  845     struct quotaidtypestat *info_u)
  846 {
  847         struct quotaidtypestat info_k;
  848         int error;
  849 
  850         /* ensure any padding bytes are cleared */
  851         memset(&info_k, 0, sizeof(info_k));
  852 
  853         error = vfs_quotactl_idtypestat(mp, idtype, &info_k);
  854         if (error) {
  855                 return error;
  856         }
  857 
  858         return copyout(&info_k, info_u, sizeof(info_k));
  859 }
  860 
  861 static int
  862 do_sys_quotactl_objtypestat(struct mount *mp, int objtype,
  863     struct quotaobjtypestat *info_u)
  864 {
  865         struct quotaobjtypestat info_k;
  866         int error;
  867 
  868         /* ensure any padding bytes are cleared */
  869         memset(&info_k, 0, sizeof(info_k));
  870 
  871         error = vfs_quotactl_objtypestat(mp, objtype, &info_k);
  872         if (error) {
  873                 return error;
  874         }
  875 
  876         return copyout(&info_k, info_u, sizeof(info_k));
  877 }
  878 
  879 static int
  880 do_sys_quotactl_get(struct mount *mp, const struct quotakey *key_u,
  881     struct quotaval *val_u)
  882 {
  883         struct quotakey key_k;
  884         struct quotaval val_k;
  885         int error;
  886 
  887         /* ensure any padding bytes are cleared */
  888         memset(&val_k, 0, sizeof(val_k));
  889 
  890         error = copyin(key_u, &key_k, sizeof(key_k));
  891         if (error) {
  892                 return error;
  893         }
  894 
  895         error = vfs_quotactl_get(mp, &key_k, &val_k);
  896         if (error) {
  897                 return error;
  898         }
  899 
  900         return copyout(&val_k, val_u, sizeof(val_k));
  901 }
  902 
  903 static int
  904 do_sys_quotactl_put(struct mount *mp, const struct quotakey *key_u,
  905     const struct quotaval *val_u)
  906 {
  907         struct quotakey key_k;
  908         struct quotaval val_k;
  909         int error;
  910 
  911         error = copyin(key_u, &key_k, sizeof(key_k));
  912         if (error) {
  913                 return error;
  914         }
  915 
  916         error = copyin(val_u, &val_k, sizeof(val_k));
  917         if (error) {
  918                 return error;
  919         }
  920 
  921         return vfs_quotactl_put(mp, &key_k, &val_k);
  922 }
  923 
  924 static int
  925 do_sys_quotactl_del(struct mount *mp, const struct quotakey *key_u)
  926 {
  927         struct quotakey key_k;
  928         int error;
  929 
  930         error = copyin(key_u, &key_k, sizeof(key_k));
  931         if (error) {
  932                 return error;
  933         }
  934 
  935         return vfs_quotactl_del(mp, &key_k);
  936 }
  937 
  938 static int
  939 do_sys_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor_u)
  940 {
  941         struct quotakcursor cursor_k;
  942         int error;
  943 
  944         /* ensure any padding bytes are cleared */
  945         memset(&cursor_k, 0, sizeof(cursor_k));
  946 
  947         error = vfs_quotactl_cursoropen(mp, &cursor_k);
  948         if (error) {
  949                 return error;
  950         }
  951 
  952         return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
  953 }
  954 
  955 static int
  956 do_sys_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor_u)
  957 {
  958         struct quotakcursor cursor_k;
  959         int error;
  960 
  961         error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
  962         if (error) {
  963                 return error;
  964         }
  965 
  966         return vfs_quotactl_cursorclose(mp, &cursor_k);
  967 }
  968 
  969 static int
  970 do_sys_quotactl_cursorskipidtype(struct mount *mp,
  971     struct quotakcursor *cursor_u, int idtype)
  972 {
  973         struct quotakcursor cursor_k;
  974         int error;
  975 
  976         error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
  977         if (error) {
  978                 return error;
  979         }
  980 
  981         error = vfs_quotactl_cursorskipidtype(mp, &cursor_k, idtype);
  982         if (error) {
  983                 return error;
  984         }
  985 
  986         return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
  987 }
  988 
  989 static int
  990 do_sys_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor_u,
  991     struct quotakey *keys_u, struct quotaval *vals_u, unsigned maxnum,
  992     unsigned *ret_u)
  993 {
  994 #define CGET_STACK_MAX 8
  995         struct quotakcursor cursor_k;
  996         struct quotakey stackkeys[CGET_STACK_MAX];
  997         struct quotaval stackvals[CGET_STACK_MAX];
  998         struct quotakey *keys_k;
  999         struct quotaval *vals_k;
 1000         unsigned ret_k;
 1001         int error;
 1002 
 1003         if (maxnum > 128) {
 1004                 maxnum = 128;
 1005         }
 1006 
 1007         error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
 1008         if (error) {
 1009                 return error;
 1010         }
 1011 
 1012         if (maxnum <= CGET_STACK_MAX) {
 1013                 keys_k = stackkeys;
 1014                 vals_k = stackvals;
 1015                 /* ensure any padding bytes are cleared */
 1016                 memset(keys_k, 0, maxnum * sizeof(keys_k[0]));
 1017                 memset(vals_k, 0, maxnum * sizeof(vals_k[0]));
 1018         } else {
 1019                 keys_k = kmem_zalloc(maxnum * sizeof(keys_k[0]), KM_SLEEP);
 1020                 vals_k = kmem_zalloc(maxnum * sizeof(vals_k[0]), KM_SLEEP);
 1021         }
 1022 
 1023         error = vfs_quotactl_cursorget(mp, &cursor_k, keys_k, vals_k, maxnum,
 1024                                        &ret_k);
 1025         if (error) {
 1026                 goto fail;
 1027         }
 1028 
 1029         error = copyout(keys_k, keys_u, ret_k * sizeof(keys_k[0]));
 1030         if (error) {
 1031                 goto fail;
 1032         }
 1033 
 1034         error = copyout(vals_k, vals_u, ret_k * sizeof(vals_k[0]));
 1035         if (error) {
 1036                 goto fail;
 1037         }
 1038 
 1039         error = copyout(&ret_k, ret_u, sizeof(ret_k));
 1040         if (error) {
 1041                 goto fail;
 1042         }
 1043 
 1044         /* do last to maximize the chance of being able to recover a failure */
 1045         error = copyout(&cursor_k, cursor_u, sizeof(cursor_k));
 1046 
 1047 fail:
 1048         if (keys_k != stackkeys) {
 1049                 kmem_free(keys_k, maxnum * sizeof(keys_k[0]));
 1050         }
 1051         if (vals_k != stackvals) {
 1052                 kmem_free(vals_k, maxnum * sizeof(vals_k[0]));
 1053         }
 1054         return error;
 1055 }
 1056 
 1057 static int
 1058 do_sys_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor_u,
 1059     int *ret_u)
 1060 {
 1061         struct quotakcursor cursor_k;
 1062         int ret_k;
 1063         int error;
 1064 
 1065         error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
 1066         if (error) {
 1067                 return error;
 1068         }
 1069 
 1070         error = vfs_quotactl_cursoratend(mp, &cursor_k, &ret_k);
 1071         if (error) {
 1072                 return error;
 1073         }
 1074 
 1075         error = copyout(&ret_k, ret_u, sizeof(ret_k));
 1076         if (error) {
 1077                 return error;
 1078         }
 1079 
 1080         return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
 1081 }
 1082 
 1083 static int
 1084 do_sys_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor_u)
 1085 {
 1086         struct quotakcursor cursor_k;
 1087         int error;
 1088 
 1089         error = copyin(cursor_u, &cursor_k, sizeof(cursor_k));
 1090         if (error) {
 1091                 return error;
 1092         }
 1093 
 1094         error = vfs_quotactl_cursorrewind(mp, &cursor_k);
 1095         if (error) {
 1096                 return error;
 1097         }
 1098 
 1099         return copyout(&cursor_k, cursor_u, sizeof(cursor_k));
 1100 }
 1101 
 1102 static int
 1103 do_sys_quotactl_quotaon(struct mount *mp, int idtype, const char *path_u)
 1104 {
 1105         char *path_k;
 1106         int error;
 1107 
 1108         /* XXX this should probably be a struct pathbuf */
 1109         path_k = PNBUF_GET();
 1110         error = copyin(path_u, path_k, PATH_MAX);
 1111         if (error) {
 1112                 PNBUF_PUT(path_k);
 1113                 return error;
 1114         }
 1115 
 1116         error = vfs_quotactl_quotaon(mp, idtype, path_k);
 1117 
 1118         PNBUF_PUT(path_k);
 1119         return error;
 1120 }
 1121 
 1122 static int
 1123 do_sys_quotactl_quotaoff(struct mount *mp, int idtype)
 1124 {
 1125         return vfs_quotactl_quotaoff(mp, idtype);
 1126 }
 1127 
 1128 int
 1129 do_sys_quotactl(const char *path_u, const struct quotactl_args *args)
 1130 {
 1131         struct mount *mp;
 1132         struct vnode *vp;
 1133         int error;
 1134 
 1135         error = namei_simple_user(path_u, NSM_FOLLOW_TRYEMULROOT, &vp);
 1136         if (error != 0)
 1137                 return (error);
 1138         mp = vp->v_mount;
 1139 
 1140         switch (args->qc_op) {
 1141             case QUOTACTL_STAT:
 1142                 error = do_sys_quotactl_stat(mp, args->u.stat.qc_info);
 1143                 break;
 1144             case QUOTACTL_IDTYPESTAT:
 1145                 error = do_sys_quotactl_idtypestat(mp,
 1146                                 args->u.idtypestat.qc_idtype,
 1147                                 args->u.idtypestat.qc_info);
 1148                 break;
 1149             case QUOTACTL_OBJTYPESTAT:
 1150                 error = do_sys_quotactl_objtypestat(mp,
 1151                                 args->u.objtypestat.qc_objtype,
 1152                                 args->u.objtypestat.qc_info);
 1153                 break;
 1154             case QUOTACTL_GET:
 1155                 error = do_sys_quotactl_get(mp,
 1156                                 args->u.get.qc_key,
 1157                                 args->u.get.qc_val);
 1158                 break;
 1159             case QUOTACTL_PUT:
 1160                 error = do_sys_quotactl_put(mp,
 1161                                 args->u.put.qc_key,
 1162                                 args->u.put.qc_val);
 1163                 break;
 1164             case QUOTACTL_DEL:
 1165                 error = do_sys_quotactl_del(mp, args->u.del.qc_key);
 1166                 break;
 1167             case QUOTACTL_CURSOROPEN:
 1168                 error = do_sys_quotactl_cursoropen(mp,
 1169                                 args->u.cursoropen.qc_cursor);
 1170                 break;
 1171             case QUOTACTL_CURSORCLOSE:
 1172                 error = do_sys_quotactl_cursorclose(mp,
 1173                                 args->u.cursorclose.qc_cursor);
 1174                 break;
 1175             case QUOTACTL_CURSORSKIPIDTYPE:
 1176                 error = do_sys_quotactl_cursorskipidtype(mp,
 1177                                 args->u.cursorskipidtype.qc_cursor,
 1178                                 args->u.cursorskipidtype.qc_idtype);
 1179                 break;
 1180             case QUOTACTL_CURSORGET:
 1181                 error = do_sys_quotactl_cursorget(mp,
 1182                                 args->u.cursorget.qc_cursor,
 1183                                 args->u.cursorget.qc_keys,
 1184                                 args->u.cursorget.qc_vals,
 1185                                 args->u.cursorget.qc_maxnum,
 1186                                 args->u.cursorget.qc_ret);
 1187                 break;
 1188             case QUOTACTL_CURSORATEND:
 1189                 error = do_sys_quotactl_cursoratend(mp,
 1190                                 args->u.cursoratend.qc_cursor,
 1191                                 args->u.cursoratend.qc_ret);
 1192                 break;
 1193             case QUOTACTL_CURSORREWIND:
 1194                 error = do_sys_quotactl_cursorrewind(mp,
 1195                                 args->u.cursorrewind.qc_cursor);
 1196                 break;
 1197             case QUOTACTL_QUOTAON:
 1198                 error = do_sys_quotactl_quotaon(mp,
 1199                                 args->u.quotaon.qc_idtype,
 1200                                 args->u.quotaon.qc_quotafile);
 1201                 break;
 1202             case QUOTACTL_QUOTAOFF:
 1203                 error = do_sys_quotactl_quotaoff(mp,
 1204                                 args->u.quotaoff.qc_idtype);
 1205                 break;
 1206             default:
 1207                 error = EINVAL;
 1208                 break;
 1209         }
 1210 
 1211         vrele(vp);
 1212         return error;
 1213 }
 1214 
 1215 /* ARGSUSED */
 1216 int
 1217 sys___quotactl(struct lwp *l, const struct sys___quotactl_args *uap,
 1218     register_t *retval)
 1219 {
 1220         /* {
 1221                 syscallarg(const char *) path;
 1222                 syscallarg(struct quotactl_args *) args;
 1223         } */
 1224         struct quotactl_args args;
 1225         int error;
 1226 
 1227         error = copyin(SCARG(uap, args), &args, sizeof(args));
 1228         if (error) {
 1229                 return error;
 1230         }
 1231 
 1232         return do_sys_quotactl(SCARG(uap, path), &args);
 1233 }
 1234 
/*
 * Fill in *sp with statistics for mount mp on behalf of LWP l, which
 * must be curlwp.
 *
 * flags selects between copying the cached mp->mnt_stat and querying
 * the filesystem through VFS_STATVFS().  If the process has a root
 * directory set (cwdi_rdir != NULL), f_mntonname is rewritten relative
 * to that root; a mount point that does not lie under it is reported
 * as "/" when root is non-zero and rejected with EPERM otherwise.
 *
 * Returns 0 on success or an errno value from VFS_STATVFS(),
 * getcwd_common(), or EPERM as described above.
 */
int
dostatvfs(struct mount *mp, struct statvfs *sp, struct lwp *l, int flags,
    int root)
{
        struct cwdinfo *cwdi = l->l_proc->p_cwdi;
        bool chrooted;
        int error = 0;

        KASSERT(l == curlwp);

        /*
         * This is safe unlocked.  cwdi_rdir never goes non-NULL -> NULL,
         * since it would imply chroots can be escaped.  Just make sure this
         * routine is self-consistent.
         */
        chrooted = (atomic_load_relaxed(&cwdi->cwdi_rdir) != NULL);

        /*
         * If MNT_NOWAIT or MNT_LAZY is specified, do not
         * refresh the fsstat cache. MNT_WAIT or MNT_LAZY
         * overrides MNT_NOWAIT.
         *
         * As written, the cached copy is used for every value of flags
         * other than exactly MNT_WAIT or 0.
         */
        if (flags == MNT_NOWAIT || flags == MNT_LAZY ||
            (flags != MNT_WAIT && flags != 0)) {
                memcpy(sp, &mp->mnt_stat, sizeof(*sp));
        } else {
                /* Get the filesystem stats now */
                memset(sp, 0, sizeof(*sp));
                if ((error = VFS_STATVFS(mp, sp)) != 0)
                        return error;
                /* The cache is only refreshed by non-chrooted callers. */
                if (!chrooted)
                        (void)memcpy(&mp->mnt_stat, sp, sizeof(mp->mnt_stat));
        }

        if (chrooted) {
                size_t len;
                char *bp;
                char c;
                char *path = PNBUF_GET();

                /*
                 * getcwd_common() is handed bp pointing at the terminating
                 * NUL at the end of the buffer; on success bp is used
                 * below as the start of the root directory's path string.
                 */
                bp = path + MAXPATHLEN;
                *--bp = '\0';
                rw_enter(&cwdi->cwdi_lock, RW_READER);
                error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp, path,
                    MAXPATHLEN / 2, 0, l);
                rw_exit(&cwdi->cwdi_lock);
                if (error) {
                        PNBUF_PUT(path);
                        return error;
                }
                len = strlen(bp);
                /*
                 * A one-character path needs no fix-up (presumably the
                 * process root is "/" itself -- confirm).
                 */
                if (len != 1) {
                        /*
                         * for mount points that are below our root, we can see
                         * them, so we fix up the pathname and return them. The
                         * rest we cannot see, so we don't allow viewing the
                         * data.
                         */
                        /*
                         * NOTE(review): the strlcpy() below may copy from
                         * inside f_mntonname onto its own start, i.e.
                         * source and destination overlap -- confirm that
                         * the kernel strlcpy() tolerates this.
                         */
                        if (strncmp(bp, sp->f_mntonname, len) == 0 &&
                            ((c = sp->f_mntonname[len]) == '/' || c == '\0')) {
                                (void)strlcpy(sp->f_mntonname,
                                    c == '\0' ? "/" : &sp->f_mntonname[len],
                                    sizeof(sp->f_mntonname));
                        } else {
                                if (root)
                                        (void)strlcpy(sp->f_mntonname, "/",
                                            sizeof(sp->f_mntonname));
                                else
                                        error = EPERM;
                        }
                }
                PNBUF_PUT(path);
        }
        /* Report only the mount flags selected by MNT_VISFLAGMASK. */
        sp->f_flag = mp->mnt_flag & MNT_VISFLAGMASK;
        return error;
}
 1311 
 1312 /*
 1313  * Get filesystem statistics by path.
 1314  */
 1315 int
 1316 do_sys_pstatvfs(struct lwp *l, const char *path, int flags, struct statvfs *sb)
 1317 {
 1318         struct mount *mp;
 1319         int error;
 1320         struct vnode *vp;
 1321 
 1322         error = namei_simple_user(path, NSM_FOLLOW_TRYEMULROOT, &vp);
 1323         if (error != 0)
 1324                 return error;
 1325         mp = vp->v_mount;
 1326         error = dostatvfs(mp, sb, l, flags, 1);
 1327         vrele(vp);
 1328         return error;
 1329 }
 1330 
 1331 /* ARGSUSED */
 1332 int
 1333 sys___statvfs190(struct lwp *l, const struct sys___statvfs190_args *uap, register_t *retval)
 1334 {
 1335         /* {
 1336                 syscallarg(const char *) path;
 1337                 syscallarg(struct statvfs *) buf;
 1338                 syscallarg(int) flags;
 1339         } */
 1340         struct statvfs *sb;
 1341         int error;
 1342 
 1343         sb = STATVFSBUF_GET();
 1344         error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
 1345         if (error == 0)
 1346                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
 1347         STATVFSBUF_PUT(sb);
 1348         return error;
 1349 }
 1350 
 1351 /*
 1352  * Get filesystem statistics by fd.
 1353  */
 1354 int
 1355 do_sys_fstatvfs(struct lwp *l, int fd, int flags, struct statvfs *sb)
 1356 {
 1357         file_t *fp;
 1358         struct mount *mp;
 1359         int error;
 1360 
 1361         /* fd_getvnode() will use the descriptor for us */
 1362         if ((error = fd_getvnode(fd, &fp)) != 0)
 1363                 return (error);
 1364         mp = fp->f_vnode->v_mount;
 1365         error = dostatvfs(mp, sb, curlwp, flags, 1);
 1366         fd_putfile(fd);
 1367         return error;
 1368 }
 1369 
 1370 /* ARGSUSED */
 1371 int
 1372 sys___fstatvfs190(struct lwp *l, const struct sys___fstatvfs190_args *uap, register_t *retval)
 1373 {
 1374         /* {
 1375                 syscallarg(int) fd;
 1376                 syscallarg(struct statvfs *) buf;
 1377                 syscallarg(int) flags;
 1378         } */
 1379         struct statvfs *sb;
 1380         int error;
 1381 
 1382         sb = STATVFSBUF_GET();
 1383         error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
 1384         if (error == 0)
 1385                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
 1386         STATVFSBUF_PUT(sb);
 1387         return error;
 1388 }
 1389 
 1390 
 1391 /*
 1392  * Get statistics on all filesystems.
 1393  */
 1394 int
 1395 do_sys_getvfsstat(struct lwp *l, void *sfsp, size_t bufsize, int flags,
 1396     int (*copyfn)(const void *, void *, size_t), size_t entry_sz,
 1397     register_t *retval)
 1398 {
 1399         int root = 0;
 1400         mount_iterator_t *iter;
 1401         struct proc *p = l->l_proc;
 1402         struct mount *mp;
 1403         struct statvfs *sb;
 1404         size_t count, maxcount;
 1405         int error = 0;
 1406 
 1407         sb = STATVFSBUF_GET();
 1408         maxcount = bufsize / entry_sz;
 1409         count = 0;
 1410         mountlist_iterator_init(&iter);
 1411         while ((mp = mountlist_iterator_next(iter)) != NULL) {
 1412                 if (sfsp && count < maxcount) {
 1413                         error = dostatvfs(mp, sb, l, flags, 0);
 1414                         if (error) {
 1415                                 error = 0;
 1416                                 continue;
 1417                         }
 1418                         error = copyfn(sb, sfsp, entry_sz);
 1419                         if (error)
 1420                                 goto out;
 1421                         sfsp = (char *)sfsp + entry_sz;
 1422                         root |= strcmp(sb->f_mntonname, "/") == 0;
 1423                 }
 1424                 count++;
 1425         }
 1426 
 1427         if (root == 0 && p->p_cwdi->cwdi_rdir) {
 1428                 /*
 1429                  * fake a root entry
 1430                  */
 1431                 error = dostatvfs(p->p_cwdi->cwdi_rdir->v_mount,
 1432                     sb, l, flags, 1);
 1433                 if (error != 0)
 1434                         goto out;
 1435                 if (sfsp) {
 1436                         error = copyfn(sb, sfsp, entry_sz);
 1437                         if (error != 0)
 1438                                 goto out;
 1439                 }
 1440                 count++;
 1441         }
 1442         if (sfsp && count > maxcount)
 1443                 *retval = maxcount;
 1444         else
 1445                 *retval = count;
 1446 out:
 1447         mountlist_iterator_destroy(iter);
 1448         STATVFSBUF_PUT(sb);
 1449         return error;
 1450 }
 1451 
 1452 int
 1453 sys___getvfsstat90(struct lwp *l, const struct sys___getvfsstat90_args *uap,
 1454     register_t *retval)
 1455 {
 1456         /* {
 1457                 syscallarg(struct statvfs *) buf;
 1458                 syscallarg(size_t) bufsize;
 1459                 syscallarg(int) flags;
 1460         } */
 1461 
 1462         return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
 1463             SCARG(uap, flags), copyout, sizeof (struct statvfs), retval);
 1464 }
 1465 
/*
 * Change current working directory to a given file descriptor.
 *
 * The descriptor must refer to a directory on which the caller's
 * credentials pass a VEXEC access check.  If filesystems are mounted
 * on that directory, the root of the topmost one becomes the target.
 * When the process has a root directory set, the target must lie under
 * it, otherwise EPERM is returned.
 *
 * retval is not used.  Returns 0 on success or an errno value.
 */
int
do_sys_fchdir(struct lwp *l, int fd, register_t *retval)
{
        struct proc *p = l->l_proc;
        struct cwdinfo *cwdi;
        struct vnode *vp, *tdp;
        struct mount *mp;
        file_t *fp;
        int error;

        /* fd_getvnode() will use the descriptor for us */
        if ((error = fd_getvnode(fd, &fp)) != 0)
                return error;
        vp = fp->f_vnode;

        /*
         * Take our own reference: it is either installed as the new
         * cwdi_cdir below or dropped on every failure path.
         */
        vref(vp);
        vn_lock(vp, LK_SHARED | LK_RETRY);
        if (vp->v_type != VDIR)
                error = ENOTDIR;
        else
                error = VOP_ACCESS(vp, VEXEC, l->l_cred);
        if (error) {
                vput(vp);
                goto out;
        }
        /*
         * Step across mount points: replace vp with the root vnode of
         * whatever is mounted on it, repeating while that root is itself
         * covered.  The covered vnode is released with vput() in each
         * round, so on the error exits no vnode is held any more.
         */
        while ((mp = vp->v_mountedhere) != NULL) {
                error = vfs_busy(mp);
                vput(vp);
                if (error != 0)
                        goto out;
                error = VFS_ROOT(mp, LK_SHARED, &tdp);
                vfs_unbusy(mp);
                if (error)
                        goto out;
                vp = tdp;
        }
        VOP_UNLOCK(vp);

        /*
         * Disallow changing to a directory not under the process's
         * current root directory (if there is one).
         */
        cwdi = p->p_cwdi;
        rw_enter(&cwdi->cwdi_lock, RW_WRITER);
        if (cwdi->cwdi_rdir && !vn_isunder(vp, NULL, l)) {
                vrele(vp);
                error = EPERM;  /* operation not permitted */
        } else {
                /* Drop the old working directory and install the new one. */
                vrele(cwdi->cwdi_cdir);
                cwdi->cwdi_cdir = vp;
        }
        rw_exit(&cwdi->cwdi_lock);

out:
        fd_putfile(fd);
        return error;
}
 1526 
 1527 /*
 1528  * Change current working directory to a given file descriptor.
 1529  */
 1530 /* ARGSUSED */
 1531 int
 1532 sys_fchdir(struct lwp *l, const struct sys_fchdir_args *uap, register_t *retval)
 1533 {
 1534         /* {
 1535                 syscallarg(int) fd;
 1536         } */
 1537         return do_sys_fchdir(l, SCARG(uap, fd), retval);
 1538 }
 1539 
 1540 /*
 1541  * Change this process's notion of the root directory to a given file
 1542  * descriptor.
 1543  */
 1544 int
 1545 sys_fchroot(struct lwp *l, const struct sys_fchroot_args *uap, register_t *retval)
 1546 {
 1547         struct vnode    *vp;
 1548         file_t  *fp;
 1549         int              error, fd = SCARG(uap, fd);
 1550 
 1551         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
 1552             KAUTH_REQ_SYSTEM_CHROOT_FCHROOT, NULL, NULL, NULL)) != 0)
 1553                 return error;
 1554         /* fd_getvnode() will use the descriptor for us */
 1555         if ((error = fd_getvnode(fd, &fp)) != 0)
 1556                 return error;
 1557         vp = fp->f_vnode;
 1558         vn_lock(vp, LK_SHARED | LK_RETRY);
 1559         if (vp->v_type != VDIR)
 1560                 error = ENOTDIR;
 1561         else
 1562                 error = VOP_ACCESS(vp, VEXEC, l->l_cred);
 1563         VOP_UNLOCK(vp);
 1564         if (error)
 1565                 goto out;
 1566         vref(vp);
 1567         change_root(vp);
 1568 
 1569  out:
 1570         fd_putfile(fd);
 1571         return (error);
 1572 }
 1573 
 1574 /*
 1575  * Change current working directory (``.'').
 1576  */
 1577 int
 1578 do_sys_chdir(struct lwp *l, const char *path, enum uio_seg seg,
 1579     register_t *retval)
 1580 {
 1581         struct proc *p = l->l_proc;
 1582         struct cwdinfo * cwdi;
 1583         int error;
 1584         struct vnode *vp;
 1585 
 1586         if ((error = chdir_lookup(path, seg, &vp, l)) != 0)
 1587                 return error;
 1588         cwdi = p->p_cwdi;
 1589         rw_enter(&cwdi->cwdi_lock, RW_WRITER);
 1590         vrele(cwdi->cwdi_cdir);
 1591         cwdi->cwdi_cdir = vp;
 1592         rw_exit(&cwdi->cwdi_lock);
 1593         return 0;
 1594 }
 1595 
 1596 /*
 1597  * Change current working directory (``.'').
 1598  */
 1599 /* ARGSUSED */
 1600 int
 1601 sys_chdir(struct lwp *l, const struct sys_chdir_args *uap, register_t *retval)
 1602 {
 1603         /* {
 1604                 syscallarg(const char *) path;
 1605         } */
 1606         return do_sys_chdir(l, SCARG(uap, path), UIO_USERSPACE, retval);
 1607 }
 1608 
 1609 /*
 1610  * Change notion of root (``/'') directory.
 1611  */
 1612 /* ARGSUSED */
 1613 int
 1614 sys_chroot(struct lwp *l, const struct sys_chroot_args *uap, register_t *retval)
 1615 {
 1616         /* {
 1617                 syscallarg(const char *) path;
 1618         } */
 1619         int error;
 1620         struct vnode *vp;
 1621 
 1622         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_CHROOT,
 1623             KAUTH_REQ_SYSTEM_CHROOT_CHROOT, NULL, NULL, NULL)) != 0)
 1624                 return (error);
 1625 
 1626         error = chdir_lookup(SCARG(uap, path), UIO_USERSPACE, &vp, l);
 1627         if (error == 0)
 1628                 change_root(vp);
 1629         return error;
 1630 }
 1631 
/*
 * Common routine for chroot and fchroot.
 * NB: callers need to properly authorize the change root operation.
 *
 * Makes vp the root directory of the current process.  vp is stored in
 * cwdi_rdir without taking a new reference here, so the reference held
 * by the caller is handed over.  If the current working directory is
 * not under the new root, it is replaced by the new root as well.
 * Finally a fresh credential is cloned from the process credential,
 * passed to kauth_proc_chroot(), and installed.
 */
void
change_root(struct vnode *vp)
{
        kauth_cred_t ncred;
        struct lwp *l = curlwp;
        struct proc *p = l->l_proc;
        struct cwdinfo *cwdi = p->p_cwdi;

        /* Allocated up front, before any lock is taken. */
        ncred = kauth_cred_alloc();

        rw_enter(&cwdi->cwdi_lock, RW_WRITER);
        /* Release the previous root directory, if there was one. */
        if (cwdi->cwdi_rdir != NULL)
                vrele(cwdi->cwdi_rdir);
        cwdi->cwdi_rdir = vp;

        /*
         * Prevent escaping from chroot by putting the root under
         * the working directory.  Silently chdir to / if we aren't
         * already there.
         */
        if (!vn_isunder(cwdi->cwdi_cdir, vp, l)) {
                /*
                 * XXX would be more failsafe to change directory to a
                 * deadfs node here instead
                 */
                vrele(cwdi->cwdi_cdir);
                /* cwdi_cdir needs its own reference to vp. */
                vref(vp);
                cwdi->cwdi_cdir = vp;
        }
        rw_exit(&cwdi->cwdi_lock);

        /* Get a write lock on the process credential. */
        proc_crmod_enter();

        kauth_cred_clone(p->p_cred, ncred);
        kauth_proc_chroot(ncred, p->p_cwdi);

        /* Broadcast our credentials to the process and other LWPs. */
        proc_crmod_leave(ncred, p->p_cred, true);
}
 1676 
 1677 /*
 1678  * Common routine for chroot and chdir.
 1679  * XXX "where" should be enum uio_seg
 1680  */
 1681 int
 1682 chdir_lookup(const char *path, int where, struct vnode **vpp, struct lwp *l)
 1683 {
 1684         struct pathbuf *pb;
 1685         struct nameidata nd;
 1686         int error;
 1687 
 1688         error = pathbuf_maybe_copyin(path, where, &pb);
 1689         if (error) {
 1690                 return error;
 1691         }
 1692         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
 1693         if ((error = namei(&nd)) != 0) {
 1694                 pathbuf_destroy(pb);
 1695                 return error;
 1696         }
 1697         *vpp = nd.ni_vp;
 1698         pathbuf_destroy(pb);
 1699 
 1700         if ((*vpp)->v_type != VDIR)
 1701                 error = ENOTDIR;
 1702         else
 1703                 error = VOP_ACCESS(*vpp, VEXEC, l->l_cred);
 1704 
 1705         if (error)
 1706                 vput(*vpp);
 1707         else
 1708                 VOP_UNLOCK(*vpp);
 1709         return (error);
 1710 }
 1711 
 1712 /*
 1713  * Internals of sys_open - path has already been converted into a pathbuf
 1714  * (so we can easily reuse this function from other parts of the kernel,
 1715  * like posix_spawn post-processing).
 1716  */
 1717 int
 1718 do_open(lwp_t *l, struct vnode *dvp, struct pathbuf *pb, int open_flags, 
 1719         int open_mode, int *fd)
 1720 {
 1721         struct proc *p = l->l_proc;
 1722         struct cwdinfo *cwdi = p->p_cwdi;
 1723         file_t *fp;
 1724         struct vnode *vp;
 1725         int dupfd;
 1726         bool dupfd_move;
 1727         int flags, cmode;
 1728         int indx, error;
 1729 
 1730         if (open_flags & O_SEARCH) {
 1731                 open_flags &= ~(int)O_SEARCH;
 1732         }
 1733 
 1734         /*
 1735          * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 1736          * may be specified.
 1737          */     
 1738         if ((open_flags & O_EXEC) && (open_flags & O_ACCMODE))
 1739                 return EINVAL;
 1740 
 1741         flags = FFLAGS(open_flags);
 1742         if ((flags & (FREAD | FWRITE)) == 0)
 1743                 return EINVAL;
 1744 
 1745         if ((error = fd_allocfile(&fp, &indx)) != 0) {
 1746                 return error;
 1747         }
 1748 
 1749         /* We're going to read cwdi->cwdi_cmask unlocked here. */
 1750         cmode = ((open_mode &~ cwdi->cwdi_cmask) & ALLPERMS) &~ S_ISTXT;
 1751         
 1752         error = vn_open(dvp, pb, TRYEMULROOT, flags, cmode,
 1753             &vp, &dupfd_move, &dupfd);
 1754         if (error != 0) {
 1755                 fd_abort(p, fp, indx);
 1756                 if (error == ERESTART)
 1757                         error = EINTR;
 1758                 return error;
 1759         }
 1760 
 1761         if (vp == NULL) {
 1762                 fd_abort(p, fp, indx);
 1763                 error = fd_dupopen(dupfd, dupfd_move, flags, &indx);
 1764                 if (error)
 1765                         return error;
 1766                 *fd = indx;
 1767         } else {
 1768                 error = open_setfp(l, fp, vp, indx, flags);
 1769                 if (error)
 1770                         return error;
 1771                 VOP_UNLOCK(vp);
 1772                 *fd = indx;
 1773                 fd_affix(p, fp, indx);
 1774         }
 1775 
 1776         return 0;
 1777 }
 1778 
 1779 int
 1780 fd_open(const char *path, int open_flags, int open_mode, int *fd)
 1781 {
 1782         struct pathbuf *pb;
 1783         int error, oflags;
 1784 
 1785         oflags = FFLAGS(open_flags);
 1786         if ((oflags & (FREAD | FWRITE)) == 0)
 1787                 return EINVAL;
 1788 
 1789         pb = pathbuf_create(path);
 1790         if (pb == NULL)
 1791                 return ENOMEM;
 1792 
 1793         error = do_open(curlwp, NULL, pb, open_flags, open_mode, fd);
 1794         pathbuf_destroy(pb);
 1795 
 1796         return error;
 1797 }
 1798 
 1799 static int
 1800 do_sys_openat(lwp_t *l, int fdat, const char *path, int flags,
 1801     int mode, int *fd)
 1802 {
 1803         file_t *dfp = NULL;
 1804         struct vnode *dvp = NULL;
 1805         struct pathbuf *pb;
 1806         const char *pathstring = NULL;
 1807         int error;
 1808 
 1809         if (path == NULL) {
 1810                 MODULE_HOOK_CALL(vfs_openat_10_hook, (&pb), enosys(), error);
 1811                 if (error == ENOSYS)
 1812                         goto no_compat;
 1813                 if (error)
 1814                         return error;
 1815         } else {
 1816 no_compat:
 1817                 error = pathbuf_copyin(path, &pb);
 1818                 if (error)
 1819                         return error;
 1820         }
 1821 
 1822         pathstring = pathbuf_stringcopy_get(pb);
 1823 
 1824         /* 
 1825          * fdat is ignored if:
 1826          * 1) if fdat is AT_FDCWD, which means use current directory as base.
 1827          * 2) if path is absolute, then fdat is useless.
 1828          */
 1829         if (fdat != AT_FDCWD && pathstring[0] != '/') {
 1830                 /* fd_getvnode() will use the descriptor for us */
 1831                 if ((error = fd_getvnode(fdat, &dfp)) != 0)
 1832                         goto out;
 1833 
 1834                 dvp = dfp->f_vnode;
 1835         }
 1836 
 1837         error = do_open(l, dvp, pb, flags, mode, fd);
 1838 
 1839         if (dfp != NULL)
 1840                 fd_putfile(fdat);
 1841 out:
 1842         pathbuf_stringcopy_put(pb, pathstring);
 1843         pathbuf_destroy(pb);
 1844         return error;
 1845 }
 1846 
 1847 int
 1848 sys_open(struct lwp *l, const struct sys_open_args *uap, register_t *retval)
 1849 {
 1850         /* {
 1851                 syscallarg(const char *) path;
 1852                 syscallarg(int) flags;
 1853                 syscallarg(int) mode;
 1854         } */
 1855         int error;
 1856         int fd;
 1857 
 1858         error = do_sys_openat(l, AT_FDCWD, SCARG(uap, path),
 1859                               SCARG(uap, flags), SCARG(uap, mode), &fd);
 1860 
 1861         if (error == 0)
 1862                 *retval = fd;
 1863 
 1864         return error;
 1865 }
 1866 
 1867 int
 1868 sys_openat(struct lwp *l, const struct sys_openat_args *uap, register_t *retval)
 1869 {
 1870         /* {
 1871                 syscallarg(int) fd;
 1872                 syscallarg(const char *) path;
 1873                 syscallarg(int) oflags;
 1874                 syscallarg(int) mode;
 1875         } */
 1876         int error;
 1877         int fd;
 1878 
 1879         error = do_sys_openat(l, SCARG(uap, fd), SCARG(uap, path),
 1880                               SCARG(uap, oflags), SCARG(uap, mode), &fd);
 1881 
 1882         if (error == 0)
 1883                 *retval = fd;
 1884 
 1885         return error;
 1886 }
 1887 
 1888 static void
 1889 vfs__fhfree(fhandle_t *fhp)
 1890 {
 1891         size_t fhsize;
 1892 
 1893         fhsize = FHANDLE_SIZE(fhp);
 1894         kmem_free(fhp, fhsize);
 1895 }
 1896 
 1897 /*
 1898  * vfs_composefh: compose a filehandle.
 1899  */
 1900 
 1901 int
 1902 vfs_composefh(struct vnode *vp, fhandle_t *fhp, size_t *fh_size)
 1903 {
 1904         struct mount *mp;
 1905         struct fid *fidp;
 1906         int error;
 1907         size_t needfhsize;
 1908         size_t fidsize;
 1909 
 1910         mp = vp->v_mount;
 1911         fidp = NULL;
 1912         if (*fh_size < FHANDLE_SIZE_MIN) {
 1913                 fidsize = 0;
 1914         } else {
 1915                 fidsize = *fh_size - offsetof(fhandle_t, fh_fid);
 1916                 if (fhp != NULL) {
 1917                         memset(fhp, 0, *fh_size);
 1918                         fhp->fh_fsid = mp->mnt_stat.f_fsidx;
 1919                         fidp = &fhp->fh_fid;
 1920                 }
 1921         }
 1922         error = VFS_VPTOFH(vp, fidp, &fidsize);
 1923         needfhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
 1924         if (error == 0 && *fh_size < needfhsize) {
 1925                 error = E2BIG;
 1926         }
 1927         *fh_size = needfhsize;
 1928         return error;
 1929 }
 1930 
 1931 int
 1932 vfs_composefh_alloc(struct vnode *vp, fhandle_t **fhpp)
 1933 {
 1934         struct mount *mp;
 1935         fhandle_t *fhp;
 1936         size_t fhsize;
 1937         size_t fidsize;
 1938         int error;
 1939 
 1940         mp = vp->v_mount;
 1941         fidsize = 0;
 1942         error = VFS_VPTOFH(vp, NULL, &fidsize);
 1943         KASSERT(error != 0);
 1944         if (error != E2BIG) {
 1945                 goto out;
 1946         }
 1947         fhsize = FHANDLE_SIZE_FROM_FILEID_SIZE(fidsize);
 1948         fhp = kmem_zalloc(fhsize, KM_SLEEP);
 1949         fhp->fh_fsid = mp->mnt_stat.f_fsidx;
 1950         error = VFS_VPTOFH(vp, &fhp->fh_fid, &fidsize);
 1951         if (error == 0) {
 1952                 KASSERT((FHANDLE_SIZE(fhp) == fhsize &&
 1953                     FHANDLE_FILEID(fhp)->fid_len == fidsize));
 1954                 *fhpp = fhp;
 1955         } else {
 1956                 kmem_free(fhp, fhsize);
 1957         }
 1958 out:
 1959         return error;
 1960 }
 1961 
 1962 void
 1963 vfs_composefh_free(fhandle_t *fhp)
 1964 {
 1965 
 1966         vfs__fhfree(fhp);
 1967 }
 1968 
 1969 /*
 1970  * vfs_fhtovp: lookup a vnode by a filehandle.
 1971  */
 1972 
 1973 int
 1974 vfs_fhtovp(fhandle_t *fhp, struct vnode **vpp)
 1975 {
 1976         struct mount *mp;
 1977         int error;
 1978 
 1979         *vpp = NULL;
 1980         mp = vfs_getvfs(FHANDLE_FSID(fhp));
 1981         if (mp == NULL) {
 1982                 error = ESTALE;
 1983                 goto out;
 1984         }
 1985         if (mp->mnt_op->vfs_fhtovp == NULL) {
 1986                 error = EOPNOTSUPP;
 1987                 goto out;
 1988         }
 1989         error = VFS_FHTOVP(mp, FHANDLE_FILEID(fhp), LK_EXCLUSIVE, vpp);
 1990 out:
 1991         return error;
 1992 }
 1993 
 1994 /*
 1995  * vfs_copyinfh_alloc: allocate and copyin a filehandle, given
 1996  * the needed size.
 1997  */
 1998 
 1999 int
 2000 vfs_copyinfh_alloc(const void *ufhp, size_t fhsize, fhandle_t **fhpp)
 2001 {
 2002         fhandle_t *fhp;
 2003         int error;
 2004 
 2005         if (fhsize > FHANDLE_SIZE_MAX) {
 2006                 return EINVAL;
 2007         }
 2008         if (fhsize < FHANDLE_SIZE_MIN) {
 2009                 return EINVAL;
 2010         }
 2011 again:
 2012         fhp = kmem_alloc(fhsize, KM_SLEEP);
 2013         error = copyin(ufhp, fhp, fhsize);
 2014         if (error == 0) {
 2015                 /* XXX this check shouldn't be here */
 2016                 if (FHANDLE_SIZE(fhp) == fhsize) {
 2017                         *fhpp = fhp;
 2018                         return 0;
 2019                 } else if (fhsize == NFSX_V2FH && FHANDLE_SIZE(fhp) < fhsize) {
 2020                         /*
 2021                          * a kludge for nfsv2 padded handles.
 2022                          */
 2023                         size_t sz;
 2024 
 2025                         sz = FHANDLE_SIZE(fhp);
 2026                         kmem_free(fhp, fhsize);
 2027                         fhsize = sz;
 2028                         goto again;
 2029                 } else {
 2030                         /*
 2031                          * userland told us wrong size.
 2032                          */
 2033                         error = EINVAL;
 2034                 }
 2035         }
 2036         kmem_free(fhp, fhsize);
 2037         return error;
 2038 }
 2039 
 2040 void
 2041 vfs_copyinfh_free(fhandle_t *fhp)
 2042 {
 2043 
 2044         vfs__fhfree(fhp);
 2045 }
 2046 
 2047 /*
 2048  * Get file handle system call
 2049  */
 2050 int
 2051 sys___getfh30(struct lwp *l, const struct sys___getfh30_args *uap, register_t *retval)
 2052 {
 2053         /* {
 2054                 syscallarg(char *) fname;
 2055                 syscallarg(fhandle_t *) fhp;
 2056                 syscallarg(size_t *) fh_size;
 2057         } */
 2058         struct vnode *vp;
 2059         fhandle_t *fh;
 2060         int error;
 2061         struct pathbuf *pb;
 2062         struct nameidata nd;
 2063         size_t sz;
 2064         size_t usz;
 2065 
 2066         /*
 2067          * Must be super user
 2068          */
 2069         error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
 2070             0, NULL, NULL, NULL);
 2071         if (error)
 2072                 return (error);
 2073 
 2074         error = pathbuf_copyin(SCARG(uap, fname), &pb);
 2075         if (error) {
 2076                 return error;
 2077         }
 2078         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
 2079         error = namei(&nd);
 2080         if (error) {
 2081                 pathbuf_destroy(pb);
 2082                 return error;
 2083         }
 2084         vp = nd.ni_vp;
 2085         pathbuf_destroy(pb);
 2086 
 2087         error = vfs_composefh_alloc(vp, &fh);
 2088         vput(vp);
 2089         if (error != 0) {
 2090                 return error;
 2091         }
 2092         error = copyin(SCARG(uap, fh_size), &usz, sizeof(size_t));
 2093         if (error != 0) {
 2094                 goto out;
 2095         }
 2096         sz = FHANDLE_SIZE(fh);
 2097         error = copyout(&sz, SCARG(uap, fh_size), sizeof(size_t));
 2098         if (error != 0) {
 2099                 goto out;
 2100         }
 2101         if (usz >= sz) {
 2102                 error = copyout(fh, SCARG(uap, fhp), sz);
 2103         } else {
 2104                 error = E2BIG;
 2105         }
 2106 out:
 2107         vfs_composefh_free(fh);
 2108         return (error);
 2109 }
 2110 
 2111 /*
 2112  * Open a file given a file handle.
 2113  *
 2114  * Check permissions, allocate an open file structure,
 2115  * and call the device open routine if any.
 2116  */
 2117 
 2118 int
 2119 dofhopen(struct lwp *l, const void *ufhp, size_t fhsize, int oflags,
 2120     register_t *retval)
 2121 {
 2122         file_t *fp;
 2123         struct vnode *vp = NULL;
 2124         kauth_cred_t cred = l->l_cred;
 2125         file_t *nfp;
 2126         int indx, error;
 2127         struct vattr va;
 2128         fhandle_t *fh;
 2129         int flags;
 2130         proc_t *p;
 2131 
 2132         p = curproc;
 2133 
 2134         /*
 2135          * Must be super user
 2136          */
 2137         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
 2138             0, NULL, NULL, NULL)))
 2139                 return (error);
 2140 
 2141         if (oflags & O_SEARCH) {
 2142                 oflags &= ~(int)O_SEARCH;
 2143         }
 2144 
 2145         flags = FFLAGS(oflags);
 2146         if ((flags & (FREAD | FWRITE)) == 0)
 2147                 return (EINVAL);
 2148         if ((flags & O_CREAT))
 2149                 return (EINVAL);
 2150         if ((error = fd_allocfile(&nfp, &indx)) != 0)
 2151                 return (error);
 2152         fp = nfp;
 2153         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
 2154         if (error != 0) {
 2155                 goto bad;
 2156         }
 2157         error = vfs_fhtovp(fh, &vp);
 2158         vfs_copyinfh_free(fh);
 2159         if (error != 0) {
 2160                 goto bad;
 2161         }
 2162 
 2163         /* Now do an effective vn_open */
 2164 
 2165         if (vp->v_type == VSOCK) {
 2166                 error = EOPNOTSUPP;
 2167                 goto bad;
 2168         }
 2169         error = vn_openchk(vp, cred, flags);
 2170         if (error != 0)
 2171                 goto bad;
 2172         if (flags & O_TRUNC) {
 2173                 VOP_UNLOCK(vp);                 /* XXX */
 2174                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);   /* XXX */
 2175                 vattr_null(&va);
 2176                 va.va_size = 0;
 2177                 error = VOP_SETATTR(vp, &va, cred);
 2178                 if (error)
 2179                         goto bad;
 2180         }
 2181         if ((error = VOP_OPEN(vp, flags, cred)) != 0)
 2182                 goto bad;
 2183         if (flags & FWRITE) {
 2184                 mutex_enter(vp->v_interlock);
 2185                 vp->v_writecount++;
 2186                 mutex_exit(vp->v_interlock);
 2187         }
 2188 
 2189         /* done with modified vn_open, now finish what sys_open does. */
 2190         if ((error = open_setfp(l, fp, vp, indx, flags)))
 2191                 return error;
 2192 
 2193         VOP_UNLOCK(vp);
 2194         *retval = indx;
 2195         fd_affix(p, fp, indx);
 2196         return (0);
 2197 
 2198 bad:
 2199         fd_abort(p, fp, indx);
 2200         if (vp != NULL)
 2201                 vput(vp);
 2202         if (error == EDUPFD || error == EMOVEFD) {
 2203                 /* XXX should probably close curlwp->l_dupfd */
 2204                 error = EOPNOTSUPP;
 2205         }
 2206         return (error);
 2207 }
 2208 
 2209 int
 2210 sys___fhopen40(struct lwp *l, const struct sys___fhopen40_args *uap, register_t *retval)
 2211 {
 2212         /* {
 2213                 syscallarg(const void *) fhp;
 2214                 syscallarg(size_t) fh_size;
 2215                 syscallarg(int) flags;
 2216         } */
 2217 
 2218         return dofhopen(l, SCARG(uap, fhp), SCARG(uap, fh_size),
 2219             SCARG(uap, flags), retval);
 2220 }
 2221 
 2222 int
 2223 do_fhstat(struct lwp *l, const void *ufhp, size_t fhsize, struct stat *sb)
 2224 {
 2225         int error;
 2226         fhandle_t *fh;
 2227         struct vnode *vp;
 2228 
 2229         /*
 2230          * Must be super user
 2231          */
 2232         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
 2233             0, NULL, NULL, NULL)))
 2234                 return (error);
 2235 
 2236         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
 2237         if (error != 0)
 2238                 return error;
 2239 
 2240         error = vfs_fhtovp(fh, &vp);
 2241         vfs_copyinfh_free(fh);
 2242         if (error != 0)
 2243                 return error;
 2244 
 2245         error = vn_stat(vp, sb);
 2246         vput(vp);
 2247         return error;
 2248 }
 2249 
 2250 
 2251 /* ARGSUSED */
 2252 int
 2253 sys___fhstat50(struct lwp *l, const struct sys___fhstat50_args *uap, register_t *retval)
 2254 {
 2255         /* {
 2256                 syscallarg(const void *) fhp;
 2257                 syscallarg(size_t) fh_size;
 2258                 syscallarg(struct stat *) sb;
 2259         } */
 2260         struct stat sb;
 2261         int error;
 2262 
 2263         error = do_fhstat(l, SCARG(uap, fhp), SCARG(uap, fh_size), &sb);
 2264         if (error)
 2265                 return error;
 2266         return copyout(&sb, SCARG(uap, sb), sizeof(sb));
 2267 }
 2268 
 2269 int
 2270 do_fhstatvfs(struct lwp *l, const void *ufhp, size_t fhsize, struct statvfs *sb,
 2271     int flags)
 2272 {
 2273         fhandle_t *fh;
 2274         struct mount *mp;
 2275         struct vnode *vp;
 2276         int error;
 2277 
 2278         /*
 2279          * Must be super user
 2280          */
 2281         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FILEHANDLE,
 2282             0, NULL, NULL, NULL)))
 2283                 return error;
 2284 
 2285         error = vfs_copyinfh_alloc(ufhp, fhsize, &fh);
 2286         if (error != 0)
 2287                 return error;
 2288 
 2289         error = vfs_fhtovp(fh, &vp);
 2290         vfs_copyinfh_free(fh);
 2291         if (error != 0)
 2292                 return error;
 2293 
 2294         mp = vp->v_mount;
 2295         error = dostatvfs(mp, sb, l, flags, 1);
 2296         vput(vp);
 2297         return error;
 2298 }
 2299 
 2300 /* ARGSUSED */
 2301 int
 2302 sys___fhstatvfs190(struct lwp *l, const struct sys___fhstatvfs190_args *uap, register_t *retval)
 2303 {
 2304         /* {
 2305                 syscallarg(const void *) fhp;
 2306                 syscallarg(size_t) fh_size;
 2307                 syscallarg(struct statvfs *) buf;
 2308                 syscallarg(int) flags;
 2309         } */
 2310         struct statvfs *sb = STATVFSBUF_GET();
 2311         int error;
 2312 
 2313         error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size), sb,
 2314             SCARG(uap, flags));
 2315         if (error == 0)
 2316                 error = copyout(sb, SCARG(uap, buf), sizeof(*sb));
 2317         STATVFSBUF_PUT(sb);
 2318         return error;
 2319 }
 2320 
 2321 int
 2322 do_posix_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
 2323     dev_t dev)
 2324 {
 2325 
 2326         /*
 2327          * The POSIX mknod(2) call is an alias for mkfifo(2) for S_IFIFO
 2328          * in mode and dev=0.
 2329          *
 2330          * In all the other cases it's implementation defined behavior.
 2331          */
 2332 
 2333         if ((mode & S_IFIFO) && dev == 0)
 2334                 return do_sys_mkfifoat(l, fdat, pathname, mode);
 2335         else
 2336                 return do_sys_mknodat(l, fdat, pathname, mode, dev,
 2337                     UIO_USERSPACE);
 2338 }
 2339 
 2340 /*
 2341  * Create a special file.
 2342  */
 2343 /* ARGSUSED */
 2344 int
 2345 sys___mknod50(struct lwp *l, const struct sys___mknod50_args *uap,
 2346     register_t *retval)
 2347 {
 2348         /* {
 2349                 syscallarg(const char *) path;
 2350                 syscallarg(mode_t) mode;
 2351                 syscallarg(dev_t) dev;
 2352         } */
 2353         return do_posix_mknodat(l, AT_FDCWD, SCARG(uap, path),
 2354             SCARG(uap, mode), SCARG(uap, dev));
 2355 }
 2356 
 2357 int
 2358 sys_mknodat(struct lwp *l, const struct sys_mknodat_args *uap,
 2359     register_t *retval)
 2360 {
 2361         /* {
 2362                 syscallarg(int) fd;
 2363                 syscallarg(const char *) path;
 2364                 syscallarg(mode_t) mode;
 2365                 syscallarg(int) pad;
 2366                 syscallarg(dev_t) dev;
 2367         } */
 2368 
 2369         return do_posix_mknodat(l, SCARG(uap, fd), SCARG(uap, path),
 2370             SCARG(uap, mode), SCARG(uap, dev));
 2371 }
 2372 
 2373 int
 2374 do_sys_mknod(struct lwp *l, const char *pathname, mode_t mode, dev_t dev,
 2375     enum uio_seg seg)
 2376 {
 2377         return do_sys_mknodat(l, AT_FDCWD, pathname, mode, dev, seg);
 2378 }
 2379 
 2380 int
 2381 do_sys_mknodat(struct lwp *l, int fdat, const char *pathname, mode_t mode,
 2382     dev_t dev, enum uio_seg seg)
 2383 {
 2384         struct proc *p = l->l_proc;
 2385         struct vnode *vp;
 2386         struct vattr vattr;
 2387         int error, optype;
 2388         struct pathbuf *pb;
 2389         struct nameidata nd;
 2390         const char *pathstring;
 2391 
 2392         if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MKNOD,
 2393             0, NULL, NULL, NULL)) != 0)
 2394                 return (error);
 2395 
 2396         optype = VOP_MKNOD_DESCOFFSET;
 2397 
 2398         error = pathbuf_maybe_copyin(pathname, seg, &pb);
 2399         if (error) {
 2400                 return error;
 2401         }
 2402         pathstring = pathbuf_stringcopy_get(pb);
 2403         if (pathstring == NULL) {
 2404                 pathbuf_destroy(pb);
 2405                 return ENOMEM;
 2406         }
 2407 
 2408         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
 2409 
 2410         if ((error = fd_nameiat(l, fdat, &nd)) != 0)
 2411                 goto out;
 2412         vp = nd.ni_vp;
 2413 
 2414         if (vp != NULL)
 2415                 error = EEXIST;
 2416         else {
 2417                 vattr_null(&vattr);
 2418                 /* We will read cwdi->cwdi_cmask unlocked. */
 2419                 vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
 2420                 vattr.va_rdev = dev;
 2421 
 2422                 switch (mode & S_IFMT) {
 2423                 case S_IFMT:    /* used by badsect to flag bad sectors */
 2424                         vattr.va_type = VBAD;
 2425                         break;
 2426                 case S_IFCHR:
 2427                         vattr.va_type = VCHR;
 2428                         break;
 2429                 case S_IFBLK:
 2430                         vattr.va_type = VBLK;
 2431                         break;
 2432                 case S_IFWHT:
 2433                         optype = VOP_WHITEOUT_DESCOFFSET;
 2434                         break;
 2435                 case S_IFREG:
 2436 #if NVERIEXEC > 0
 2437                         error = veriexec_openchk(l, nd.ni_vp, pathstring,
 2438                             O_CREAT);
 2439 #endif /* NVERIEXEC > 0 */
 2440                         vattr.va_type = VREG;
 2441                         vattr.va_rdev = VNOVAL;
 2442                         optype = VOP_CREATE_DESCOFFSET;
 2443                         break;
 2444                 default:
 2445                         error = EINVAL;
 2446                         break;
 2447                 }
 2448 
 2449                 if (error == 0 && optype == VOP_MKNOD_DESCOFFSET &&
 2450                     vattr.va_rdev == VNOVAL)
 2451                         error = EINVAL;
 2452         }
 2453 
 2454         if (!error) {
 2455                 switch (optype) {
 2456                 case VOP_WHITEOUT_DESCOFFSET:
 2457                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 2458                         if (error)
 2459                                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2460                         vput(nd.ni_dvp);
 2461                         break;
 2462 
 2463                 case VOP_MKNOD_DESCOFFSET:
 2464                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 2465                                                 &nd.ni_cnd, &vattr);
 2466                         if (error == 0)
 2467                                 vrele(nd.ni_vp);
 2468                         vput(nd.ni_dvp);
 2469                         break;
 2470 
 2471                 case VOP_CREATE_DESCOFFSET:
 2472                         error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
 2473                                                 &nd.ni_cnd, &vattr);
 2474                         if (error == 0)
 2475                                 vrele(nd.ni_vp);
 2476                         vput(nd.ni_dvp);
 2477                         break;
 2478                 }
 2479         } else {
 2480                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2481                 if (nd.ni_dvp == vp)
 2482                         vrele(nd.ni_dvp);
 2483                 else
 2484                         vput(nd.ni_dvp);
 2485                 if (vp)
 2486                         vrele(vp);
 2487         }
 2488 out:
 2489         pathbuf_stringcopy_put(pb, pathstring);
 2490         pathbuf_destroy(pb);
 2491         return (error);
 2492 }
 2493 
 2494 /*
 2495  * Create a named pipe.
 2496  */
 2497 /* ARGSUSED */
 2498 int
 2499 sys_mkfifo(struct lwp *l, const struct sys_mkfifo_args *uap, register_t *retval)
 2500 {
 2501         /* {
 2502                 syscallarg(const char *) path;
 2503                 syscallarg(int) mode;
 2504         } */
 2505         return do_sys_mkfifoat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap, mode));
 2506 }
 2507 
 2508 int
 2509 sys_mkfifoat(struct lwp *l, const struct sys_mkfifoat_args *uap, 
 2510     register_t *retval)
 2511 {
 2512         /* {
 2513                 syscallarg(int) fd;
 2514                 syscallarg(const char *) path;
 2515                 syscallarg(int) mode;
 2516         } */
 2517 
 2518         return do_sys_mkfifoat(l, SCARG(uap, fd), SCARG(uap, path), 
 2519             SCARG(uap, mode));
 2520 }
 2521 
 2522 static int
 2523 do_sys_mkfifoat(struct lwp *l, int fdat, const char *path, mode_t mode)
 2524 {
 2525         struct proc *p = l->l_proc;
 2526         struct vattr vattr;
 2527         int error;
 2528         struct pathbuf *pb;
 2529         struct nameidata nd;
 2530 
 2531         error = pathbuf_copyin(path, &pb);
 2532         if (error) {
 2533                 return error;
 2534         }
 2535         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, pb);
 2536 
 2537         if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
 2538                 pathbuf_destroy(pb);
 2539                 return error;
 2540         }
 2541         if (nd.ni_vp != NULL) {
 2542                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2543                 if (nd.ni_dvp == nd.ni_vp)
 2544                         vrele(nd.ni_dvp);
 2545                 else
 2546                         vput(nd.ni_dvp);
 2547                 vrele(nd.ni_vp);
 2548                 pathbuf_destroy(pb);
 2549                 return (EEXIST);
 2550         }
 2551         vattr_null(&vattr);
 2552         vattr.va_type = VFIFO;
 2553         /* We will read cwdi->cwdi_cmask unlocked. */
 2554         vattr.va_mode = (mode & ALLPERMS) &~ p->p_cwdi->cwdi_cmask;
 2555         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 2556         if (error == 0)
 2557                 vrele(nd.ni_vp);
 2558         vput(nd.ni_dvp);
 2559         pathbuf_destroy(pb);
 2560         return (error);
 2561 }
 2562 
 2563 /*
 2564  * Make a hard file link.
 2565  */
 2566 /* ARGSUSED */
 2567 int
 2568 do_sys_linkat(struct lwp *l, int fdpath, const char *path, int fdlink,
 2569     const char *link, int follow, register_t *retval) 
 2570 {
 2571         struct vnode *vp;
 2572         struct pathbuf *linkpb;
 2573         struct nameidata nd;
 2574         namei_simple_flags_t ns_flags;
 2575         int error;
 2576 
 2577         if (follow & AT_SYMLINK_FOLLOW)
 2578                 ns_flags = NSM_FOLLOW_TRYEMULROOT;
 2579         else
 2580                 ns_flags = NSM_NOFOLLOW_TRYEMULROOT;
 2581 
 2582         error = fd_nameiat_simple_user(l, fdpath, path, ns_flags, &vp);
 2583         if (error != 0)
 2584                 return (error);
 2585         error = pathbuf_copyin(link, &linkpb);
 2586         if (error) {
 2587                 goto out1;
 2588         }
 2589         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
 2590         if ((error = fd_nameiat(l, fdlink, &nd)) != 0)
 2591                 goto out2;
 2592         if (nd.ni_vp) {
 2593                 error = EEXIST;
 2594                 goto abortop;
 2595         }
 2596         /* Prevent hard links on directories. */
 2597         if (vp->v_type == VDIR) {
 2598                 error = EPERM;
 2599                 goto abortop;
 2600         }
 2601         /* Prevent cross-mount operation. */
 2602         if (nd.ni_dvp->v_mount != vp->v_mount) {
 2603                 error = EXDEV;
 2604                 goto abortop;
 2605         }
 2606         error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 2607         VOP_UNLOCK(nd.ni_dvp);
 2608         vrele(nd.ni_dvp);
 2609 out2:
 2610         pathbuf_destroy(linkpb);
 2611 out1:
 2612         vrele(vp);
 2613         return (error);
 2614 abortop:
 2615         VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2616         if (nd.ni_dvp == nd.ni_vp)
 2617                 vrele(nd.ni_dvp);
 2618         else
 2619                 vput(nd.ni_dvp);
 2620         if (nd.ni_vp != NULL)
 2621                 vrele(nd.ni_vp);
 2622         goto out2;
 2623 }
 2624 
 2625 int
 2626 sys_link(struct lwp *l, const struct sys_link_args *uap, register_t *retval)
 2627 {
 2628         /* {
 2629                 syscallarg(const char *) path;
 2630                 syscallarg(const char *) link;
 2631         } */
 2632         const char *path = SCARG(uap, path);
 2633         const char *link = SCARG(uap, link);
 2634 
 2635         return do_sys_linkat(l, AT_FDCWD, path, AT_FDCWD, link,
 2636             AT_SYMLINK_FOLLOW, retval);
 2637 }
 2638 
 2639 int
 2640 sys_linkat(struct lwp *l, const struct sys_linkat_args *uap,
 2641     register_t *retval)
 2642 {
 2643         /* {
 2644                 syscallarg(int) fd1;
 2645                 syscallarg(const char *) name1;
 2646                 syscallarg(int) fd2;
 2647                 syscallarg(const char *) name2;
 2648                 syscallarg(int) flags;
 2649         } */
 2650         int fd1 = SCARG(uap, fd1);
 2651         const char *name1 = SCARG(uap, name1);
 2652         int fd2 = SCARG(uap, fd2);
 2653         const char *name2 = SCARG(uap, name2);
 2654         int follow;
 2655 
 2656         follow = SCARG(uap, flags) & AT_SYMLINK_FOLLOW;
 2657 
 2658         return do_sys_linkat(l, fd1, name1, fd2, name2, follow, retval);
 2659 }
 2660 
 2661 
 2662 int
 2663 do_sys_symlink(const char *patharg, const char *link, enum uio_seg seg)
 2664 {
 2665         return do_sys_symlinkat(NULL, patharg, AT_FDCWD, link, seg);
 2666 }
 2667 
 2668 static int
 2669 do_sys_symlinkat(struct lwp *l, const char *patharg, int fdat,
 2670     const char *link, enum uio_seg seg)
 2671 {
 2672         struct proc *p = curproc;
 2673         struct vattr vattr;
 2674         char *path;
 2675         int error;
 2676         size_t len;
 2677         struct pathbuf *linkpb;
 2678         struct nameidata nd;
 2679 
 2680         KASSERT(l != NULL || fdat == AT_FDCWD);
 2681 
 2682         path = PNBUF_GET();
 2683         if (seg == UIO_USERSPACE) {
 2684                 if ((error = copyinstr(patharg, path, MAXPATHLEN, &len)) != 0)
 2685                         goto out1;
 2686                 if ((error = pathbuf_copyin(link, &linkpb)) != 0)
 2687                         goto out1;
 2688         } else {
 2689                 len = strlen(patharg) + 1;
 2690                 KASSERT(len <= MAXPATHLEN);
 2691                 memcpy(path, patharg, len);
 2692                 linkpb = pathbuf_create(link);
 2693                 if (linkpb == NULL) {
 2694                         error = ENOMEM;
 2695                         goto out1;
 2696                 }
 2697         }
 2698         ktrkuser("symlink-target", path, len - 1);
 2699 
 2700         NDINIT(&nd, CREATE, LOCKPARENT | TRYEMULROOT, linkpb);
 2701         if ((error = fd_nameiat(l, fdat, &nd)) != 0)
 2702                 goto out2;
 2703         if (nd.ni_vp) {
 2704                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2705                 if (nd.ni_dvp == nd.ni_vp)
 2706                         vrele(nd.ni_dvp);
 2707                 else
 2708                         vput(nd.ni_dvp);
 2709                 vrele(nd.ni_vp);
 2710                 error = EEXIST;
 2711                 goto out2;
 2712         }
 2713         vattr_null(&vattr);
 2714         vattr.va_type = VLNK;
 2715         /* We will read cwdi->cwdi_cmask unlocked. */
 2716         vattr.va_mode = ACCESSPERMS &~ p->p_cwdi->cwdi_cmask;
 2717         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
 2718         if (error == 0)
 2719                 vrele(nd.ni_vp);
 2720         vput(nd.ni_dvp);
 2721 out2:
 2722         pathbuf_destroy(linkpb);
 2723 out1:
 2724         PNBUF_PUT(path);
 2725         return (error);
 2726 }
 2727 
 2728 /*
 2729  * Make a symbolic link.
 2730  */
 2731 /* ARGSUSED */
 2732 int
 2733 sys_symlink(struct lwp *l, const struct sys_symlink_args *uap, register_t *retval)
 2734 {
 2735         /* {
 2736                 syscallarg(const char *) path;
 2737                 syscallarg(const char *) link;
 2738         } */
 2739 
 2740         return do_sys_symlinkat(l, SCARG(uap, path), AT_FDCWD, SCARG(uap, link),
 2741             UIO_USERSPACE);
 2742 }
 2743 
 2744 int
 2745 sys_symlinkat(struct lwp *l, const struct sys_symlinkat_args *uap, 
 2746     register_t *retval)
 2747 {
 2748         /* {
 2749                 syscallarg(const char *) path1;
 2750                 syscallarg(int) fd;
 2751                 syscallarg(const char *) path2;
 2752         } */
 2753 
 2754         return do_sys_symlinkat(l, SCARG(uap, path1), SCARG(uap, fd),
 2755             SCARG(uap, path2), UIO_USERSPACE);
 2756 }
 2757 
 2758 /*
 2759  * Delete a whiteout from the filesystem.
 2760  */
 2761 /* ARGSUSED */
 2762 int
 2763 sys_undelete(struct lwp *l, const struct sys_undelete_args *uap, register_t *retval)
 2764 {
 2765         /* {
 2766                 syscallarg(const char *) path;
 2767         } */
 2768         int error;
 2769         struct pathbuf *pb;
 2770         struct nameidata nd;
 2771 
 2772         error = pathbuf_copyin(SCARG(uap, path), &pb);
 2773         if (error) {
 2774                 return error;
 2775         }
 2776 
 2777         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | TRYEMULROOT, pb);
 2778         error = namei(&nd);
 2779         if (error) {
 2780                 pathbuf_destroy(pb);
 2781                 return (error);
 2782         }
 2783 
 2784         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 2785                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2786                 if (nd.ni_dvp == nd.ni_vp)
 2787                         vrele(nd.ni_dvp);
 2788                 else
 2789                         vput(nd.ni_dvp);
 2790                 if (nd.ni_vp)
 2791                         vrele(nd.ni_vp);
 2792                 pathbuf_destroy(pb);
 2793                 return (EEXIST);
 2794         }
 2795         if ((error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) != 0)
 2796                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2797         vput(nd.ni_dvp);
 2798         pathbuf_destroy(pb);
 2799         return (error);
 2800 }
 2801 
 2802 /*
 2803  * Delete a name from the filesystem.
 2804  */
 2805 /* ARGSUSED */
 2806 int
 2807 sys_unlink(struct lwp *l, const struct sys_unlink_args *uap, register_t *retval)
 2808 {
 2809         /* {
 2810                 syscallarg(const char *) path;
 2811         } */
 2812 
 2813         return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path), 0, UIO_USERSPACE);
 2814 }
 2815 
 2816 int
 2817 sys_unlinkat(struct lwp *l, const struct sys_unlinkat_args *uap,
 2818     register_t *retval)
 2819 {
 2820         /* {
 2821                 syscallarg(int) fd;
 2822                 syscallarg(const char *) path;
 2823                 syscallarg(int) flag;
 2824         } */
 2825 
 2826         return do_sys_unlinkat(l, SCARG(uap, fd), SCARG(uap, path),
 2827             SCARG(uap, flag), UIO_USERSPACE);
 2828 }
 2829 
 2830 int
 2831 do_sys_unlink(const char *arg, enum uio_seg seg)
 2832 {
 2833         return do_sys_unlinkat(NULL, AT_FDCWD, arg, 0, seg);
 2834 }
 2835 
 2836 static int
 2837 do_sys_unlinkat(struct lwp *l, int fdat, const char *arg, int flags,
 2838     enum uio_seg seg)
 2839 {
 2840         struct vnode *vp;
 2841         int error;
 2842         struct pathbuf *pb;
 2843         struct nameidata nd;
 2844         const char *pathstring;
 2845 
 2846         KASSERT(l != NULL || fdat == AT_FDCWD);
 2847 
 2848         error = pathbuf_maybe_copyin(arg, seg, &pb);
 2849         if (error) {
 2850                 return error;
 2851         }
 2852         pathstring = pathbuf_stringcopy_get(pb);
 2853         if (pathstring == NULL) {
 2854                 pathbuf_destroy(pb);
 2855                 return ENOMEM;
 2856         }
 2857 
 2858         NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | TRYEMULROOT, pb);
 2859         if ((error = fd_nameiat(l, fdat, &nd)) != 0)
 2860                 goto out;
 2861         vp = nd.ni_vp;
 2862 
 2863         /*
 2864          * The root of a mounted filesystem cannot be deleted.
 2865          */
 2866         if ((vp->v_vflag & VV_ROOT) != 0) {
 2867                 error = EBUSY;
 2868                 goto abort;
 2869         }
 2870 
 2871         if ((vp->v_type == VDIR) && (vp->v_mountedhere != NULL)) {
 2872                 error = EBUSY;
 2873                 goto abort;
 2874         }
 2875 
 2876         /*
 2877          * No rmdir "." please.
 2878          */
 2879         if (nd.ni_dvp == vp) {
 2880                 error = EINVAL;
 2881                 goto abort;
 2882         }
 2883 
 2884         /*
 2885          * AT_REMOVEDIR is required to remove a directory
 2886          */
 2887         if (vp->v_type == VDIR) {
 2888                 if (!(flags & AT_REMOVEDIR)) {
 2889                         error = EPERM;
 2890                         goto abort;
 2891                 } else {
 2892                         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 2893                         vput(nd.ni_dvp);
 2894                         goto out;
 2895                 }
 2896         }
 2897 
 2898         /*
 2899          * Starting here we only deal with non directories.
 2900          */
 2901         if (flags & AT_REMOVEDIR) {
 2902                 error = ENOTDIR;
 2903                 goto abort;
 2904         }
 2905 
 2906 #if NVERIEXEC > 0
 2907         /* Handle remove requests for veriexec entries. */
 2908         if ((error = veriexec_removechk(curlwp, nd.ni_vp, pathstring)) != 0) {
 2909                 goto abort;
 2910         }
 2911 #endif /* NVERIEXEC > 0 */
 2912         
 2913 #ifdef FILEASSOC
 2914         (void)fileassoc_file_delete(vp);
 2915 #endif /* FILEASSOC */
 2916         error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 2917         vput(nd.ni_dvp);
 2918         goto out;
 2919 
 2920 abort:
 2921         VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 2922         if (nd.ni_dvp == vp)
 2923                 vrele(nd.ni_dvp);
 2924         else
 2925                 vput(nd.ni_dvp);
 2926         vput(vp);
 2927 
 2928 out:
 2929         pathbuf_stringcopy_put(pb, pathstring);
 2930         pathbuf_destroy(pb);
 2931         return (error);
 2932 }
 2933 
 2934 /*
 2935  * Reposition read/write file offset.
 2936  */
 2937 int
 2938 sys_lseek(struct lwp *l, const struct sys_lseek_args *uap, register_t *retval)
 2939 {
 2940         /* {
 2941                 syscallarg(int) fd;
 2942                 syscallarg(int) pad;
 2943                 syscallarg(off_t) offset;
 2944                 syscallarg(int) whence;
 2945         } */
 2946         file_t *fp;
 2947         int error, fd;
 2948 
 2949         switch (SCARG(uap, whence)) {
 2950         case SEEK_CUR:
 2951         case SEEK_END:
 2952         case SEEK_SET:
 2953                 break;
 2954         default:
 2955                 return EINVAL;
 2956         }
 2957 
 2958         fd = SCARG(uap, fd);
 2959 
 2960         if ((fp = fd_getfile(fd)) == NULL)
 2961                 return (EBADF);
 2962 
 2963         if (fp->f_ops->fo_seek == NULL) {
 2964                 error = ESPIPE;
 2965                 goto out;
 2966         }
 2967 
 2968         error = (*fp->f_ops->fo_seek)(fp, SCARG(uap, offset),
 2969             SCARG(uap, whence), (off_t *)retval, FOF_UPDATE_OFFSET);
 2970  out:
 2971         fd_putfile(fd);
 2972         return (error);
 2973 }
 2974 
 2975 /*
 2976  * Positional read system call.
 2977  */
 2978 int
 2979 sys_pread(struct lwp *l, const struct sys_pread_args *uap, register_t *retval)
 2980 {
 2981         /* {
 2982                 syscallarg(int) fd;
 2983                 syscallarg(void *) buf;
 2984                 syscallarg(size_t) nbyte;
 2985                 syscallarg(off_t) offset;
 2986         } */
 2987         file_t *fp;
 2988         off_t offset;
 2989         int error, fd = SCARG(uap, fd);
 2990 
 2991         if ((fp = fd_getfile(fd)) == NULL)
 2992                 return (EBADF);
 2993 
 2994         if ((fp->f_flag & FREAD) == 0) {
 2995                 fd_putfile(fd);
 2996                 return (EBADF);
 2997         }
 2998 
 2999         if (fp->f_ops->fo_seek == NULL) {
 3000                 error = ESPIPE;
 3001                 goto out;
 3002         }
 3003 
 3004         offset = SCARG(uap, offset);
 3005         error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
 3006         if (error)
 3007                 goto out;
 3008 
 3009         /* dofileread() will unuse the descriptor for us */
 3010         return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
 3011             &offset, 0, retval));
 3012 
 3013  out:
 3014         fd_putfile(fd);
 3015         return (error);
 3016 }
 3017 
 3018 /*
 3019  * Positional scatter read system call.
 3020  */
 3021 int
 3022 sys_preadv(struct lwp *l, const struct sys_preadv_args *uap, register_t *retval)
 3023 {
 3024         /* {
 3025                 syscallarg(int) fd;
 3026                 syscallarg(const struct iovec *) iovp;
 3027                 syscallarg(int) iovcnt;
 3028                 syscallarg(off_t) offset;
 3029         } */
 3030         off_t offset = SCARG(uap, offset);
 3031 
 3032         return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
 3033             SCARG(uap, iovcnt), &offset, 0, retval);
 3034 }
 3035 
 3036 /*
 3037  * Positional write system call.
 3038  */
 3039 int
 3040 sys_pwrite(struct lwp *l, const struct sys_pwrite_args *uap, register_t *retval)
 3041 {
 3042         /* {
 3043                 syscallarg(int) fd;
 3044                 syscallarg(const void *) buf;
 3045                 syscallarg(size_t) nbyte;
 3046                 syscallarg(off_t) offset;
 3047         } */
 3048         file_t *fp;
 3049         off_t offset;
 3050         int error, fd = SCARG(uap, fd);
 3051 
 3052         if ((fp = fd_getfile(fd)) == NULL)
 3053                 return (EBADF);
 3054 
 3055         if ((fp->f_flag & FWRITE) == 0) {
 3056                 fd_putfile(fd);
 3057                 return (EBADF);
 3058         }
 3059 
 3060         if (fp->f_ops->fo_seek == NULL) {
 3061                 error = ESPIPE;
 3062                 goto out;
 3063         }
 3064 
 3065         offset = SCARG(uap, offset);
 3066         error = (*fp->f_ops->fo_seek)(fp, offset, SEEK_SET, &offset, 0);
 3067         if (error)
 3068                 goto out;
 3069 
 3070         /* dofilewrite() will unuse the descriptor for us */
 3071         return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
 3072             &offset, 0, retval));
 3073 
 3074  out:
 3075         fd_putfile(fd);
 3076         return (error);
 3077 }
 3078 
 3079 /*
 3080  * Positional gather write system call.
 3081  */
 3082 int
 3083 sys_pwritev(struct lwp *l, const struct sys_pwritev_args *uap, register_t *retval)
 3084 {
 3085         /* {
 3086                 syscallarg(int) fd;
 3087                 syscallarg(const struct iovec *) iovp;
 3088                 syscallarg(int) iovcnt;
 3089                 syscallarg(off_t) offset;
 3090         } */
 3091         off_t offset = SCARG(uap, offset);
 3092 
 3093         return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
 3094             SCARG(uap, iovcnt), &offset, 0, retval);
 3095 }
 3096 
 3097 /*
 3098  * Check access permissions.
 3099  */
 3100 int
 3101 sys_access(struct lwp *l, const struct sys_access_args *uap, register_t *retval)
 3102 {
 3103         /* {
 3104                 syscallarg(const char *) path;
 3105                 syscallarg(int) flags;
 3106         } */
 3107 
 3108         return do_sys_accessat(l, AT_FDCWD, SCARG(uap, path),
 3109              SCARG(uap, flags), 0);
 3110 }
 3111 
 3112 int
 3113 do_sys_accessat(struct lwp *l, int fdat, const char *path,
 3114     int mode, int flags)
 3115 {
 3116         kauth_cred_t cred;
 3117         struct vnode *vp;
 3118         int error, nd_flag, vmode;
 3119         struct pathbuf *pb;
 3120         struct nameidata nd;
 3121 
 3122         CTASSERT(F_OK == 0);
 3123         if ((mode & ~(R_OK | W_OK | X_OK)) != 0) {
 3124                 /* nonsense mode */
 3125                 return EINVAL;
 3126         }
 3127 
 3128         nd_flag = FOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT;
 3129         if (flags & AT_SYMLINK_NOFOLLOW)
 3130                 nd_flag &= ~FOLLOW;
 3131 
 3132         error = pathbuf_copyin(path, &pb);
 3133         if (error) 
 3134                 return error;
 3135 
 3136         NDINIT(&nd, LOOKUP, nd_flag, pb);
 3137 
 3138         /* Override default credentials */
 3139         cred = kauth_cred_dup(l->l_cred);
 3140         if (!(flags & AT_EACCESS)) {
 3141                 kauth_cred_seteuid(cred, kauth_cred_getuid(l->l_cred));
 3142                 kauth_cred_setegid(cred, kauth_cred_getgid(l->l_cred));
 3143         }
 3144         nd.ni_cnd.cn_cred = cred;
 3145 
 3146         if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
 3147                 pathbuf_destroy(pb);
 3148                 goto out;
 3149         }
 3150         vp = nd.ni_vp;
 3151         pathbuf_destroy(pb);
 3152 
 3153         /* Flags == 0 means only check for existence. */
 3154         if (mode) {
 3155                 vmode = 0;
 3156                 if (mode & R_OK)
 3157                         vmode |= VREAD;
 3158                 if (mode & W_OK)
 3159                         vmode |= VWRITE;
 3160                 if (mode & X_OK)
 3161                         vmode |= VEXEC;
 3162 
 3163                 error = VOP_ACCESS(vp, vmode, cred);
 3164                 if (!error && (vmode & VWRITE))
 3165                         error = vn_writechk(vp);
 3166         }
 3167         vput(vp);
 3168 out:
 3169         kauth_cred_free(cred);
 3170         return (error);
 3171 }
 3172 
 3173 int
 3174 sys_faccessat(struct lwp *l, const struct sys_faccessat_args *uap,
 3175     register_t *retval)
 3176 {
 3177         /* {
 3178                 syscallarg(int) fd;
 3179                 syscallarg(const char *) path;
 3180                 syscallarg(int) amode;
 3181                 syscallarg(int) flag;
 3182         } */
 3183 
 3184         return do_sys_accessat(l, SCARG(uap, fd), SCARG(uap, path),
 3185              SCARG(uap, amode), SCARG(uap, flag));
 3186 }
 3187 
 3188 /*
 3189  * Common code for all sys_stat functions, including compat versions.
 3190  */
 3191 int
 3192 do_sys_stat(const char *userpath, unsigned int nd_flag,
 3193     struct stat *sb)
 3194 {
 3195         return do_sys_statat(NULL, AT_FDCWD, userpath, nd_flag, sb);
 3196 }
 3197 
 3198 int
 3199 do_sys_statat(struct lwp *l, int fdat, const char *userpath,
 3200     unsigned int nd_flag, struct stat *sb) 
 3201 {
 3202         int error;
 3203         struct pathbuf *pb;
 3204         struct nameidata nd;
 3205 
 3206         KASSERT(l != NULL || fdat == AT_FDCWD);
 3207 
 3208         error = pathbuf_copyin(userpath, &pb);
 3209         if (error) {
 3210                 return error;
 3211         }
 3212 
 3213         NDINIT(&nd, LOOKUP, nd_flag | LOCKLEAF | TRYEMULROOT, pb);
 3214 
 3215         error = fd_nameiat(l, fdat, &nd);
 3216         if (error != 0) {
 3217                 pathbuf_destroy(pb);
 3218                 return error;
 3219         }
 3220         error = vn_stat(nd.ni_vp, sb);
 3221         vput(nd.ni_vp);
 3222         pathbuf_destroy(pb);
 3223         return error;
 3224 }
 3225 
 3226 /*
 3227  * Get file status; this version follows links.
 3228  */
 3229 /* ARGSUSED */
 3230 int
 3231 sys___stat50(struct lwp *l, const struct sys___stat50_args *uap, register_t *retval)
 3232 {
 3233         /* {
 3234                 syscallarg(const char *) path;
 3235                 syscallarg(struct stat *) ub;
 3236         } */
 3237         struct stat sb;
 3238         int error;
 3239 
 3240         error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), FOLLOW, &sb);
 3241         if (error)
 3242                 return error;
 3243         return copyout(&sb, SCARG(uap, ub), sizeof(sb));
 3244 }
 3245 
 3246 /*
 3247  * Get file status; this version does not follow links.
 3248  */
 3249 /* ARGSUSED */
 3250 int
 3251 sys___lstat50(struct lwp *l, const struct sys___lstat50_args *uap, register_t *retval)
 3252 {
 3253         /* {
 3254                 syscallarg(const char *) path;
 3255                 syscallarg(struct stat *) ub;
 3256         } */
 3257         struct stat sb;
 3258         int error;
 3259 
 3260         error = do_sys_statat(l, AT_FDCWD, SCARG(uap, path), NOFOLLOW, &sb);
 3261         if (error)
 3262                 return error;
 3263         return copyout(&sb, SCARG(uap, ub), sizeof(sb));
 3264 }
 3265 
 3266 int
 3267 sys_fstatat(struct lwp *l, const struct sys_fstatat_args *uap,
 3268     register_t *retval)
 3269 {
 3270         /* {
 3271                 syscallarg(int) fd;
 3272                 syscallarg(const char *) path;
 3273                 syscallarg(struct stat *) buf;
 3274                 syscallarg(int) flag;
 3275         } */
 3276         unsigned int nd_flag;
 3277         struct stat sb;
 3278         int error;
 3279 
 3280         if (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW)
 3281                 nd_flag = NOFOLLOW;
 3282         else
 3283                 nd_flag = FOLLOW;
 3284 
 3285         error = do_sys_statat(l, SCARG(uap, fd), SCARG(uap, path), nd_flag, 
 3286             &sb);
 3287         if (error)
 3288                 return error;
 3289         return copyout(&sb, SCARG(uap, buf), sizeof(sb));
 3290 }
 3291 
 3292 static int
 3293 kern_pathconf(register_t *retval, const char *path, int name, int flag)
 3294 {
 3295         int error;
 3296         struct pathbuf *pb;
 3297         struct nameidata nd;
 3298 
 3299         error = pathbuf_copyin(path, &pb);
 3300         if (error) {
 3301                 return error;
 3302         }
 3303         NDINIT(&nd, LOOKUP, flag | LOCKLEAF | TRYEMULROOT, pb);
 3304         if ((error = namei(&nd)) != 0) {
 3305                 pathbuf_destroy(pb);
 3306                 return error;
 3307         }
 3308         error = VOP_PATHCONF(nd.ni_vp, name, retval);
 3309         vput(nd.ni_vp);
 3310         pathbuf_destroy(pb);
 3311         return error;
 3312 }
 3313 
 3314 /*
 3315  * Get configurable pathname variables.
 3316  */
 3317 /* ARGSUSED */
 3318 int
 3319 sys_pathconf(struct lwp *l, const struct sys_pathconf_args *uap,
 3320     register_t *retval)
 3321 {
 3322         /* {
 3323                 syscallarg(const char *) path;
 3324                 syscallarg(int) name;
 3325         } */
 3326         return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name), 
 3327             FOLLOW);
 3328 }
 3329 
 3330 /* ARGSUSED */
 3331 int
 3332 sys_lpathconf(struct lwp *l, const struct sys_lpathconf_args *uap,
 3333     register_t *retval)
 3334 {
 3335         /* {
 3336                 syscallarg(const char *) path;
 3337                 syscallarg(int) name;
 3338         } */
 3339         return kern_pathconf(retval, SCARG(uap, path), SCARG(uap, name), 
 3340             NOFOLLOW);
 3341 }
 3342 
 3343 /*
 3344  * Return target name of a symbolic link.
 3345  */
 3346 /* ARGSUSED */
 3347 int
 3348 sys_readlink(struct lwp *l, const struct sys_readlink_args *uap,
 3349     register_t *retval)
 3350 {
 3351         /* {
 3352                 syscallarg(const char *) path;
 3353                 syscallarg(char *) buf;
 3354                 syscallarg(size_t) count;
 3355         } */
 3356         return do_sys_readlinkat(l, AT_FDCWD, SCARG(uap, path),
 3357             SCARG(uap, buf), SCARG(uap, count), retval);
 3358 }
 3359 
 3360 static int
 3361 do_sys_readlinkat(struct lwp *l, int fdat, const char *path, char *buf,
 3362     size_t count, register_t *retval)
 3363 {
 3364         struct vnode *vp;
 3365         struct iovec aiov;
 3366         struct uio auio;
 3367         int error;
 3368         struct pathbuf *pb;
 3369         struct nameidata nd;
 3370 
 3371         error = pathbuf_copyin(path, &pb);
 3372         if (error) {
 3373                 return error;
 3374         }
 3375         NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | TRYEMULROOT, pb);
 3376         if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
 3377                 pathbuf_destroy(pb);
 3378                 return error;
 3379         }
 3380         vp = nd.ni_vp;
 3381         pathbuf_destroy(pb);
 3382         if (vp->v_type != VLNK)
 3383                 error = EINVAL;
 3384         else if (!(vp->v_mount->mnt_flag & MNT_SYMPERM) ||
 3385             (error = VOP_ACCESS(vp, VREAD, l->l_cred)) == 0) {
 3386                 aiov.iov_base = buf;
 3387                 aiov.iov_len = count;
 3388                 auio.uio_iov = &aiov;
 3389                 auio.uio_iovcnt = 1;
 3390                 auio.uio_offset = 0;
 3391                 auio.uio_rw = UIO_READ;
 3392                 KASSERT(l == curlwp);
 3393                 auio.uio_vmspace = l->l_proc->p_vmspace;
 3394                 auio.uio_resid = count;
 3395                 if ((error = VOP_READLINK(vp, &auio, l->l_cred)) == 0)
 3396                         *retval = count - auio.uio_resid;
 3397         }
 3398         vput(vp);
 3399         return (error);
 3400 }
 3401 
 3402 int
 3403 sys_readlinkat(struct lwp *l, const struct sys_readlinkat_args *uap,
 3404     register_t *retval)
 3405 {
 3406         /* {
 3407                 syscallarg(int) fd;
 3408                 syscallarg(const char *) path;
 3409                 syscallarg(char *) buf;
 3410                 syscallarg(size_t) bufsize;
 3411         } */
 3412 
 3413         return do_sys_readlinkat(l, SCARG(uap, fd), SCARG(uap, path),
 3414             SCARG(uap, buf), SCARG(uap, bufsize), retval);
 3415 }
 3416 
 3417 /*
 3418  * Change flags of a file given a path name.
 3419  */
 3420 /* ARGSUSED */
 3421 int
 3422 sys_chflags(struct lwp *l, const struct sys_chflags_args *uap, register_t *retval)
 3423 {
 3424         /* {
 3425                 syscallarg(const char *) path;
 3426                 syscallarg(u_long) flags;
 3427         } */
 3428         struct vnode *vp;
 3429         int error;
 3430 
 3431         error = namei_simple_user(SCARG(uap, path),
 3432                                 NSM_FOLLOW_TRYEMULROOT, &vp);
 3433         if (error != 0)
 3434                 return (error);
 3435         error = change_flags(vp, SCARG(uap, flags), l);
 3436         vput(vp);
 3437         return (error);
 3438 }
 3439 
 3440 /*
 3441  * Change flags of a file given a file descriptor.
 3442  */
 3443 /* ARGSUSED */
 3444 int
 3445 sys_fchflags(struct lwp *l, const struct sys_fchflags_args *uap, register_t *retval)
 3446 {
 3447         /* {
 3448                 syscallarg(int) fd;
 3449                 syscallarg(u_long) flags;
 3450         } */
 3451         struct vnode *vp;
 3452         file_t *fp;
 3453         int error;
 3454 
 3455         /* fd_getvnode() will use the descriptor for us */
 3456         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3457                 return (error);
 3458         vp = fp->f_vnode;
 3459         error = change_flags(vp, SCARG(uap, flags), l);
 3460         VOP_UNLOCK(vp);
 3461         fd_putfile(SCARG(uap, fd));
 3462         return (error);
 3463 }
 3464 
 3465 /*
 3466  * Change flags of a file given a path name; this version does
 3467  * not follow links.
 3468  */
 3469 int
 3470 sys_lchflags(struct lwp *l, const struct sys_lchflags_args *uap, register_t *retval)
 3471 {
 3472         /* {
 3473                 syscallarg(const char *) path;
 3474                 syscallarg(u_long) flags;
 3475         } */
 3476         struct vnode *vp;
 3477         int error;
 3478 
 3479         error = namei_simple_user(SCARG(uap, path),
 3480                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
 3481         if (error != 0)
 3482                 return (error);
 3483         error = change_flags(vp, SCARG(uap, flags), l);
 3484         vput(vp);
 3485         return (error);
 3486 }
 3487 
 3488 /*
 3489  * Common routine to change flags of a file.
 3490  */
 3491 int
 3492 change_flags(struct vnode *vp, u_long flags, struct lwp *l)
 3493 {
 3494         struct vattr vattr;
 3495         int error;
 3496 
 3497         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 3498 
 3499         vattr_null(&vattr);
 3500         vattr.va_flags = flags;
 3501         error = VOP_SETATTR(vp, &vattr, l->l_cred);
 3502 
 3503         return (error);
 3504 }
 3505 
 3506 /*
 3507  * Change mode of a file given path name; this version follows links.
 3508  */
 3509 /* ARGSUSED */
 3510 int
 3511 sys_chmod(struct lwp *l, const struct sys_chmod_args *uap, register_t *retval)
 3512 {
 3513         /* {
 3514                 syscallarg(const char *) path;
 3515                 syscallarg(int) mode;
 3516         } */
 3517         return do_sys_chmodat(l, AT_FDCWD, SCARG(uap, path),
 3518                               SCARG(uap, mode), 0);
 3519 }
 3520 
 3521 int
 3522 do_sys_chmodat(struct lwp *l, int fdat, const char *path, int mode, int flags)
 3523 {
 3524         int error;
 3525         struct vnode *vp;
 3526         namei_simple_flags_t ns_flag;
 3527 
 3528         if (flags & AT_SYMLINK_NOFOLLOW)
 3529                 ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
 3530         else
 3531                 ns_flag = NSM_FOLLOW_TRYEMULROOT;
 3532 
 3533         error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
 3534         if (error != 0)
 3535                 return error;
 3536 
 3537         error = change_mode(vp, mode, l);
 3538 
 3539         vrele(vp);
 3540 
 3541         return (error);
 3542 }
 3543 
 3544 /*
 3545  * Change mode of a file given a file descriptor.
 3546  */
 3547 /* ARGSUSED */
 3548 int
 3549 sys_fchmod(struct lwp *l, const struct sys_fchmod_args *uap, register_t *retval)
 3550 {
 3551         /* {
 3552                 syscallarg(int) fd;
 3553                 syscallarg(int) mode;
 3554         } */
 3555         file_t *fp;
 3556         int error;
 3557 
 3558         /* fd_getvnode() will use the descriptor for us */
 3559         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3560                 return (error);
 3561         error = change_mode(fp->f_vnode, SCARG(uap, mode), l);
 3562         fd_putfile(SCARG(uap, fd));
 3563         return (error);
 3564 }
 3565 
 3566 int
 3567 sys_fchmodat(struct lwp *l, const struct sys_fchmodat_args *uap,
 3568     register_t *retval)
 3569 {
 3570         /* {
 3571                 syscallarg(int) fd;
 3572                 syscallarg(const char *) path;
 3573                 syscallarg(int) mode;
 3574                 syscallarg(int) flag;
 3575         } */
 3576 
 3577         return do_sys_chmodat(l, SCARG(uap, fd), SCARG(uap, path),
 3578                               SCARG(uap, mode), SCARG(uap, flag));
 3579 }
 3580 
 3581 /*
 3582  * Change mode of a file given path name; this version does not follow links.
 3583  */
 3584 /* ARGSUSED */
 3585 int
 3586 sys_lchmod(struct lwp *l, const struct sys_lchmod_args *uap, register_t *retval)
 3587 {
 3588         /* {
 3589                 syscallarg(const char *) path;
 3590                 syscallarg(int) mode;
 3591         } */
 3592         int error;
 3593         struct vnode *vp;
 3594 
 3595         error = namei_simple_user(SCARG(uap, path),
 3596                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
 3597         if (error != 0)
 3598                 return (error);
 3599 
 3600         error = change_mode(vp, SCARG(uap, mode), l);
 3601 
 3602         vrele(vp);
 3603         return (error);
 3604 }
 3605 
 3606 /*
 3607  * Common routine to set mode given a vnode.
 3608  */
 3609 static int
 3610 change_mode(struct vnode *vp, int mode, struct lwp *l)
 3611 {
 3612         struct vattr vattr;
 3613         int error;
 3614 
 3615         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 3616         vattr_null(&vattr);
 3617         vattr.va_mode = mode & ALLPERMS;
 3618         error = VOP_SETATTR(vp, &vattr, l->l_cred);
 3619         VOP_UNLOCK(vp);
 3620         return (error);
 3621 }
 3622 
 3623 /*
 3624  * Set ownership given a path name; this version follows links.
 3625  */
 3626 /* ARGSUSED */
 3627 int
 3628 sys_chown(struct lwp *l, const struct sys_chown_args *uap, register_t *retval)
 3629 {
 3630         /* {
 3631                 syscallarg(const char *) path;
 3632                 syscallarg(uid_t) uid;
 3633                 syscallarg(gid_t) gid;
 3634         } */
 3635         return do_sys_chownat(l, AT_FDCWD, SCARG(uap, path), SCARG(uap,uid),
 3636                               SCARG(uap, gid), 0);
 3637 }
 3638 
 3639 int
 3640 do_sys_chownat(struct lwp *l, int fdat, const char *path, uid_t uid,
 3641    gid_t gid, int flags)
 3642 {
 3643         int error;
 3644         struct vnode *vp;
 3645         namei_simple_flags_t ns_flag;
 3646 
 3647         if (flags & AT_SYMLINK_NOFOLLOW)
 3648                 ns_flag = NSM_NOFOLLOW_TRYEMULROOT;
 3649         else
 3650                 ns_flag = NSM_FOLLOW_TRYEMULROOT;
 3651 
 3652         error = fd_nameiat_simple_user(l, fdat, path, ns_flag, &vp);
 3653         if (error != 0)
 3654                 return error;
 3655 
 3656         error = change_owner(vp, uid, gid, l, 0);
 3657 
 3658         vrele(vp);
 3659 
 3660         return (error);
 3661 }
 3662 
 3663 /*
 3664  * Set ownership given a path name; this version follows links.
 3665  * Provides POSIX semantics.
 3666  */
 3667 /* ARGSUSED */
 3668 int
 3669 sys___posix_chown(struct lwp *l, const struct sys___posix_chown_args *uap, register_t *retval)
 3670 {
 3671         /* {
 3672                 syscallarg(const char *) path;
 3673                 syscallarg(uid_t) uid;
 3674                 syscallarg(gid_t) gid;
 3675         } */
 3676         int error;
 3677         struct vnode *vp;
 3678 
 3679         error = namei_simple_user(SCARG(uap, path),
 3680                                 NSM_FOLLOW_TRYEMULROOT, &vp);
 3681         if (error != 0)
 3682                 return (error);
 3683 
 3684         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
 3685 
 3686         vrele(vp);
 3687         return (error);
 3688 }
 3689 
 3690 /*
 3691  * Set ownership given a file descriptor.
 3692  */
 3693 /* ARGSUSED */
 3694 int
 3695 sys_fchown(struct lwp *l, const struct sys_fchown_args *uap, register_t *retval)
 3696 {
 3697         /* {
 3698                 syscallarg(int) fd;
 3699                 syscallarg(uid_t) uid;
 3700                 syscallarg(gid_t) gid;
 3701         } */
 3702         int error;
 3703         file_t *fp;
 3704 
 3705         /* fd_getvnode() will use the descriptor for us */
 3706         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3707                 return (error);
 3708         error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
 3709             l, 0);
 3710         fd_putfile(SCARG(uap, fd));
 3711         return (error);
 3712 }
 3713 
 3714 int
 3715 sys_fchownat(struct lwp *l, const struct sys_fchownat_args *uap,
 3716     register_t *retval)
 3717 {
 3718         /* {
 3719                 syscallarg(int) fd;
 3720                 syscallarg(const char *) path;
 3721                 syscallarg(uid_t) owner;
 3722                 syscallarg(gid_t) group;
 3723                 syscallarg(int) flag;
 3724         } */
 3725 
 3726         return do_sys_chownat(l, SCARG(uap, fd), SCARG(uap, path),
 3727                               SCARG(uap, owner), SCARG(uap, group),
 3728                               SCARG(uap, flag));
 3729 }
 3730 
 3731 /*
 3732  * Set ownership given a file descriptor, providing POSIX/XPG semantics.
 3733  */
 3734 /* ARGSUSED */
 3735 int
 3736 sys___posix_fchown(struct lwp *l, const struct sys___posix_fchown_args *uap, register_t *retval)
 3737 {
 3738         /* {
 3739                 syscallarg(int) fd;
 3740                 syscallarg(uid_t) uid;
 3741                 syscallarg(gid_t) gid;
 3742         } */
 3743         int error;
 3744         file_t *fp;
 3745 
 3746         /* fd_getvnode() will use the descriptor for us */
 3747         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3748                 return (error);
 3749         error = change_owner(fp->f_vnode, SCARG(uap, uid), SCARG(uap, gid),
 3750             l, 1);
 3751         fd_putfile(SCARG(uap, fd));
 3752         return (error);
 3753 }
 3754 
 3755 /*
 3756  * Set ownership given a path name; this version does not follow links.
 3757  */
 3758 /* ARGSUSED */
 3759 int
 3760 sys_lchown(struct lwp *l, const struct sys_lchown_args *uap, register_t *retval)
 3761 {
 3762         /* {
 3763                 syscallarg(const char *) path;
 3764                 syscallarg(uid_t) uid;
 3765                 syscallarg(gid_t) gid;
 3766         } */
 3767         int error;
 3768         struct vnode *vp;
 3769 
 3770         error = namei_simple_user(SCARG(uap, path),
 3771                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
 3772         if (error != 0)
 3773                 return (error);
 3774 
 3775         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 0);
 3776 
 3777         vrele(vp);
 3778         return (error);
 3779 }
 3780 
 3781 /*
 3782  * Set ownership given a path name; this version does not follow links.
 3783  * Provides POSIX/XPG semantics.
 3784  */
 3785 /* ARGSUSED */
 3786 int
 3787 sys___posix_lchown(struct lwp *l, const struct sys___posix_lchown_args *uap, register_t *retval)
 3788 {
 3789         /* {
 3790                 syscallarg(const char *) path;
 3791                 syscallarg(uid_t) uid;
 3792                 syscallarg(gid_t) gid;
 3793         } */
 3794         int error;
 3795         struct vnode *vp;
 3796 
 3797         error = namei_simple_user(SCARG(uap, path),
 3798                                 NSM_NOFOLLOW_TRYEMULROOT, &vp);
 3799         if (error != 0)
 3800                 return (error);
 3801 
 3802         error = change_owner(vp, SCARG(uap, uid), SCARG(uap, gid), l, 1);
 3803 
 3804         vrele(vp);
 3805         return (error);
 3806 }
 3807 
 3808 /*
 3809  * Common routine to set ownership given a vnode.
 3810  */
 3811 static int
 3812 change_owner(struct vnode *vp, uid_t uid, gid_t gid, struct lwp *l,
 3813     int posix_semantics)
 3814 {
 3815         struct vattr vattr;
 3816         mode_t newmode;
 3817         int error;
 3818 
 3819         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 3820         if ((error = VOP_GETATTR(vp, &vattr, l->l_cred)) != 0)
 3821                 goto out;
 3822 
 3823 #define CHANGED(x) ((int)(x) != -1)
 3824         newmode = vattr.va_mode;
 3825         if (posix_semantics) {
 3826                 /*
 3827                  * POSIX/XPG semantics: if the caller is not the super-user,
 3828                  * clear set-user-id and set-group-id bits.  Both POSIX and
 3829                  * the XPG consider the behaviour for calls by the super-user
 3830                  * implementation-defined; we leave the set-user-id and set-
 3831                  * group-id settings intact in that case.
 3832                  */
 3833                 if (vattr.va_mode & S_ISUID) {
 3834                         if (kauth_authorize_vnode(l->l_cred,
 3835                             KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0)
 3836                                 newmode &= ~S_ISUID;
 3837                 }
 3838                 if (vattr.va_mode & S_ISGID) {
 3839                         if (kauth_authorize_vnode(l->l_cred,
 3840                             KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0)
 3841                                 newmode &= ~S_ISGID;
 3842                 }
 3843         } else {
 3844                 /*
 3845                  * NetBSD semantics: when changing owner and/or group,
 3846                  * clear the respective bit(s).
 3847                  */
 3848                 if (CHANGED(uid))
 3849                         newmode &= ~S_ISUID;
 3850                 if (CHANGED(gid))
 3851                         newmode &= ~S_ISGID;
 3852         }
 3853         /* Update va_mode iff altered. */
 3854         if (vattr.va_mode == newmode)
 3855                 newmode = VNOVAL;
 3856 
 3857         vattr_null(&vattr);
 3858         vattr.va_uid = CHANGED(uid) ? uid : (uid_t)VNOVAL;
 3859         vattr.va_gid = CHANGED(gid) ? gid : (gid_t)VNOVAL;
 3860         vattr.va_mode = newmode;
 3861         error = VOP_SETATTR(vp, &vattr, l->l_cred);
 3862 #undef CHANGED
 3863 
 3864 out:
 3865         VOP_UNLOCK(vp);
 3866         return (error);
 3867 }
 3868 
 3869 /*
 3870  * Set the access and modification times given a path name; this
 3871  * version follows links.
 3872  */
 3873 /* ARGSUSED */
 3874 int
 3875 sys___utimes50(struct lwp *l, const struct sys___utimes50_args *uap,
 3876     register_t *retval)
 3877 {
 3878         /* {
 3879                 syscallarg(const char *) path;
 3880                 syscallarg(const struct timeval *) tptr;
 3881         } */
 3882 
 3883         return do_sys_utimes(l, NULL, SCARG(uap, path), FOLLOW,
 3884             SCARG(uap, tptr), UIO_USERSPACE);
 3885 }
 3886 
 3887 /*
 3888  * Set the access and modification times given a file descriptor.
 3889  */
 3890 /* ARGSUSED */
 3891 int
 3892 sys___futimes50(struct lwp *l, const struct sys___futimes50_args *uap,
 3893     register_t *retval)
 3894 {
 3895         /* {
 3896                 syscallarg(int) fd;
 3897                 syscallarg(const struct timeval *) tptr;
 3898         } */
 3899         int error;
 3900         file_t *fp;
 3901 
 3902         /* fd_getvnode() will use the descriptor for us */
 3903         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3904                 return (error);
 3905         error = do_sys_utimes(l, fp->f_vnode, NULL, 0, SCARG(uap, tptr),
 3906             UIO_USERSPACE);
 3907         fd_putfile(SCARG(uap, fd));
 3908         return (error);
 3909 }
 3910 
 3911 int
 3912 sys_futimens(struct lwp *l, const struct sys_futimens_args *uap,
 3913     register_t *retval)
 3914 {
 3915         /* {
 3916                 syscallarg(int) fd;
 3917                 syscallarg(const struct timespec *) tptr;
 3918         } */
 3919         int error;
 3920         file_t *fp;
 3921 
 3922         /* fd_getvnode() will use the descriptor for us */
 3923         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 3924                 return (error);
 3925         error = do_sys_utimensat(l, AT_FDCWD, fp->f_vnode, NULL, 0,
 3926             SCARG(uap, tptr), UIO_USERSPACE);
 3927         fd_putfile(SCARG(uap, fd));
 3928         return (error);
 3929 }
 3930 
 3931 /*
 3932  * Set the access and modification times given a path name; this
 3933  * version does not follow links.
 3934  */
 3935 int
 3936 sys___lutimes50(struct lwp *l, const struct sys___lutimes50_args *uap,
 3937     register_t *retval)
 3938 {
 3939         /* {
 3940                 syscallarg(const char *) path;
 3941                 syscallarg(const struct timeval *) tptr;
 3942         } */
 3943 
 3944         return do_sys_utimes(l, NULL, SCARG(uap, path), NOFOLLOW,
 3945             SCARG(uap, tptr), UIO_USERSPACE);
 3946 }
 3947 
 3948 int
 3949 sys_utimensat(struct lwp *l, const struct sys_utimensat_args *uap,
 3950     register_t *retval)
 3951 {
 3952         /* {
 3953                 syscallarg(int) fd;
 3954                 syscallarg(const char *) path;
 3955                 syscallarg(const struct timespec *) tptr;
 3956                 syscallarg(int) flag;
 3957         } */
 3958         int follow;
 3959         const struct timespec *tptr;
 3960         int error;
 3961 
 3962         tptr = SCARG(uap, tptr);
 3963         follow = (SCARG(uap, flag) & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
 3964 
 3965         error = do_sys_utimensat(l, SCARG(uap, fd), NULL, 
 3966             SCARG(uap, path), follow, tptr, UIO_USERSPACE);
 3967 
 3968         return error;
 3969 }
 3970 
 3971 /*
 3972  * Common routine to set access and modification times given a vnode.
 3973  */
 3974 int
 3975 do_sys_utimens(struct lwp *l, struct vnode *vp, const char *path, int flag,
 3976     const struct timespec *tptr, enum uio_seg seg)
 3977 {
 3978         return do_sys_utimensat(l, AT_FDCWD, vp, path, flag, tptr, seg);
 3979 }
 3980 
 3981 int
 3982 do_sys_utimensat(struct lwp *l, int fdat, struct vnode *vp,
 3983     const char *path, int flag, const struct timespec *tptr, enum uio_seg seg)
 3984 {
 3985         struct vattr vattr;
 3986         int error, dorele = 0;
 3987         namei_simple_flags_t sflags;
 3988         bool vanull, setbirthtime;
 3989         struct timespec ts[2];
 3990 
 3991         KASSERT(l != NULL || fdat == AT_FDCWD);
 3992 
 3993         /* 
 3994          * I have checked all callers and they pass either FOLLOW,
 3995          * NOFOLLOW, or 0 (when they don't pass a path), and NOFOLLOW
 3996          * is 0. More to the point, they don't pass anything else.
 3997          * Let's keep it that way at least until the namei interfaces
 3998          * are fully sanitized.
 3999          */
 4000         KASSERT(flag == NOFOLLOW || flag == FOLLOW);
 4001         sflags = (flag == FOLLOW) ? 
 4002                 NSM_FOLLOW_TRYEMULROOT : NSM_NOFOLLOW_TRYEMULROOT;
 4003 
 4004         if (tptr == NULL) {
 4005                 vanull = true;
 4006                 nanotime(&ts[0]);
 4007                 ts[1] = ts[0];
 4008         } else {
 4009                 vanull = false;
 4010                 if (seg != UIO_SYSSPACE) {
 4011                         error = copyin(tptr, ts, sizeof (ts));
 4012                         if (error != 0)
 4013                                 return error;
 4014                 } else {
 4015                         ts[0] = tptr[0];
 4016                         ts[1] = tptr[1];
 4017                 }
 4018         }
 4019 
 4020         if (ts[0].tv_nsec == UTIME_NOW) {
 4021                 nanotime(&ts[0]);
 4022                 if (ts[1].tv_nsec == UTIME_NOW) {
 4023                         vanull = true;
 4024                         ts[1] = ts[0];
 4025                 }
 4026         } else if (ts[1].tv_nsec == UTIME_NOW)
 4027                 nanotime(&ts[1]);
 4028 
 4029         if (vp == NULL) {
 4030                 /* note: SEG describes TPTR, not PATH; PATH is always user */
 4031                 error = fd_nameiat_simple_user(l, fdat, path, sflags, &vp);
 4032                 if (error != 0)
 4033                         return error;
 4034                 dorele = 1;
 4035         }
 4036 
 4037         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4038         setbirthtime = (VOP_GETATTR(vp, &vattr, l->l_cred) == 0 &&
 4039             timespeccmp(&ts[1], &vattr.va_birthtime, <));
 4040         vattr_null(&vattr);
 4041 
 4042         if (ts[0].tv_nsec != UTIME_OMIT)
 4043                 vattr.va_atime = ts[0];
 4044 
 4045         if (ts[1].tv_nsec != UTIME_OMIT) {
 4046                 vattr.va_mtime = ts[1];
 4047                 if (setbirthtime)
 4048                         vattr.va_birthtime = ts[1];
 4049         }
 4050 
 4051         if (vanull)
 4052                 vattr.va_vaflags |= VA_UTIMES_NULL;
 4053         error = VOP_SETATTR(vp, &vattr, l->l_cred);
 4054         VOP_UNLOCK(vp);
 4055 
 4056         if (dorele != 0)
 4057                 vrele(vp);
 4058 
 4059         return error;
 4060 }
 4061 
 4062 int
 4063 do_sys_utimes(struct lwp *l, struct vnode *vp, const char *path, int flag,
 4064     const struct timeval *tptr, enum uio_seg seg)
 4065 {
 4066         struct timespec ts[2];
 4067         struct timespec *tsptr = NULL;
 4068         int error;
 4069         
 4070         if (tptr != NULL) {
 4071                 struct timeval tv[2];
 4072 
 4073                 if (seg != UIO_SYSSPACE) {
 4074                         error = copyin(tptr, tv, sizeof(tv));
 4075                         if (error != 0)
 4076                                 return error;
 4077                         tptr = tv;
 4078                 }
 4079 
 4080                 if ((tptr[0].tv_usec == UTIME_NOW) || 
 4081                     (tptr[0].tv_usec == UTIME_OMIT))
 4082                         ts[0].tv_nsec = tptr[0].tv_usec;
 4083                 else {
 4084                         if (tptr[0].tv_usec < 0 || tptr[0].tv_usec >= 1000000)
 4085                                 return EINVAL;
 4086 
 4087                         TIMEVAL_TO_TIMESPEC(&tptr[0], &ts[0]);
 4088                 }
 4089 
 4090                 if ((tptr[1].tv_usec == UTIME_NOW) || 
 4091                     (tptr[1].tv_usec == UTIME_OMIT))
 4092                         ts[1].tv_nsec = tptr[1].tv_usec;
 4093                 else {
 4094                         if (tptr[1].tv_usec < 0 || tptr[1].tv_usec >= 1000000)
 4095                                 return EINVAL;
 4096 
 4097                         TIMEVAL_TO_TIMESPEC(&tptr[1], &ts[1]);
 4098                 }
 4099 
 4100                 tsptr = &ts[0]; 
 4101         }
 4102 
 4103         return do_sys_utimens(l, vp, path, flag, tsptr, UIO_SYSSPACE);
 4104 }
 4105 
 4106 /*
 4107  * Truncate a file given its path name.
 4108  */
 4109 /* ARGSUSED */
 4110 int
 4111 sys_truncate(struct lwp *l, const struct sys_truncate_args *uap, register_t *retval)
 4112 {
 4113         /* {
 4114                 syscallarg(const char *) path;
 4115                 syscallarg(int) pad;
 4116                 syscallarg(off_t) length;
 4117         } */
 4118         struct vnode *vp;
 4119         struct vattr vattr;
 4120         int error;
 4121 
 4122         if (SCARG(uap, length) < 0)
 4123                 return EINVAL;
 4124 
 4125         error = namei_simple_user(SCARG(uap, path),
 4126                                 NSM_FOLLOW_TRYEMULROOT, &vp);
 4127         if (error != 0)
 4128                 return (error);
 4129         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4130         if (vp->v_type == VDIR)
 4131                 error = EISDIR;
 4132         else if ((error = vn_writechk(vp)) == 0 &&
 4133             (error = VOP_ACCESS(vp, VWRITE, l->l_cred)) == 0) {
 4134                 vattr_null(&vattr);
 4135                 vattr.va_size = SCARG(uap, length);
 4136                 error = VOP_SETATTR(vp, &vattr, l->l_cred);
 4137         }
 4138         vput(vp);
 4139         return (error);
 4140 }
 4141 
 4142 /*
 4143  * Truncate a file given a file descriptor.
 4144  */
 4145 /* ARGSUSED */
 4146 int
 4147 sys_ftruncate(struct lwp *l, const struct sys_ftruncate_args *uap, register_t *retval)
 4148 {
 4149         /* {
 4150                 syscallarg(int) fd;
 4151                 syscallarg(int) pad;
 4152                 syscallarg(off_t) length;
 4153         } */
 4154         struct vattr vattr;
 4155         struct vnode *vp;
 4156         file_t *fp;
 4157         int error;
 4158 
 4159         if (SCARG(uap, length) < 0)
 4160                 return EINVAL;
 4161 
 4162         /* fd_getvnode() will use the descriptor for us */
 4163         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 4164                 return (error);
 4165         if ((fp->f_flag & FWRITE) == 0) {
 4166                 error = EINVAL;
 4167                 goto out;
 4168         }
 4169         vp = fp->f_vnode;
 4170         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4171         if (vp->v_type == VDIR)
 4172                 error = EISDIR;
 4173         else if ((error = vn_writechk(vp)) == 0) {
 4174                 vattr_null(&vattr);
 4175                 vattr.va_size = SCARG(uap, length);
 4176                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
 4177         }
 4178         VOP_UNLOCK(vp);
 4179  out:
 4180         fd_putfile(SCARG(uap, fd));
 4181         return (error);
 4182 }
 4183 
 4184 /*
 4185  * Sync an open file.
 4186  */
 4187 /* ARGSUSED */
 4188 int
 4189 sys_fsync(struct lwp *l, const struct sys_fsync_args *uap, register_t *retval)
 4190 {
 4191         /* {
 4192                 syscallarg(int) fd;
 4193         } */
 4194         struct vnode *vp;
 4195         file_t *fp;
 4196         int error;
 4197 
 4198         /* fd_getvnode() will use the descriptor for us */
 4199         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 4200                 return (error);
 4201         vp = fp->f_vnode;
 4202         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4203         error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT, 0, 0);
 4204         VOP_UNLOCK(vp);
 4205         fd_putfile(SCARG(uap, fd));
 4206         return (error);
 4207 }
 4208 
 4209 /*
 4210  * Sync a range of file data.  API modeled after that found in AIX.
 4211  *
 4212  * FDATASYNC indicates that we need only save enough metadata to be able
 4213  * to re-read the written data.
 4214  */
 4215 /* ARGSUSED */
 4216 int
 4217 sys_fsync_range(struct lwp *l, const struct sys_fsync_range_args *uap, register_t *retval)
 4218 {
 4219         /* {
 4220                 syscallarg(int) fd;
 4221                 syscallarg(int) flags;
 4222                 syscallarg(off_t) start;
 4223                 syscallarg(off_t) length;
 4224         } */
 4225         struct vnode *vp;
 4226         file_t *fp;
 4227         int flags, nflags;
 4228         off_t s, e, len;
 4229         int error;
 4230 
 4231         /* fd_getvnode() will use the descriptor for us */
 4232         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 4233                 return (error);
 4234 
 4235         if ((fp->f_flag & FWRITE) == 0) {
 4236                 error = EBADF;
 4237                 goto out;
 4238         }
 4239 
 4240         flags = SCARG(uap, flags);
 4241         if (((flags & (FDATASYNC | FFILESYNC)) == 0) ||
 4242             ((~flags & (FDATASYNC | FFILESYNC)) == 0)) {
 4243                 error = EINVAL;
 4244                 goto out;
 4245         }
 4246         /* Now set up the flags for value(s) to pass to VOP_FSYNC() */
 4247         if (flags & FDATASYNC)
 4248                 nflags = FSYNC_DATAONLY | FSYNC_WAIT;
 4249         else
 4250                 nflags = FSYNC_WAIT;
 4251         if (flags & FDISKSYNC)
 4252                 nflags |= FSYNC_CACHE;
 4253 
 4254         len = SCARG(uap, length);
 4255         /* If length == 0, we do the whole file, and s = e = 0 will do that */
 4256         if (len) {
 4257                 s = SCARG(uap, start);
 4258                 if (s < 0 || len < 0 || len > OFF_T_MAX - s) {
 4259                         error = EINVAL;
 4260                         goto out;
 4261                 }
 4262                 e = s + len;
 4263                 KASSERT(s <= e);
 4264         } else {
 4265                 e = 0;
 4266                 s = 0;
 4267         }
 4268 
 4269         vp = fp->f_vnode;
 4270         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4271         error = VOP_FSYNC(vp, fp->f_cred, nflags, s, e);
 4272         VOP_UNLOCK(vp);
 4273 out:
 4274         fd_putfile(SCARG(uap, fd));
 4275         return (error);
 4276 }
 4277 
 4278 /*
 4279  * Sync the data of an open file.
 4280  */
 4281 /* ARGSUSED */
 4282 int
 4283 sys_fdatasync(struct lwp *l, const struct sys_fdatasync_args *uap, register_t *retval)
 4284 {
 4285         /* {
 4286                 syscallarg(int) fd;
 4287         } */
 4288         struct vnode *vp;
 4289         file_t *fp;
 4290         int error;
 4291 
 4292         /* fd_getvnode() will use the descriptor for us */
 4293         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 4294                 return (error);
 4295         vp = fp->f_vnode;
 4296         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4297         error = VOP_FSYNC(vp, fp->f_cred, FSYNC_WAIT|FSYNC_DATAONLY, 0, 0);
 4298         VOP_UNLOCK(vp);
 4299         fd_putfile(SCARG(uap, fd));
 4300         return (error);
 4301 }
 4302 
 4303 /*
 4304  * Rename files, (standard) BSD semantics frontend.
 4305  */
 4306 /* ARGSUSED */
 4307 int
 4308 sys_rename(struct lwp *l, const struct sys_rename_args *uap, register_t *retval)
 4309 {
 4310         /* {
 4311                 syscallarg(const char *) from;
 4312                 syscallarg(const char *) to;
 4313         } */
 4314 
 4315         return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD, 
 4316             SCARG(uap, to), UIO_USERSPACE, 0));
 4317 }
 4318 
 4319 int
 4320 sys_renameat(struct lwp *l, const struct sys_renameat_args *uap, 
 4321     register_t *retval)
 4322 {
 4323         /* {
 4324                 syscallarg(int) fromfd;
 4325                 syscallarg(const char *) from;
 4326                 syscallarg(int) tofd;
 4327                 syscallarg(const char *) to;
 4328         } */
 4329 
 4330         return (do_sys_renameat(l, SCARG(uap, fromfd), SCARG(uap, from),
 4331             SCARG(uap, tofd), SCARG(uap, to), UIO_USERSPACE, 0));
 4332 }
 4333 
 4334 /*
 4335  * Rename files, POSIX semantics frontend.
 4336  */
 4337 /* ARGSUSED */
 4338 int
 4339 sys___posix_rename(struct lwp *l, const struct sys___posix_rename_args *uap, register_t *retval)
 4340 {
 4341         /* {
 4342                 syscallarg(const char *) from;
 4343                 syscallarg(const char *) to;
 4344         } */
 4345 
 4346         return (do_sys_renameat(l, AT_FDCWD, SCARG(uap, from), AT_FDCWD,
 4347             SCARG(uap, to), UIO_USERSPACE, 1));
 4348 }
 4349 
 4350 /*
 4351  * Rename files.  Source and destination must either both be directories,
 4352  * or both not be directories.  If target is a directory, it must be empty.
 4353  * If `from' and `to' refer to the same object, the value of the `retain'
 4354  * argument is used to determine whether `from' will be
 4355  *
 4356  * (retain == 0)        deleted unless `from' and `to' refer to the same
 4357  *                      object in the file system's name space (BSD).
 4358  * (retain == 1)        always retained (POSIX).
 4359  *
 4360  * XXX Synchronize with nfsrv_rename in nfs_serv.c.
 4361  */
 4362 int
 4363 do_sys_rename(const char *from, const char *to, enum uio_seg seg, int retain)
 4364 {
 4365         return do_sys_renameat(NULL, AT_FDCWD, from, AT_FDCWD, to, seg, retain);
 4366 }
 4367 
 4368 static int
 4369 do_sys_renameat(struct lwp *l, int fromfd, const char *from, int tofd,
 4370     const char *to, enum uio_seg seg, int retain)
 4371 {
 4372         struct pathbuf *fpb, *tpb;
 4373         struct nameidata fnd, tnd;
 4374         struct vnode *fdvp, *fvp;
 4375         struct vnode *tdvp, *tvp;
 4376         struct mount *mp, *tmp;
 4377         int error;
 4378 
 4379         KASSERT(l != NULL || (fromfd == AT_FDCWD && tofd == AT_FDCWD));
 4380 
 4381         error = pathbuf_maybe_copyin(from, seg, &fpb);
 4382         if (error)
 4383                 goto out0;
 4384         KASSERT(fpb != NULL);
 4385 
 4386         error = pathbuf_maybe_copyin(to, seg, &tpb);
 4387         if (error)
 4388                 goto out1;
 4389         KASSERT(tpb != NULL);
 4390 
 4391         /*
 4392          * Lookup from.
 4393          *
 4394          * XXX LOCKPARENT is wrong because we don't actually want it
 4395          * locked yet, but (a) namei is insane, and (b) VOP_RENAME is
 4396          * insane, so for the time being we need to leave it like this.
 4397          */
 4398         NDINIT(&fnd, DELETE, (LOCKPARENT | TRYEMULROOT), fpb);
 4399         if ((error = fd_nameiat(l, fromfd, &fnd)) != 0)
 4400                 goto out2;
 4401 
 4402         /*
 4403          * Pull out the important results of the lookup, fdvp and fvp.
 4404          * Of course, fvp is bogus because we're about to unlock fdvp.
 4405          */
 4406         fdvp = fnd.ni_dvp;
 4407         fvp = fnd.ni_vp;
 4408         mp = fdvp->v_mount;
 4409         KASSERT(fdvp != NULL);
 4410         KASSERT(fvp != NULL);
 4411         KASSERT((fdvp == fvp) || (VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE));
 4412         /*
 4413          * Bracket the operation with fstrans_start()/fstrans_done().
 4414          *
 4415          * Inside the bracket this file system cannot be unmounted so
 4416          * a vnode on this file system cannot change its v_mount.
 4417          * A vnode on another file system may still change to dead mount.
 4418          */
 4419         fstrans_start(mp);
 4420 
 4421         /*
 4422          * Make sure neither fdvp nor fvp is locked.
 4423          */
 4424         if (fdvp != fvp)
 4425                 VOP_UNLOCK(fdvp);
 4426         /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
 4427         /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
 4428 
 4429         /*
 4430          * Reject renaming `.' and `..'.  Can't do this until after
 4431          * namei because we need namei's parsing to find the final
 4432          * component name.  (namei should just leave us with the final
 4433          * component name and not look it up itself, but anyway...)
 4434          *
 4435          * This was here before because we used to relookup from
 4436          * instead of to and relookup requires the caller to check
 4437          * this, but now file systems may depend on this check, so we
 4438          * must retain it until the file systems are all rototilled.
 4439          */
 4440         if (((fnd.ni_cnd.cn_namelen == 1) &&
 4441                 (fnd.ni_cnd.cn_nameptr[0] == '.')) ||
 4442             ((fnd.ni_cnd.cn_namelen == 2) &&
 4443                 (fnd.ni_cnd.cn_nameptr[0] == '.') &&
 4444                 (fnd.ni_cnd.cn_nameptr[1] == '.'))) {
 4445                 error = EINVAL; /* XXX EISDIR?  */
 4446                 goto abort0;
 4447         }
 4448 
 4449         /*
 4450          * Lookup to.
 4451          *
 4452          * XXX LOCKPARENT is wrong, but...insanity, &c.  Also, using
 4453          * fvp here to decide whether to add CREATEDIR is a load of
 4454          * bollocks because fvp might be the wrong node by now, since
 4455          * fdvp is unlocked.
 4456          *
 4457          * XXX Why not pass CREATEDIR always?
 4458          */
 4459         NDINIT(&tnd, RENAME,
 4460             (LOCKPARENT | NOCACHE | TRYEMULROOT |
 4461                 ((fvp->v_type == VDIR)? CREATEDIR : 0)),
 4462             tpb);
 4463         if ((error = fd_nameiat(l, tofd, &tnd)) != 0)
 4464                 goto abort0;
 4465 
 4466         /*
 4467          * Pull out the important results of the lookup, tdvp and tvp.
 4468          * Of course, tvp is bogus because we're about to unlock tdvp.
 4469          */
 4470         tdvp = tnd.ni_dvp;
 4471         tvp = tnd.ni_vp;
 4472         KASSERT(tdvp != NULL);
 4473         KASSERT((tdvp == tvp) || (VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE));
 4474 
 4475         if (fvp->v_type == VDIR)
 4476                 tnd.ni_cnd.cn_flags |= WILLBEDIR;
 4477         /*
 4478          * Make sure neither tdvp nor tvp is locked.
 4479          */
 4480         if (tdvp != tvp)
 4481                 VOP_UNLOCK(tdvp);
 4482         /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
 4483         /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
 4484 
 4485         /*
 4486          * Reject renaming onto `.' or `..'.  relookup is unhappy with
 4487          * these, which is why we must do this here.  Once upon a time
 4488          * we relooked up from instead of to, and consequently didn't
 4489          * need this check, but now that we relookup to instead of
 4490          * from, we need this; and we shall need it forever forward
 4491          * until the VOP_RENAME protocol changes, because file systems
 4492          * will no doubt begin to depend on this check.
 4493          */
 4494         if ((tnd.ni_cnd.cn_namelen == 1) && (tnd.ni_cnd.cn_nameptr[0] == '.')) {
 4495                 error = EISDIR;
 4496                 goto abort1;
 4497         }
 4498         if ((tnd.ni_cnd.cn_namelen == 2) &&
 4499             (tnd.ni_cnd.cn_nameptr[0] == '.') &&
 4500             (tnd.ni_cnd.cn_nameptr[1] == '.')) {
 4501                 error = EINVAL;
 4502                 goto abort1;
 4503         }
 4504 
 4505         /*
 4506          * Make sure the mount points match.  Although we don't hold
 4507          * any vnode locks, the v_mount on fdvp file system are stable.
 4508          *
 4509          * Unmounting another file system at an inopportune moment may
 4510          * cause tdvp to disappear and change its v_mount to dead.
 4511          *
 4512          * So in either case different v_mount means cross-device rename.
 4513          */
 4514         KASSERT(mp != NULL);
 4515         tmp = tdvp->v_mount;
 4516 
 4517         if (mp != tmp) {
 4518                 error = EXDEV;
 4519                 goto abort1;
 4520         }
 4521 
 4522         /*
 4523          * Take the vfs rename lock to avoid cross-directory screw cases.
 4524          * Nothing is locked currently, so taking this lock is safe.
 4525          */
 4526         error = VFS_RENAMELOCK_ENTER(mp);
 4527         if (error)
 4528                 goto abort1;
 4529 
 4530         /*
 4531          * Now fdvp, fvp, tdvp, and (if nonnull) tvp are referenced,
 4532          * and nothing is locked except for the vfs rename lock.
 4533          *
 4534          * The next step is a little rain dance to conform to the
 4535          * insane lock protocol, even though it does nothing to ward
 4536          * off race conditions.
 4537          *
 4538          * We need tdvp and tvp to be locked.  However, because we have
 4539          * unlocked tdvp in order to hold no locks while we take the
 4540          * vfs rename lock, tvp may be wrong here, and we can't safely
 4541          * lock it even if the sensible file systems will just unlock
 4542          * it straight away.  Consequently, we must lock tdvp and then
 4543          * relookup tvp to get it locked.
 4544          *
 4545          * Finally, because the VOP_RENAME protocol is brain-damaged
 4546          * and various file systems insanely depend on the semantics of
 4547          * this brain damage, the lookup of to must be the last lookup
 4548          * before VOP_RENAME.
 4549          */
 4550         vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
 4551         error = relookup(tdvp, &tnd.ni_vp, &tnd.ni_cnd, 0);
 4552         if (error)
 4553                 goto abort2;
 4554 
 4555         /*
 4556          * Drop the old tvp and pick up the new one -- which might be
 4557          * the same, but that doesn't matter to us.  After this, tdvp
 4558          * and tvp should both be locked.
 4559          */
 4560         if (tvp != NULL)
 4561                 vrele(tvp);
 4562         tvp = tnd.ni_vp;
 4563         KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
 4564         KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
 4565 
 4566         /*
 4567          * The old do_sys_rename had various consistency checks here
 4568          * involving fvp and tvp.  fvp is bogus already here, and tvp
 4569          * will become bogus soon in any sensible file system, so the
 4570          * only purpose in putting these checks here is to give lip
 4571          * service to these screw cases and to acknowledge that they
 4572          * exist, not actually to handle them, but here you go
 4573          * anyway...
 4574          */
 4575 
 4576         /*
 4577          * Acknowledge that directories and non-directories aren't
 4578          * supposed to mix.
 4579          */
 4580         if (tvp != NULL) {
 4581                 if ((fvp->v_type == VDIR) && (tvp->v_type != VDIR)) {
 4582                         error = ENOTDIR;
 4583                         goto abort3;
 4584                 } else if ((fvp->v_type != VDIR) && (tvp->v_type == VDIR)) {
 4585                         error = EISDIR;
 4586                         goto abort3;
 4587                 }
 4588         }
 4589 
 4590         /*
 4591          * Acknowledge some random screw case, among the dozens that
 4592          * might arise.
 4593          */
 4594         if (fvp == tdvp) {
 4595                 error = EINVAL;
 4596                 goto abort3;
 4597         }
 4598 
 4599         /*
 4600          * Acknowledge that POSIX has a wacky screw case.
 4601          *
 4602          * XXX Eventually the retain flag needs to be passed on to
 4603          * VOP_RENAME.
 4604          */
 4605         if (fvp == tvp) {
 4606                 if (retain) {
 4607                         error = 0;
 4608                         goto abort3;
 4609                 } else if ((fdvp == tdvp) &&
 4610                     (fnd.ni_cnd.cn_namelen == tnd.ni_cnd.cn_namelen) &&
 4611                     (0 == memcmp(fnd.ni_cnd.cn_nameptr, tnd.ni_cnd.cn_nameptr,
 4612                         fnd.ni_cnd.cn_namelen))) {
 4613                         error = 0;
 4614                         goto abort3;
 4615                 }
 4616         }
 4617 
 4618         /*
 4619          * Make sure veriexec can screw us up.  (But a race can screw
 4620          * up veriexec, of course -- remember, fvp and (soon) tvp are
 4621          * bogus.)
 4622          */
 4623 #if NVERIEXEC > 0
 4624         {
 4625                 char *f1, *f2;
 4626                 size_t f1_len;
 4627                 size_t f2_len;
 4628 
 4629                 f1_len = fnd.ni_cnd.cn_namelen + 1;
 4630                 f1 = kmem_alloc(f1_len, KM_SLEEP);
 4631                 strlcpy(f1, fnd.ni_cnd.cn_nameptr, f1_len);
 4632 
 4633                 f2_len = tnd.ni_cnd.cn_namelen + 1;
 4634                 f2 = kmem_alloc(f2_len, KM_SLEEP);
 4635                 strlcpy(f2, tnd.ni_cnd.cn_nameptr, f2_len);
 4636 
 4637                 error = veriexec_renamechk(curlwp, fvp, f1, tvp, f2);
 4638 
 4639                 kmem_free(f1, f1_len);
 4640                 kmem_free(f2, f2_len);
 4641 
 4642                 if (error)
 4643                         goto abort3;
 4644         }
 4645 #endif /* NVERIEXEC > 0 */
 4646 
 4647         /*
 4648          * All ready.  Incant the rename vop.
 4649          */
 4650         /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
 4651         /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
 4652         KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
 4653         KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
 4654         error = VOP_RENAME(fdvp, fvp, &fnd.ni_cnd, tdvp, tvp, &tnd.ni_cnd);
 4655 
 4656         /*
 4657          * VOP_RENAME releases fdvp, fvp, tdvp, and tvp, and unlocks
 4658          * tdvp and tvp.  But we can't assert any of that.
 4659          */
 4660         /* XXX KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
 4661         /* XXX KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
 4662         /* XXX KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
 4663         /* XXX KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) != LK_EXCLUSIVE)); */
 4664 
 4665         /*
 4666          * So all we have left to do is to drop the rename lock and
 4667          * destroy the pathbufs.
 4668          */
 4669         VFS_RENAMELOCK_EXIT(mp);
 4670         fstrans_done(mp);
 4671         goto out2;
 4672 
 4673 abort3: if ((tvp != NULL) && (tvp != tdvp))
 4674                 VOP_UNLOCK(tvp);
 4675 abort2: VOP_UNLOCK(tdvp);
 4676         VFS_RENAMELOCK_EXIT(mp);
 4677 abort1: VOP_ABORTOP(tdvp, &tnd.ni_cnd);
 4678         vrele(tdvp);
 4679         if (tvp != NULL)
 4680                 vrele(tvp);
 4681 abort0: VOP_ABORTOP(fdvp, &fnd.ni_cnd);
 4682         vrele(fdvp);
 4683         vrele(fvp);
 4684         fstrans_done(mp);
 4685 out2:   pathbuf_destroy(tpb);
 4686 out1:   pathbuf_destroy(fpb);
 4687 out0:   return error;
 4688 }
 4689 
 4690 /*
 4691  * Make a directory file.
 4692  */
 4693 /* ARGSUSED */
 4694 int
 4695 sys_mkdir(struct lwp *l, const struct sys_mkdir_args *uap, register_t *retval)
 4696 {
 4697         /* {
 4698                 syscallarg(const char *) path;
 4699                 syscallarg(int) mode;
 4700         } */
 4701 
 4702         return do_sys_mkdirat(l, AT_FDCWD, SCARG(uap, path),
 4703             SCARG(uap, mode), UIO_USERSPACE);
 4704 }
 4705 
 4706 int
 4707 sys_mkdirat(struct lwp *l, const struct sys_mkdirat_args *uap,
 4708     register_t *retval)
 4709 {
 4710         /* {
 4711                 syscallarg(int) fd;
 4712                 syscallarg(const char *) path;
 4713                 syscallarg(int) mode;
 4714         } */
 4715 
 4716         return do_sys_mkdirat(l, SCARG(uap, fd), SCARG(uap, path),
 4717             SCARG(uap, mode), UIO_USERSPACE);
 4718 }
 4719 
 4720 
 4721 int
 4722 do_sys_mkdir(const char *path, mode_t mode, enum uio_seg seg)
 4723 {
 4724         return do_sys_mkdirat(NULL, AT_FDCWD, path, mode, seg);
 4725 }
 4726 
 4727 static int
 4728 do_sys_mkdirat(struct lwp *l, int fdat, const char *path, mode_t mode,
 4729     enum uio_seg seg)
 4730 {
 4731         struct proc *p = curlwp->l_proc;
 4732         struct vnode *vp;
 4733         struct vattr vattr;
 4734         int error;
 4735         struct pathbuf *pb;
 4736         struct nameidata nd;
 4737 
 4738         KASSERT(l != NULL || fdat == AT_FDCWD);
 4739 
 4740         /* XXX bollocks, should pass in a pathbuf */
 4741         error = pathbuf_maybe_copyin(path, seg, &pb);
 4742         if (error) {
 4743                 return error;
 4744         }
 4745 
 4746         NDINIT(&nd, CREATE, LOCKPARENT | CREATEDIR | TRYEMULROOT, pb);
 4747 
 4748         if ((error = fd_nameiat(l, fdat, &nd)) != 0) {
 4749                 pathbuf_destroy(pb);
 4750                 return (error);
 4751         }
 4752         vp = nd.ni_vp;
 4753         if (vp != NULL) {
 4754                 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
 4755                 if (nd.ni_dvp == vp)
 4756                         vrele(nd.ni_dvp);
 4757                 else
 4758                         vput(nd.ni_dvp);
 4759                 vrele(vp);
 4760                 pathbuf_destroy(pb);
 4761                 return (EEXIST);
 4762         }
 4763         vattr_null(&vattr);
 4764         vattr.va_type = VDIR;
 4765         /* We will read cwdi->cwdi_cmask unlocked. */
 4766         vattr.va_mode = (mode & ACCESSPERMS) &~ p->p_cwdi->cwdi_cmask;
 4767         nd.ni_cnd.cn_flags |= WILLBEDIR;
 4768         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 4769         if (!error)
 4770                 vrele(nd.ni_vp);
 4771         vput(nd.ni_dvp);
 4772         pathbuf_destroy(pb);
 4773         return (error);
 4774 }
 4775 
 4776 /*
 4777  * Remove a directory file.
 4778  */
 4779 /* ARGSUSED */
 4780 int
 4781 sys_rmdir(struct lwp *l, const struct sys_rmdir_args *uap, register_t *retval)
 4782 {
 4783         return do_sys_unlinkat(l, AT_FDCWD, SCARG(uap, path),
 4784             AT_REMOVEDIR, UIO_USERSPACE);
 4785 }
 4786 
 4787 /*
 4788  * Read a block of directory entries in a file system independent format.
 4789  */
 4790 int
 4791 sys___getdents30(struct lwp *l, const struct sys___getdents30_args *uap, register_t *retval)
 4792 {
 4793         /* {
 4794                 syscallarg(int) fd;
 4795                 syscallarg(char *) buf;
 4796                 syscallarg(size_t) count;
 4797         } */
 4798         file_t *fp;
 4799         int error, done;
 4800 
 4801         /* fd_getvnode() will use the descriptor for us */
 4802         if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
 4803                 return (error);
 4804         if ((fp->f_flag & FREAD) == 0) {
 4805                 error = EBADF;
 4806                 goto out;
 4807         }
 4808         error = vn_readdir(fp, SCARG(uap, buf), UIO_USERSPACE,
 4809                         SCARG(uap, count), &done, l, 0, 0);
 4810         ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, buf), done, error);
 4811         *retval = done;
 4812  out:
 4813         fd_putfile(SCARG(uap, fd));
 4814         return (error);
 4815 }
 4816 
 4817 /*
 4818  * Set the mode mask for creation of filesystem nodes.
 4819  */
 4820 int
 4821 sys_umask(struct lwp *l, const struct sys_umask_args *uap, register_t *retval)
 4822 {
 4823         /* {
 4824                 syscallarg(mode_t) newmask;
 4825         } */
 4826 
 4827         /*
 4828          * cwdi->cwdi_cmask will be read unlocked elsewhere, and no kind of
 4829          * serialization with those reads is required.  It's important to
 4830          * return a coherent answer for the caller of umask() though, and
 4831          * the atomic operation accomplishes that.
 4832          */
 4833         *retval = atomic_swap_uint(&curproc->p_cwdi->cwdi_cmask,
 4834             SCARG(uap, newmask) & ALLPERMS);
 4835 
 4836         return (0);
 4837 }
 4838 
 4839 int
 4840 dorevoke(struct vnode *vp, kauth_cred_t cred)
 4841 {
 4842         struct vattr vattr;
 4843         int error, fs_decision;
 4844 
 4845         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4846         error = VOP_GETATTR(vp, &vattr, cred);
 4847         VOP_UNLOCK(vp);
 4848         if (error != 0)
 4849                 return error;
 4850         fs_decision = (kauth_cred_geteuid(cred) == vattr.va_uid) ? 0 : EPERM;
 4851         error = kauth_authorize_vnode(cred, KAUTH_VNODE_REVOKE, vp, NULL,
 4852             fs_decision);
 4853         if (!error)
 4854                 VOP_REVOKE(vp, REVOKEALL);
 4855         return (error);
 4856 }
 4857 
 4858 /*
 4859  * Void all references to file by ripping underlying filesystem
 4860  * away from vnode.
 4861  */
 4862 /* ARGSUSED */
 4863 int
 4864 sys_revoke(struct lwp *l, const struct sys_revoke_args *uap, register_t *retval)
 4865 {
 4866         /* {
 4867                 syscallarg(const char *) path;
 4868         } */
 4869         struct vnode *vp;
 4870         int error;
 4871 
 4872         error = namei_simple_user(SCARG(uap, path),
 4873                                 NSM_FOLLOW_TRYEMULROOT, &vp);
 4874         if (error != 0)
 4875                 return (error);
 4876         error = dorevoke(vp, l->l_cred);
 4877         vrele(vp);
 4878         return (error);
 4879 }
 4880 
 4881 /*
 4882  * Allocate backing store for a file, filling a hole without having to
 4883  * explicitly write anything out.
 4884  */
 4885 /* ARGSUSED */
 4886 int
 4887 sys_posix_fallocate(struct lwp *l, const struct sys_posix_fallocate_args *uap,
 4888                 register_t *retval)
 4889 {
 4890         /* {
 4891                 syscallarg(int) fd;
 4892                 syscallarg(off_t) pos;
 4893                 syscallarg(off_t) len;
 4894         } */
 4895         int fd;
 4896         off_t pos, len;
 4897         struct file *fp;
 4898         struct vnode *vp;
 4899         int error;
 4900 
 4901         fd = SCARG(uap, fd);
 4902         pos = SCARG(uap, pos);
 4903         len = SCARG(uap, len);
 4904         
 4905         if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
 4906                 *retval = EINVAL;
 4907                 return 0;
 4908         }
 4909         
 4910         error = fd_getvnode(fd, &fp);
 4911         if (error) {
 4912                 *retval = error;
 4913                 return 0;
 4914         }
 4915         if ((fp->f_flag & FWRITE) == 0) {
 4916                 error = EBADF;
 4917                 goto fail;
 4918         }
 4919         vp = fp->f_vnode;
 4920 
 4921         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4922         if (vp->v_type == VDIR) {
 4923                 error = EISDIR;
 4924         } else {
 4925                 error = VOP_FALLOCATE(vp, pos, len);
 4926         }
 4927         VOP_UNLOCK(vp);
 4928 
 4929 fail:
 4930         fd_putfile(fd);
 4931         *retval = error;
 4932         return 0;
 4933 }
 4934 
 4935 /*
 4936  * Deallocate backing store for a file, creating a hole. Also used for
 4937  * invoking TRIM on disks.
 4938  */
 4939 /* ARGSUSED */
 4940 int
 4941 sys_fdiscard(struct lwp *l, const struct sys_fdiscard_args *uap,
 4942                 register_t *retval)
 4943 {
 4944         /* {
 4945                 syscallarg(int) fd;
 4946                 syscallarg(off_t) pos;
 4947                 syscallarg(off_t) len;
 4948         } */
 4949         int fd;
 4950         off_t pos, len;
 4951         struct file *fp;
 4952         struct vnode *vp;
 4953         int error;
 4954 
 4955         fd = SCARG(uap, fd);
 4956         pos = SCARG(uap, pos);
 4957         len = SCARG(uap, len);
 4958 
 4959         if (pos < 0 || len < 0 || len > OFF_T_MAX - pos) {
 4960                 return EINVAL;
 4961         }
 4962         
 4963         error = fd_getvnode(fd, &fp);
 4964         if (error) {
 4965                 return error;
 4966         }
 4967         if ((fp->f_flag & FWRITE) == 0) {
 4968                 error = EBADF;
 4969                 goto fail;
 4970         }
 4971         vp = fp->f_vnode;
 4972 
 4973         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 4974         if (vp->v_type == VDIR) {
 4975                 error = EISDIR;
 4976         } else {
 4977                 error = VOP_FDISCARD(vp, pos, len);
 4978         }
 4979         VOP_UNLOCK(vp);
 4980 
 4981 fail:
 4982         fd_putfile(fd);
 4983         return error;
 4984 }
Cache object: 657b25b00a1476d30e6ac3200b51bf42
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/vfs_syscalls.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_syscalls.c