The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_syscalls.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1989, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)vfs_syscalls.c      8.13 (Berkeley) 4/15/94
   37  */
   38 
   39 #include <sys/cdefs.h>
   40 __FBSDID("$FreeBSD$");
   41 
   42 #include "opt_capsicum.h"
   43 #include "opt_ktrace.h"
   44 
   45 #include <sys/param.h>
   46 #include <sys/systm.h>
   47 #include <sys/bio.h>
   48 #include <sys/buf.h>
   49 #include <sys/capsicum.h>
   50 #include <sys/disk.h>
   51 #include <sys/malloc.h>
   52 #include <sys/mount.h>
   53 #include <sys/mutex.h>
   54 #include <sys/sysproto.h>
   55 #include <sys/namei.h>
   56 #include <sys/filedesc.h>
   57 #include <sys/kernel.h>
   58 #include <sys/fcntl.h>
   59 #include <sys/file.h>
   60 #include <sys/filio.h>
   61 #include <sys/limits.h>
   62 #include <sys/linker.h>
   63 #include <sys/rwlock.h>
   64 #include <sys/sdt.h>
   65 #include <sys/stat.h>
   66 #include <sys/sx.h>
   67 #include <sys/unistd.h>
   68 #include <sys/vnode.h>
   69 #include <sys/priv.h>
   70 #include <sys/proc.h>
   71 #include <sys/dirent.h>
   72 #include <sys/jail.h>
   73 #include <sys/syscallsubr.h>
   74 #include <sys/sysctl.h>
   75 #ifdef KTRACE
   76 #include <sys/ktrace.h>
   77 #endif
   78 
   79 #include <machine/stdarg.h>
   80 
   81 #include <security/audit/audit.h>
   82 #include <security/mac/mac_framework.h>
   83 
   84 #include <vm/vm.h>
   85 #include <vm/vm_object.h>
   86 #include <vm/vm_page.h>
   87 #include <vm/uma.h>
   88 
   89 #include <fs/devfs/devfs.h>
   90 
   91 #include <ufs/ufs/quota.h>
   92 
   93 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
   94 
   95 static int kern_chflagsat(struct thread *td, int fd, const char *path,
   96     enum uio_seg pathseg, u_long flags, int atflag);
   97 static int setfflags(struct thread *td, struct vnode *, u_long);
   98 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
   99 static int getutimens(const struct timespec *, enum uio_seg,
  100     struct timespec *, int *);
  101 static int setutimes(struct thread *td, struct vnode *,
  102     const struct timespec *, int, int);
  103 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
  104     struct thread *td);
  105 static int kern_fhlinkat(struct thread *td, int fd, const char *path,
  106     enum uio_seg pathseg, fhandle_t *fhp);
  107 static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
  108     size_t count, struct thread *td);
  109 static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
  110     const char *path, enum uio_seg segflag);
  111 
  112 static uint64_t
  113 at2cnpflags(u_int at_flags, u_int mask)
  114 {
  115         u_int64_t res;
  116 
  117         MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) !=
  118             (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW));
  119 
  120         res = 0;
  121         at_flags &= mask;
  122         if ((at_flags & AT_RESOLVE_BENEATH) != 0)
  123                 res |= RBENEATH;
  124         if ((at_flags & AT_SYMLINK_FOLLOW) != 0)
  125                 res |= FOLLOW;
  126         /* NOFOLLOW is pseudo flag */
  127         if ((mask & AT_SYMLINK_NOFOLLOW) != 0) {
  128                 res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
  129                     FOLLOW;
  130         }
  131         if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0)
  132                 res |= EMPTYPATH;
  133         return (res);
  134 }
  135 
  136 int
  137 kern_sync(struct thread *td)
  138 {
  139         struct mount *mp, *nmp;
  140         int save;
  141 
  142         mtx_lock(&mountlist_mtx);
  143         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
  144                 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
  145                         nmp = TAILQ_NEXT(mp, mnt_list);
  146                         continue;
  147                 }
  148                 if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
  149                     vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
  150                         save = curthread_pflags_set(TDP_SYNCIO);
  151                         vfs_periodic(mp, MNT_NOWAIT);
  152                         VFS_SYNC(mp, MNT_NOWAIT);
  153                         curthread_pflags_restore(save);
  154                         vn_finished_write(mp);
  155                 }
  156                 mtx_lock(&mountlist_mtx);
  157                 nmp = TAILQ_NEXT(mp, mnt_list);
  158                 vfs_unbusy(mp);
  159         }
  160         mtx_unlock(&mountlist_mtx);
  161         return (0);
  162 }
  163 
  164 /*
  165  * Sync each mounted filesystem.
  166  */
  167 #ifndef _SYS_SYSPROTO_H_
  168 struct sync_args {
  169         int     dummy;
  170 };
  171 #endif
  172 /* ARGSUSED */
  173 int
  174 sys_sync(struct thread *td, struct sync_args *uap)
  175 {
  176 
  177         return (kern_sync(td));
  178 }
  179 
  180 /*
  181  * Change filesystem quotas.
  182  */
  183 #ifndef _SYS_SYSPROTO_H_
  184 struct quotactl_args {
  185         char *path;
  186         int cmd;
  187         int uid;
  188         caddr_t arg;
  189 };
  190 #endif
  191 int
  192 sys_quotactl(struct thread *td, struct quotactl_args *uap)
  193 {
  194         struct mount *mp;
  195         struct nameidata nd;
  196         int error;
  197 
  198         AUDIT_ARG_CMD(uap->cmd);
  199         AUDIT_ARG_UID(uap->uid);
  200         if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
  201                 return (EPERM);
  202         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
  203             uap->path, td);
  204         if ((error = namei(&nd)) != 0)
  205                 return (error);
  206         NDFREE(&nd, NDF_ONLY_PNBUF);
  207         mp = nd.ni_vp->v_mount;
  208         vfs_ref(mp);
  209         vput(nd.ni_vp);
  210         error = vfs_busy(mp, 0);
  211         if (error != 0) {
  212                 vfs_rel(mp);
  213                 return (error);
  214         }
  215         error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
  216 
  217         /*
  218          * Since quota on operation typically needs to open quota
  219          * file, the Q_QUOTAON handler needs to unbusy the mount point
  220          * before calling into namei.  Otherwise, unmount might be
  221          * started between two vfs_busy() invocations (first is our,
  222          * second is from mount point cross-walk code in lookup()),
  223          * causing deadlock.
  224          *
  225          * Require that Q_QUOTAON handles the vfs_busy() reference on
  226          * its own, always returning with ubusied mount point.
  227          */
  228         if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON &&
  229             (uap->cmd >> SUBCMDSHIFT) != Q_QUOTAOFF)
  230                 vfs_unbusy(mp);
  231         vfs_rel(mp);
  232         return (error);
  233 }
  234 
  235 /*
  236  * Used by statfs conversion routines to scale the block size up if
  237  * necessary so that all of the block counts are <= 'max_size'.  Note
  238  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
  239  * value of 'n'.
  240  */
  241 void
  242 statfs_scale_blocks(struct statfs *sf, long max_size)
  243 {
  244         uint64_t count;
  245         int shift;
  246 
  247         KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
  248 
  249         /*
  250          * Attempt to scale the block counts to give a more accurate
  251          * overview to userland of the ratio of free space to used
  252          * space.  To do this, find the largest block count and compute
  253          * a divisor that lets it fit into a signed integer <= max_size.
  254          */
  255         if (sf->f_bavail < 0)
  256                 count = -sf->f_bavail;
  257         else
  258                 count = sf->f_bavail;
  259         count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
  260         if (count <= max_size)
  261                 return;
  262 
  263         count >>= flsl(max_size);
  264         shift = 0;
  265         while (count > 0) {
  266                 shift++;
  267                 count >>=1;
  268         }
  269 
  270         sf->f_bsize <<= shift;
  271         sf->f_blocks >>= shift;
  272         sf->f_bfree >>= shift;
  273         sf->f_bavail >>= shift;
  274 }
  275 
  276 static int
  277 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
  278 {
  279         int error;
  280 
  281         if (mp == NULL)
  282                 return (EBADF);
  283         error = vfs_busy(mp, 0);
  284         vfs_rel(mp);
  285         if (error != 0)
  286                 return (error);
  287 #ifdef MAC
  288         error = mac_mount_check_stat(td->td_ucred, mp);
  289         if (error != 0)
  290                 goto out;
  291 #endif
  292         error = VFS_STATFS(mp, buf);
  293         if (error != 0)
  294                 goto out;
  295         if (priv_check_cred_vfs_generation(td->td_ucred)) {
  296                 buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
  297                 prison_enforce_statfs(td->td_ucred, mp, buf);
  298         }
  299 out:
  300         vfs_unbusy(mp);
  301         return (error);
  302 }
  303 
  304 /*
  305  * Get filesystem statistics.
  306  */
  307 #ifndef _SYS_SYSPROTO_H_
  308 struct statfs_args {
  309         char *path;
  310         struct statfs *buf;
  311 };
  312 #endif
  313 int
  314 sys_statfs(struct thread *td, struct statfs_args *uap)
  315 {
  316         struct statfs *sfp;
  317         int error;
  318 
  319         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  320         error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
  321         if (error == 0)
  322                 error = copyout(sfp, uap->buf, sizeof(struct statfs));
  323         free(sfp, M_STATFS);
  324         return (error);
  325 }
  326 
  327 int
  328 kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg,
  329     struct statfs *buf)
  330 {
  331         struct mount *mp;
  332         struct nameidata nd;
  333         int error;
  334 
  335         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
  336         error = namei(&nd);
  337         if (error != 0)
  338                 return (error);
  339         mp = vfs_ref_from_vp(nd.ni_vp);
  340         NDFREE_NOTHING(&nd);
  341         vrele(nd.ni_vp);
  342         return (kern_do_statfs(td, mp, buf));
  343 }
  344 
  345 /*
  346  * Get filesystem statistics.
  347  */
  348 #ifndef _SYS_SYSPROTO_H_
  349 struct fstatfs_args {
  350         int fd;
  351         struct statfs *buf;
  352 };
  353 #endif
  354 int
  355 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
  356 {
  357         struct statfs *sfp;
  358         int error;
  359 
  360         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  361         error = kern_fstatfs(td, uap->fd, sfp);
  362         if (error == 0)
  363                 error = copyout(sfp, uap->buf, sizeof(struct statfs));
  364         free(sfp, M_STATFS);
  365         return (error);
  366 }
  367 
  368 int
  369 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
  370 {
  371         struct file *fp;
  372         struct mount *mp;
  373         struct vnode *vp;
  374         int error;
  375 
  376         AUDIT_ARG_FD(fd);
  377         error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp);
  378         if (error != 0)
  379                 return (error);
  380         vp = fp->f_vnode;
  381 #ifdef AUDIT
  382         if (AUDITING_TD(td)) {
  383                 vn_lock(vp, LK_SHARED | LK_RETRY);
  384                 AUDIT_ARG_VNODE1(vp);
  385                 VOP_UNLOCK(vp);
  386         }
  387 #endif
  388         mp = vfs_ref_from_vp(vp);
  389         fdrop(fp, td);
  390         return (kern_do_statfs(td, mp, buf));
  391 }
  392 
  393 /*
  394  * Get statistics on all filesystems.
  395  */
  396 #ifndef _SYS_SYSPROTO_H_
  397 struct getfsstat_args {
  398         struct statfs *buf;
  399         long bufsize;
  400         int mode;
  401 };
  402 #endif
  403 int
  404 sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
  405 {
  406         size_t count;
  407         int error;
  408 
  409         if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
  410                 return (EINVAL);
  411         error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
  412             UIO_USERSPACE, uap->mode);
  413         if (error == 0)
  414                 td->td_retval[0] = count;
  415         return (error);
  416 }
  417 
  418 /*
  419  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  420  *      The caller is responsible for freeing memory which will be allocated
  421  *      in '*buf'.
  422  */
  423 int
  424 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
  425     size_t *countp, enum uio_seg bufseg, int mode)
  426 {
  427         struct mount *mp, *nmp;
  428         struct statfs *sfsp, *sp, *sptmp, *tofree;
  429         size_t count, maxcount;
  430         int error;
  431 
  432         switch (mode) {
  433         case MNT_WAIT:
  434         case MNT_NOWAIT:
  435                 break;
  436         default:
  437                 if (bufseg == UIO_SYSSPACE)
  438                         *buf = NULL;
  439                 return (EINVAL);
  440         }
  441 restart:
  442         maxcount = bufsize / sizeof(struct statfs);
  443         if (bufsize == 0) {
  444                 sfsp = NULL;
  445                 tofree = NULL;
  446         } else if (bufseg == UIO_USERSPACE) {
  447                 sfsp = *buf;
  448                 tofree = NULL;
  449         } else /* if (bufseg == UIO_SYSSPACE) */ {
  450                 count = 0;
  451                 mtx_lock(&mountlist_mtx);
  452                 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
  453                         count++;
  454                 }
  455                 mtx_unlock(&mountlist_mtx);
  456                 if (maxcount > count)
  457                         maxcount = count;
  458                 tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
  459                     M_STATFS, M_WAITOK);
  460         }
  461 
  462         count = 0;
  463 
  464         /*
  465          * If there is no target buffer they only want the count.
  466          *
  467          * This could be TAILQ_FOREACH but it is open-coded to match the original
  468          * code below.
  469          */
  470         if (sfsp == NULL) {
  471                 mtx_lock(&mountlist_mtx);
  472                 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
  473                         if (prison_canseemount(td->td_ucred, mp) != 0) {
  474                                 nmp = TAILQ_NEXT(mp, mnt_list);
  475                                 continue;
  476                         }
  477 #ifdef MAC
  478                         if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
  479                                 nmp = TAILQ_NEXT(mp, mnt_list);
  480                                 continue;
  481                         }
  482 #endif
  483                         count++;
  484                         nmp = TAILQ_NEXT(mp, mnt_list);
  485                 }
  486                 mtx_unlock(&mountlist_mtx);
  487                 *countp = count;
  488                 return (0);
  489         }
  490 
  491         /*
  492          * They want the entire thing.
  493          *
  494          * Short-circuit the corner case of no room for anything, avoids
  495          * relocking below.
  496          */
  497         if (maxcount < 1) {
  498                 goto out;
  499         }
  500 
  501         mtx_lock(&mountlist_mtx);
  502         for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
  503                 if (prison_canseemount(td->td_ucred, mp) != 0) {
  504                         nmp = TAILQ_NEXT(mp, mnt_list);
  505                         continue;
  506                 }
  507 #ifdef MAC
  508                 if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
  509                         nmp = TAILQ_NEXT(mp, mnt_list);
  510                         continue;
  511                 }
  512 #endif
  513                 if (mode == MNT_WAIT) {
  514                         if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
  515                                 /*
  516                                  * If vfs_busy() failed, and MBF_NOWAIT
  517                                  * wasn't passed, then the mp is gone.
  518                                  * Furthermore, because of MBF_MNTLSTLOCK,
  519                                  * the mountlist_mtx was dropped.  We have
  520                                  * no other choice than to start over.
  521                                  */
  522                                 mtx_unlock(&mountlist_mtx);
  523                                 free(tofree, M_STATFS);
  524                                 goto restart;
  525                         }
  526                 } else {
  527                         if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
  528                                 nmp = TAILQ_NEXT(mp, mnt_list);
  529                                 continue;
  530                         }
  531                 }
  532                 sp = &mp->mnt_stat;
  533                 /*
  534                  * If MNT_NOWAIT is specified, do not refresh
  535                  * the fsstat cache.
  536                  */
  537                 if (mode != MNT_NOWAIT) {
  538                         error = VFS_STATFS(mp, sp);
  539                         if (error != 0) {
  540                                 mtx_lock(&mountlist_mtx);
  541                                 nmp = TAILQ_NEXT(mp, mnt_list);
  542                                 vfs_unbusy(mp);
  543                                 continue;
  544                         }
  545                 }
  546                 if (priv_check_cred_vfs_generation(td->td_ucred)) {
  547                         sptmp = malloc(sizeof(struct statfs), M_STATFS,
  548                             M_WAITOK);
  549                         *sptmp = *sp;
  550                         sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
  551                         prison_enforce_statfs(td->td_ucred, mp, sptmp);
  552                         sp = sptmp;
  553                 } else
  554                         sptmp = NULL;
  555                 if (bufseg == UIO_SYSSPACE) {
  556                         bcopy(sp, sfsp, sizeof(*sp));
  557                         free(sptmp, M_STATFS);
  558                 } else /* if (bufseg == UIO_USERSPACE) */ {
  559                         error = copyout(sp, sfsp, sizeof(*sp));
  560                         free(sptmp, M_STATFS);
  561                         if (error != 0) {
  562                                 vfs_unbusy(mp);
  563                                 return (error);
  564                         }
  565                 }
  566                 sfsp++;
  567                 count++;
  568 
  569                 if (count == maxcount) {
  570                         vfs_unbusy(mp);
  571                         goto out;
  572                 }
  573 
  574                 mtx_lock(&mountlist_mtx);
  575                 nmp = TAILQ_NEXT(mp, mnt_list);
  576                 vfs_unbusy(mp);
  577         }
  578         mtx_unlock(&mountlist_mtx);
  579 out:
  580         *countp = count;
  581         return (0);
  582 }
  583 
  584 #ifdef COMPAT_FREEBSD4
  585 /*
  586  * Get old format filesystem statistics.
  587  */
  588 static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
  589 
  590 #ifndef _SYS_SYSPROTO_H_
  591 struct freebsd4_statfs_args {
  592         char *path;
  593         struct ostatfs *buf;
  594 };
  595 #endif
  596 int
  597 freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
  598 {
  599         struct ostatfs osb;
  600         struct statfs *sfp;
  601         int error;
  602 
  603         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  604         error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
  605         if (error == 0) {
  606                 freebsd4_cvtstatfs(sfp, &osb);
  607                 error = copyout(&osb, uap->buf, sizeof(osb));
  608         }
  609         free(sfp, M_STATFS);
  610         return (error);
  611 }
  612 
  613 /*
  614  * Get filesystem statistics.
  615  */
  616 #ifndef _SYS_SYSPROTO_H_
  617 struct freebsd4_fstatfs_args {
  618         int fd;
  619         struct ostatfs *buf;
  620 };
  621 #endif
  622 int
  623 freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
  624 {
  625         struct ostatfs osb;
  626         struct statfs *sfp;
  627         int error;
  628 
  629         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  630         error = kern_fstatfs(td, uap->fd, sfp);
  631         if (error == 0) {
  632                 freebsd4_cvtstatfs(sfp, &osb);
  633                 error = copyout(&osb, uap->buf, sizeof(osb));
  634         }
  635         free(sfp, M_STATFS);
  636         return (error);
  637 }
  638 
  639 /*
  640  * Get statistics on all filesystems.
  641  */
  642 #ifndef _SYS_SYSPROTO_H_
  643 struct freebsd4_getfsstat_args {
  644         struct ostatfs *buf;
  645         long bufsize;
  646         int mode;
  647 };
  648 #endif
  649 int
  650 freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
  651 {
  652         struct statfs *buf, *sp;
  653         struct ostatfs osb;
  654         size_t count, size;
  655         int error;
  656 
  657         if (uap->bufsize < 0)
  658                 return (EINVAL);
  659         count = uap->bufsize / sizeof(struct ostatfs);
  660         if (count > SIZE_MAX / sizeof(struct statfs))
  661                 return (EINVAL);
  662         size = count * sizeof(struct statfs);
  663         error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
  664             uap->mode);
  665         if (error == 0)
  666                 td->td_retval[0] = count;
  667         if (size != 0) {
  668                 sp = buf;
  669                 while (count != 0 && error == 0) {
  670                         freebsd4_cvtstatfs(sp, &osb);
  671                         error = copyout(&osb, uap->buf, sizeof(osb));
  672                         sp++;
  673                         uap->buf++;
  674                         count--;
  675                 }
  676                 free(buf, M_STATFS);
  677         }
  678         return (error);
  679 }
  680 
  681 /*
  682  * Implement fstatfs() for (NFS) file handles.
  683  */
  684 #ifndef _SYS_SYSPROTO_H_
  685 struct freebsd4_fhstatfs_args {
  686         struct fhandle *u_fhp;
  687         struct ostatfs *buf;
  688 };
  689 #endif
  690 int
  691 freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
  692 {
  693         struct ostatfs osb;
  694         struct statfs *sfp;
  695         fhandle_t fh;
  696         int error;
  697 
  698         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
  699         if (error != 0)
  700                 return (error);
  701         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  702         error = kern_fhstatfs(td, fh, sfp);
  703         if (error == 0) {
  704                 freebsd4_cvtstatfs(sfp, &osb);
  705                 error = copyout(&osb, uap->buf, sizeof(osb));
  706         }
  707         free(sfp, M_STATFS);
  708         return (error);
  709 }
  710 
  711 /*
  712  * Convert a new format statfs structure to an old format statfs structure.
  713  */
  714 static void
  715 freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
  716 {
  717 
  718         statfs_scale_blocks(nsp, LONG_MAX);
  719         bzero(osp, sizeof(*osp));
  720         osp->f_bsize = nsp->f_bsize;
  721         osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
  722         osp->f_blocks = nsp->f_blocks;
  723         osp->f_bfree = nsp->f_bfree;
  724         osp->f_bavail = nsp->f_bavail;
  725         osp->f_files = MIN(nsp->f_files, LONG_MAX);
  726         osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
  727         osp->f_owner = nsp->f_owner;
  728         osp->f_type = nsp->f_type;
  729         osp->f_flags = nsp->f_flags;
  730         osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
  731         osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
  732         osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
  733         osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
  734         strlcpy(osp->f_fstypename, nsp->f_fstypename,
  735             MIN(MFSNAMELEN, OMFSNAMELEN));
  736         strlcpy(osp->f_mntonname, nsp->f_mntonname,
  737             MIN(MNAMELEN, OMNAMELEN));
  738         strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
  739             MIN(MNAMELEN, OMNAMELEN));
  740         osp->f_fsid = nsp->f_fsid;
  741 }
  742 #endif /* COMPAT_FREEBSD4 */
  743 
  744 #if defined(COMPAT_FREEBSD11)
  745 /*
  746  * Get old format filesystem statistics.
  747  */
  748 static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
  749 
  750 int
  751 freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
  752 {
  753         struct freebsd11_statfs osb;
  754         struct statfs *sfp;
  755         int error;
  756 
  757         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  758         error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
  759         if (error == 0) {
  760                 freebsd11_cvtstatfs(sfp, &osb);
  761                 error = copyout(&osb, uap->buf, sizeof(osb));
  762         }
  763         free(sfp, M_STATFS);
  764         return (error);
  765 }
  766 
  767 /*
  768  * Get filesystem statistics.
  769  */
  770 int
  771 freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
  772 {
  773         struct freebsd11_statfs osb;
  774         struct statfs *sfp;
  775         int error;
  776 
  777         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  778         error = kern_fstatfs(td, uap->fd, sfp);
  779         if (error == 0) {
  780                 freebsd11_cvtstatfs(sfp, &osb);
  781                 error = copyout(&osb, uap->buf, sizeof(osb));
  782         }
  783         free(sfp, M_STATFS);
  784         return (error);
  785 }
  786 
  787 /*
  788  * Get statistics on all filesystems.
  789  */
  790 int
  791 freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
  792 {
  793         struct freebsd11_statfs osb;
  794         struct statfs *buf, *sp;
  795         size_t count, size;
  796         int error;
  797 
  798         count = uap->bufsize / sizeof(struct ostatfs);
  799         size = count * sizeof(struct statfs);
  800         error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
  801             uap->mode);
  802         if (error == 0)
  803                 td->td_retval[0] = count;
  804         if (size > 0) {
  805                 sp = buf;
  806                 while (count > 0 && error == 0) {
  807                         freebsd11_cvtstatfs(sp, &osb);
  808                         error = copyout(&osb, uap->buf, sizeof(osb));
  809                         sp++;
  810                         uap->buf++;
  811                         count--;
  812                 }
  813                 free(buf, M_STATFS);
  814         }
  815         return (error);
  816 }
  817 
  818 /*
  819  * Implement fstatfs() for (NFS) file handles.
  820  */
  821 int
  822 freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
  823 {
  824         struct freebsd11_statfs osb;
  825         struct statfs *sfp;
  826         fhandle_t fh;
  827         int error;
  828 
  829         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
  830         if (error)
  831                 return (error);
  832         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
  833         error = kern_fhstatfs(td, fh, sfp);
  834         if (error == 0) {
  835                 freebsd11_cvtstatfs(sfp, &osb);
  836                 error = copyout(&osb, uap->buf, sizeof(osb));
  837         }
  838         free(sfp, M_STATFS);
  839         return (error);
  840 }
  841 
  842 /*
  843  * Convert a new format statfs structure to an old format statfs structure.
  844  */
  845 static void
  846 freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
  847 {
  848 
  849         bzero(osp, sizeof(*osp));
  850         osp->f_version = FREEBSD11_STATFS_VERSION;
  851         osp->f_type = nsp->f_type;
  852         osp->f_flags = nsp->f_flags;
  853         osp->f_bsize = nsp->f_bsize;
  854         osp->f_iosize = nsp->f_iosize;
  855         osp->f_blocks = nsp->f_blocks;
  856         osp->f_bfree = nsp->f_bfree;
  857         osp->f_bavail = nsp->f_bavail;
  858         osp->f_files = nsp->f_files;
  859         osp->f_ffree = nsp->f_ffree;
  860         osp->f_syncwrites = nsp->f_syncwrites;
  861         osp->f_asyncwrites = nsp->f_asyncwrites;
  862         osp->f_syncreads = nsp->f_syncreads;
  863         osp->f_asyncreads = nsp->f_asyncreads;
  864         osp->f_namemax = nsp->f_namemax;
  865         osp->f_owner = nsp->f_owner;
  866         osp->f_fsid = nsp->f_fsid;
  867         strlcpy(osp->f_fstypename, nsp->f_fstypename,
  868             MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
  869         strlcpy(osp->f_mntonname, nsp->f_mntonname,
  870             MIN(MNAMELEN, sizeof(osp->f_mntonname)));
  871         strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
  872             MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
  873 }
  874 #endif /* COMPAT_FREEBSD11 */
  875 
  876 /*
  877  * Change current working directory to a given file descriptor.
       *
       * If a filesystem is mounted on the directory the descriptor refers
       * to, the root vnode of that mounted filesystem is used instead (this
       * is repeated while the current vnode has v_mountedhere set).
       *
       * Returns 0 on success or an errno value from getvnode_path(),
       * change_dir() or VFS_ROOT().
  878  */
  879 #ifndef _SYS_SYSPROTO_H_
  880 struct fchdir_args {
  881         int     fd;
  882 };
  883 #endif
  884 int
  885 sys_fchdir(struct thread *td, struct fchdir_args *uap)
  886 {
  887         struct vnode *vp, *tdp;
  888         struct mount *mp;
  889         struct file *fp;
  890         int error;
  891 
  892         AUDIT_ARG_FD(uap->fd);
  893         error = getvnode_path(td, uap->fd, &cap_fchdir_rights,
  894             &fp);
  895         if (error != 0)
  896                 return (error);
  897         vp = fp->f_vnode;
              /*
               * Hold the vnode on our own (vrefact) and let go of the file;
               * only vp is used from here on.
               */
  898         vrefact(vp);
  899         fdrop(fp, td);
              /* change_dir() asserts that the vnode is locked. */
  900         vn_lock(vp, LK_SHARED | LK_RETRY);
  901         AUDIT_ARG_VNODE1(vp);
  902         error = change_dir(vp, td);
              /*
               * Step across mount points: replace vp with the root vnode of
               * whatever is mounted on it.  When vfs_busy() fails the loop
               * condition is simply re-evaluated, re-reading v_mountedhere.
               */
  903         while (!error && (mp = vp->v_mountedhere) != NULL) {
  904                 if (vfs_busy(mp, 0))
  905                         continue;
  906                 error = VFS_ROOT(mp, LK_SHARED, &tdp);
  907                 vfs_unbusy(mp);
  908                 if (error != 0)
  909                         break;
                      /* Done with the covered vnode; continue with the root. */
  910                 vput(vp);
  911                 vp = tdp;
  912         }
  913         if (error != 0) {
  914                 vput(vp);
  915                 return (error);
  916         }
              /* Success: unlock and hand the vnode to pwd_chdir(). */
  917         VOP_UNLOCK(vp);
  918         pwd_chdir(td, vp);
  919         return (0);
  920 }
  921 
  922 /*
  923  * Change current working directory (``.'').
  924  */
  925 #ifndef _SYS_SYSPROTO_H_
  926 struct chdir_args {
  927         char    *path;
  928 };
  929 #endif
  930 int
  931 sys_chdir(struct thread *td, struct chdir_args *uap)
  932 {
  933 
  934         return (kern_chdir(td, uap->path, UIO_USERSPACE));
  935 }
  936 
  937 int
  938 kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg)
  939 {
  940         struct nameidata nd;
  941         int error;
  942 
  943         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
  944             pathseg, path, td);
  945         if ((error = namei(&nd)) != 0)
  946                 return (error);
  947         if ((error = change_dir(nd.ni_vp, td)) != 0) {
  948                 vput(nd.ni_vp);
  949                 NDFREE_NOTHING(&nd);
  950                 return (error);
  951         }
  952         VOP_UNLOCK(nd.ni_vp);
  953         NDFREE_NOTHING(&nd);
  954         pwd_chdir(td, nd.ni_vp);
  955         return (0);
  956 }
  957 
  958 static int unprivileged_chroot = 0;
  959 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW,
  960     &unprivileged_chroot, 0,
  961     "Unprivileged processes can use chroot(2)");
  962 /*
  963  * Change notion of root (``/'') directory.
  964  */
  965 #ifndef _SYS_SYSPROTO_H_
  966 struct chroot_args {
  967         char    *path;
  968 };
  969 #endif
  970 int
  971 sys_chroot(struct thread *td, struct chroot_args *uap)
  972 {
  973         struct nameidata nd;
  974         struct proc *p;
  975         int error;
  976 
  977         error = priv_check(td, PRIV_VFS_CHROOT);
  978         if (error != 0) {
  979                 p = td->td_proc;
  980                 PROC_LOCK(p);
  981                 if (unprivileged_chroot == 0 ||
  982                     (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) {
  983                         PROC_UNLOCK(p);
  984                         return (error);
  985                 }
  986                 PROC_UNLOCK(p);
  987         }
  988         NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
  989             UIO_USERSPACE, uap->path, td);
  990         error = namei(&nd);
  991         if (error != 0)
  992                 goto error;
  993         error = change_dir(nd.ni_vp, td);
  994         if (error != 0)
  995                 goto e_vunlock;
  996 #ifdef MAC
  997         error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
  998         if (error != 0)
  999                 goto e_vunlock;
 1000 #endif
 1001         VOP_UNLOCK(nd.ni_vp);
 1002         error = pwd_chroot(td, nd.ni_vp);
 1003         vrele(nd.ni_vp);
 1004         NDFREE_NOTHING(&nd);
 1005         return (error);
 1006 e_vunlock:
 1007         vput(nd.ni_vp);
 1008 error:
 1009         NDFREE_NOTHING(&nd);
 1010         return (error);
 1011 }
 1012 
 1013 /*
 1014  * Common routine for chroot and chdir.  Callers must provide a locked vnode
 1015  * instance.
 1016  */
 1017 int
 1018 change_dir(struct vnode *vp, struct thread *td)
 1019 {
 1020 #ifdef MAC
 1021         int error;
 1022 #endif
 1023 
 1024         ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 1025         if (vp->v_type != VDIR)
 1026                 return (ENOTDIR);
 1027 #ifdef MAC
 1028         error = mac_vnode_check_chdir(td->td_ucred, vp);
 1029         if (error != 0)
 1030                 return (error);
 1031 #endif
 1032         return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 1033 }
 1034 
 1035 static __inline void
 1036 flags_to_rights(int flags, cap_rights_t *rightsp)
 1037 {
 1038         if (flags & O_EXEC) {
 1039                 cap_rights_set_one(rightsp, CAP_FEXECVE);
 1040                 if (flags & O_PATH)
 1041                         return;
 1042         } else {
 1043                 switch ((flags & O_ACCMODE)) {
 1044                 case O_RDONLY:
 1045                         cap_rights_set_one(rightsp, CAP_READ);
 1046                         break;
 1047                 case O_RDWR:
 1048                         cap_rights_set_one(rightsp, CAP_READ);
 1049                         /* FALLTHROUGH */
 1050                 case O_WRONLY:
 1051                         cap_rights_set_one(rightsp, CAP_WRITE);
 1052                         if (!(flags & (O_APPEND | O_TRUNC)))
 1053                                 cap_rights_set_one(rightsp, CAP_SEEK);
 1054                         break;
 1055                 }
 1056         }
 1057 
 1058         if (flags & O_CREAT)
 1059                 cap_rights_set_one(rightsp, CAP_CREATE);
 1060 
 1061         if (flags & O_TRUNC)
 1062                 cap_rights_set_one(rightsp, CAP_FTRUNCATE);
 1063 
 1064         if (flags & (O_SYNC | O_FSYNC))
 1065                 cap_rights_set_one(rightsp, CAP_FSYNC);
 1066 
 1067         if (flags & (O_EXLOCK | O_SHLOCK))
 1068                 cap_rights_set_one(rightsp, CAP_FLOCK);
 1069 }
 1070 
 1071 /*
 1072  * Check permissions, allocate an open file structure, and call the device
 1073  * open routine if any.
 1074  */
 1075 #ifndef _SYS_SYSPROTO_H_
 1076 struct open_args {
 1077         char    *path;
 1078         int     flags;
 1079         int     mode;
 1080 };
 1081 #endif
 1082 int
 1083 sys_open(struct thread *td, struct open_args *uap)
 1084 {
 1085 
 1086         return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 1087             uap->flags, uap->mode));
 1088 }
 1089 
 1090 #ifndef _SYS_SYSPROTO_H_
 1091 struct openat_args {
 1092         int     fd;
 1093         char    *path;
 1094         int     flag;
 1095         int     mode;
 1096 };
 1097 #endif
 1098 int
 1099 sys_openat(struct thread *td, struct openat_args *uap)
 1100 {
 1101 
 1102         AUDIT_ARG_FD(uap->fd);
 1103         return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 1104             uap->mode));
 1105 }
 1106 
      /*
       * Common implementation behind sys_open(), sys_openat() and ocreat().
       *
       * Resolves 'path' (held in the 'pathseg' address space, relative to
       * 'fd') with vn_open_cred(), attaches the resulting vnode to a newly
       * allocated file structure and installs a descriptor for it.
       *
       * On success the descriptor number is stored in td->td_retval[0] and 0
       * is returned.  On failure an errno value is returned and the file
       * structure is released through falloc_abort().
       */
 1107 int
 1108 kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
 1109     int flags, int mode)
 1110 {
 1111         struct proc *p = td->td_proc;
 1112         struct filedesc *fdp;
 1113         struct pwddesc *pdp;
 1114         struct file *fp;
 1115         struct vnode *vp;
 1116         struct nameidata nd;
 1117         cap_rights_t rights;
 1118         int cmode, error, indx;
 1119 
              /* indx stays -1 until a descriptor has been produced. */
 1120         indx = -1;
 1121         fdp = p->p_fd;
 1122         pdp = p->p_pd;
 1123 
 1124         AUDIT_ARG_FFLAGS(flags);
 1125         AUDIT_ARG_MODE(mode);
              /* Rights for the lookup: CAP_LOOKUP plus whatever the flags imply. */
 1126         cap_rights_init_one(&rights, CAP_LOOKUP);
 1127         flags_to_rights(flags, &rights);
 1128 
 1129         /*
 1130          * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 1131          * may be specified.  On the other hand, for O_PATH any mode
 1132          * except O_EXEC is ignored.
 1133          */
 1134         if ((flags & O_PATH) != 0) {
 1135                 flags &= ~(O_CREAT | O_ACCMODE);
 1136         } else if ((flags & O_EXEC) != 0) {
 1137                 if (flags & O_ACCMODE)
 1138                         return (EINVAL);
 1139         } else if ((flags & O_ACCMODE) == O_ACCMODE) {
 1140                 return (EINVAL);
 1141         } else {
 1142                 flags = FFLAGS(flags);
 1143         }
 1144 
 1145         /*
 1146          * Allocate a file structure. The descriptor to reference it
 1147          * is allocated and used by finstall_refed() below.
 1148          */
 1149         error = falloc_noinstall(td, &fp);
 1150         if (error != 0)
 1151                 return (error);
 1152         /* Set the flags early so the finit in devfs can pick them up. */
 1153         fp->f_flag = flags & FMASK;
              /* Creation mode: apply pd_cmask, keep ALLPERMS bits, clear S_ISTXT. */
 1154         cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT;
 1155         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | WANTIOCTLCAPS,
 1156             pathseg, path, fd, &rights, td);
 1157         td->td_dupfd = -1;              /* XXX check for fdopen */
 1158         error = vn_open_cred(&nd, &flags, cmode, VN_OPEN_WANTIOCTLCAPS,
 1159             td->td_ucred, fp);
 1160         if (error != 0) {
 1161                 /*
 1162                  * If the vn_open replaced the method vector, something
 1163                  * wonderous happened deep below and we just pass it up
 1164                  * pretending we know what we do.
 1165                  */
 1166                 if (error == ENXIO && fp->f_ops != &badfileops) {
 1167                         MPASS((flags & O_PATH) == 0);
 1168                         goto success;
 1169                 }
 1170 
 1171                 /*
 1172                  * Handle special fdopen() case. bleh.
 1173                  *
 1174                  * Don't do this for relative (capability) lookups; we don't
 1175                  * understand exactly what would happen, and we don't think
 1176                  * that it ever should.
 1177                  */
 1178                 if ((nd.ni_resflags & NIRES_STRICTREL) == 0 &&
 1179                     (error == ENODEV || error == ENXIO) &&
 1180                     td->td_dupfd >= 0) {
 1181                         error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
 1182                             &indx);
 1183                         if (error == 0)
 1184                                 goto success;
 1185                 }
 1186 
 1187                 goto bad;
 1188         }
 1189         td->td_dupfd = 0;
 1190         NDFREE(&nd, NDF_ONLY_PNBUF);
 1191         vp = nd.ni_vp;
 1192 
 1193         /*
 1194          * Store the vnode, for any f_type. Typically, the vnode use
 1195          * count is decremented by direct call to vn_closefile() for
 1196          * files that switched type in the cdevsw fdopen() method.
 1197          */
 1198         fp->f_vnode = vp;
 1199 
 1200         /*
 1201          * If the file wasn't claimed by devfs bind it to the normal
 1202          * vnode operations here.
 1203          */
 1204         if (fp->f_ops == &badfileops) {
 1205                 KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0,
 1206                     ("Unexpected fifo fp %p vp %p", fp, vp));
 1207                 if ((flags & O_PATH) != 0) {
 1208                         finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED),
 1209                             DTYPE_VNODE, NULL, &path_fileops);
 1210                 } else {
 1211                         finit_vnode(fp, flags, NULL, &vnops);
 1212                 }
 1213         }
 1214 
              /* The vnode is unlocked before truncating through the file ops. */
 1215         VOP_UNLOCK(vp);
 1216         if (flags & O_TRUNC) {
 1217                 error = fo_truncate(fp, 0, td->td_ucred, td);
 1218                 if (error != 0)
 1219                         goto bad;
 1220         }
 1221 success:
 1222         /*
 1223          * If we haven't already installed the FD (for dupfdopen), do so now.
 1224          */
 1225         if (indx == -1) {
 1226                 struct filecaps *fcaps;
 1227 
 1228 #ifdef CAPABILITIES
 1229                 if ((nd.ni_resflags & NIRES_STRICTREL) != 0)
 1230                         fcaps = &nd.ni_filecaps;
 1231                 else
 1232 #endif
 1233                         fcaps = NULL;
 1234                 error = finstall_refed(td, fp, &indx, flags, fcaps);
 1235                 /* On success finstall_refed() consumes fcaps. */
 1236                 if (error != 0) {
 1237                         goto bad;
 1238                 }
 1239         } else {
                      /*
                       * indx was filled in by dupfdopen(), so a descriptor
                       * already exists; the file structure allocated above
                       * is released unused.
                       */
 1240                 NDFREE_IOCTLCAPS(&nd);
 1241                 falloc_abort(td, fp);
 1242         }
 1243 
 1244         td->td_retval[0] = indx;
 1245         return (0);
 1246 bad:
              /* Every path to 'bad' is taken before a descriptor exists. */
 1247         KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 1248         NDFREE_IOCTLCAPS(&nd);
 1249         falloc_abort(td, fp);
 1250         return (error);
 1251 }
 1252 
 1253 #ifdef COMPAT_43
 1254 /*
 1255  * Create a file.
 1256  */
 1257 #ifndef _SYS_SYSPROTO_H_
 1258 struct ocreat_args {
 1259         char    *path;
 1260         int     mode;
 1261 };
 1262 #endif
 1263 int
 1264 ocreat(struct thread *td, struct ocreat_args *uap)
 1265 {
 1266 
 1267         return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 1268             O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 1269 }
 1270 #endif /* COMPAT_43 */
 1271 
 1272 /*
 1273  * Create a special file.
 1274  */
 1275 #ifndef _SYS_SYSPROTO_H_
 1276 struct mknodat_args {
 1277         int     fd;
 1278         char    *path;
 1279         mode_t  mode;
 1280         dev_t   dev;
 1281 };
 1282 #endif
 1283 int
 1284 sys_mknodat(struct thread *td, struct mknodat_args *uap)
 1285 {
 1286 
 1287         return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 1288             uap->dev));
 1289 }
 1290 
 1291 #if defined(COMPAT_FREEBSD11)
 1292 int
 1293 freebsd11_mknod(struct thread *td,
 1294     struct freebsd11_mknod_args *uap)
 1295 {
 1296 
 1297         return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 1298             uap->mode, uap->dev));
 1299 }
 1300 
 1301 int
 1302 freebsd11_mknodat(struct thread *td,
 1303     struct freebsd11_mknodat_args *uap)
 1304 {
 1305 
 1306         return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 1307             uap->dev));
 1308 }
 1309 #endif /* COMPAT_FREEBSD11 */
 1310 
      /*
       * Create a device node or a whiteout entry named 'path' (held in the
       * 'pathseg' address space, resolved relative to 'fd').
       *
       * The file type in 'mode' selects the action:
       *   S_IFCHR, S_IFBLK  need PRIV_VFS_MKNOD_DEV and dev != VNOVAL;
       *   S_IFWHT           needs PRIV_VFS_MKNOD_WHT;
       *   S_IFIFO           with dev == 0 is handed to kern_mkfifoat();
       *   anything else     fails with EINVAL.
       *
       * Returns 0 on success, EEXIST when the name already exists, or
       * another errno value.
       */
 1311 int
 1312 kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
 1313     int mode, dev_t dev)
 1314 {
 1315         struct vnode *vp;
 1316         struct mount *mp;
 1317         struct vattr vattr;
 1318         struct nameidata nd;
 1319         int error, whiteout = 0;
 1320 
 1321         AUDIT_ARG_MODE(mode);
 1322         AUDIT_ARG_DEV(dev);
              /* Validate the requested type and privileges before any lookup. */
 1323         switch (mode & S_IFMT) {
 1324         case S_IFCHR:
 1325         case S_IFBLK:
 1326                 error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 1327                 if (error == 0 && dev == VNOVAL)
 1328                         error = EINVAL;
 1329                 break;
 1330         case S_IFWHT:
 1331                 error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 1332                 break;
 1333         case S_IFIFO:
 1334                 if (dev == 0)
 1335                         return (kern_mkfifoat(td, fd, path, pathseg, mode));
 1336                 /* FALLTHROUGH */
 1337         default:
 1338                 error = EINVAL;
 1339                 break;
 1340         }
 1341         if (error != 0)
 1342                 return (error);
 1343         NDPREINIT(&nd);
              /* The lookup is redone from here whenever a retry is required. */
 1344 restart:
 1345         bwillwrite();
 1346         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 1347             NOCACHE, pathseg, path, fd, &cap_mknodat_rights,
 1348             td);
 1349         if ((error = namei(&nd)) != 0)
 1350                 return (error);
 1351         vp = nd.ni_vp;
 1352         if (vp != NULL) {
                      /* The name already exists: release both vnodes, fail. */
 1353                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1354                 if (vp == nd.ni_dvp)
 1355                         vrele(nd.ni_dvp);
 1356                 else
 1357                         vput(nd.ni_dvp);
 1358                 vrele(vp);
 1359                 return (EEXIST);
 1360         } else {
 1361                 VATTR_NULL(&vattr);
 1362                 vattr.va_mode = (mode & ALLPERMS) &
 1363                     ~td->td_proc->p_pd->pd_cmask;
 1364                 vattr.va_rdev = dev;
 1365                 whiteout = 0;
 1366 
 1367                 switch (mode & S_IFMT) {
 1368                 case S_IFCHR:
 1369                         vattr.va_type = VCHR;
 1370                         break;
 1371                 case S_IFBLK:
 1372                         vattr.va_type = VBLK;
 1373                         break;
 1374                 case S_IFWHT:
 1375                         whiteout = 1;
 1376                         break;
 1377                 default:
                              /* Other types were rejected by the first switch. */
 1378                         panic("kern_mknod: invalid mode");
 1379                 }
 1380         }
              /*
               * If vn_start_write() refuses with V_NOWAIT, release the
               * directory, call it again with V_XSLEEP | PCATCH and then redo
               * the lookup from scratch.
               */
 1381         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 1382                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1383                 vput(nd.ni_dvp);
 1384                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 1385                         return (error);
 1386                 goto restart;
 1387         }
 1388 #ifdef MAC
              /* error is 0 here (namei() succeeded); no MAC check for whiteouts. */
 1389         if (error == 0 && !whiteout)
 1390                 error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 1391                     &nd.ni_cnd, &vattr);
 1392 #endif
 1393         if (error == 0) {
 1394                 if (whiteout)
 1395                         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 1396                 else {
 1397                         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 1398                                                 &nd.ni_cnd, &vattr);
 1399                 }
 1400         }
              /* ni_vp is passed along only when VOP_MKNOD() produced a vnode. */
 1401         VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL,
 1402             true);
 1403         vn_finished_write(mp);
 1404         NDFREE(&nd, NDF_ONLY_PNBUF);
 1405         if (error == ERELOOKUP)
 1406                 goto restart;
 1407         return (error);
 1408 }
 1409 
 1410 /*
 1411  * Create a named pipe.
 1412  */
 1413 #ifndef _SYS_SYSPROTO_H_
 1414 struct mkfifo_args {
 1415         char    *path;
 1416         int     mode;
 1417 };
 1418 #endif
 1419 int
 1420 sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
 1421 {
 1422 
 1423         return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 1424             uap->mode));
 1425 }
 1426 
 1427 #ifndef _SYS_SYSPROTO_H_
 1428 struct mkfifoat_args {
 1429         int     fd;
 1430         char    *path;
 1431         mode_t  mode;
 1432 };
 1433 #endif
 1434 int
 1435 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
 1436 {
 1437 
 1438         return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
 1439             uap->mode));
 1440 }
 1441 
      /*
       * Create a FIFO named 'path' (held in the 'pathseg' address space,
       * resolved relative to 'fd').  The permission bits come from 'mode'
       * masked with ALLPERMS and the process pd_cmask.
       *
       * Returns 0 on success, EEXIST when the name already exists, or
       * another errno value.
       */
 1442 int
 1443 kern_mkfifoat(struct thread *td, int fd, const char *path,
 1444     enum uio_seg pathseg, int mode)
 1445 {
 1446         struct mount *mp;
 1447         struct vattr vattr;
 1448         struct nameidata nd;
 1449         int error;
 1450 
 1451         AUDIT_ARG_MODE(mode);
 1452         NDPREINIT(&nd);
              /* The lookup is redone from here whenever a retry is required. */
 1453 restart:
 1454         bwillwrite();
 1455         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 1456             NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights,
 1457             td);
 1458         if ((error = namei(&nd)) != 0)
 1459                 return (error);
 1460         if (nd.ni_vp != NULL) {
                      /* The name already exists: release both vnodes, fail. */
 1461                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1462                 if (nd.ni_vp == nd.ni_dvp)
 1463                         vrele(nd.ni_dvp);
 1464                 else
 1465                         vput(nd.ni_dvp);
 1466                 vrele(nd.ni_vp);
 1467                 return (EEXIST);
 1468         }
              /*
               * If vn_start_write() refuses with V_NOWAIT, release the
               * directory, call it again with V_XSLEEP | PCATCH and then redo
               * the lookup from scratch.
               */
 1469         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 1470                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1471                 vput(nd.ni_dvp);
 1472                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 1473                         return (error);
 1474                 goto restart;
 1475         }
 1476         VATTR_NULL(&vattr);
 1477         vattr.va_type = VFIFO;
 1478         vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask;
 1479 #ifdef MAC
 1480         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 1481             &vattr);
 1482         if (error != 0)
 1483                 goto out;
 1484 #endif
 1485         error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 1486 #ifdef MAC
 1487 out:
 1488 #endif
              /* ni_vp is passed along only when the node was created. */
 1489         VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 1490         vn_finished_write(mp);
 1491         NDFREE(&nd, NDF_ONLY_PNBUF);
 1492         if (error == ERELOOKUP)
 1493                 goto restart;
 1494         return (error);
 1495 }
 1496 
 1497 /*
 1498  * Make a hard file link.
 1499  */
 1500 #ifndef _SYS_SYSPROTO_H_
 1501 struct link_args {
 1502         char    *path;
 1503         char    *link;
 1504 };
 1505 #endif
 1506 int
 1507 sys_link(struct thread *td, struct link_args *uap)
 1508 {
 1509 
 1510         return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
 1511             UIO_USERSPACE, AT_SYMLINK_FOLLOW));
 1512 }
 1513 
 1514 #ifndef _SYS_SYSPROTO_H_
 1515 struct linkat_args {
 1516         int     fd1;
 1517         char    *path1;
 1518         int     fd2;
 1519         char    *path2;
 1520         int     flag;
 1521 };
 1522 #endif
 1523 int
 1524 sys_linkat(struct thread *td, struct linkat_args *uap)
 1525 {
 1526 
 1527         return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
 1528             UIO_USERSPACE, uap->flag));
 1529 }
 1530 
 1531 int hardlink_check_uid = 0;
 1532 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
 1533     &hardlink_check_uid, 0,
 1534     "Unprivileged processes cannot create hard links to files owned by other "
 1535     "users");
 1536 static int hardlink_check_gid = 0;
 1537 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
 1538     &hardlink_check_gid, 0,
 1539     "Unprivileged processes cannot create hard links to files owned by other "
 1540     "groups");
 1541 
 1542 static int
 1543 can_hardlink(struct vnode *vp, struct ucred *cred)
 1544 {
 1545         struct vattr va;
 1546         int error;
 1547 
 1548         if (!hardlink_check_uid && !hardlink_check_gid)
 1549                 return (0);
 1550 
 1551         error = VOP_GETATTR(vp, &va, cred);
 1552         if (error != 0)
 1553                 return (error);
 1554 
 1555         if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 1556                 error = priv_check_cred(cred, PRIV_VFS_LINK);
 1557                 if (error != 0)
 1558                         return (error);
 1559         }
 1560 
 1561         if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 1562                 error = priv_check_cred(cred, PRIV_VFS_LINK);
 1563                 if (error != 0)
 1564                         return (error);
 1565         }
 1566 
 1567         return (0);
 1568 }
 1569 
 1570 int
 1571 kern_linkat(struct thread *td, int fd1, int fd2, const char *path1,
 1572     const char *path2, enum uio_seg segflag, int flag)
 1573 {
 1574         struct nameidata nd;
 1575         int error;
 1576 
 1577         if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH |
 1578             AT_EMPTY_PATH)) != 0)
 1579                 return (EINVAL);
 1580 
 1581         NDPREINIT(&nd);
 1582         do {
 1583                 bwillwrite();
 1584                 NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag,
 1585                     AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH),
 1586                     segflag, path1, fd1, &cap_linkat_source_rights, td);
 1587                 if ((error = namei(&nd)) != 0)
 1588                         return (error);
 1589                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1590                 if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) {
 1591                         error = priv_check(td, PRIV_VFS_FHOPEN);
 1592                         if (error != 0) {
 1593                                 vrele(nd.ni_vp);
 1594                                 return (error);
 1595                         }
 1596                 }
 1597                 error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
 1598         } while (error ==  EAGAIN || error == ERELOOKUP);
 1599         return (error);
 1600 }
 1601 
      /*
       * Create a hard link to vnode 'vp' under the new name 'path' (held in
       * the 'segflag' address space, resolved relative to 'fd').
       *
       * Every return path gives up the vnode reference passed in by the
       * caller (via vrele(), vput() or VOP_VPUT_PAIR()).
       *
       * Returns 0 on success, EPERM when vp is a directory, EEXIST when the
       * target name exists, EXDEV when the target directory is on another
       * mount, or another errno value.  EAGAIN is returned when the vnode
       * could not be locked or the write could not be started without
       * waiting; kern_linkat() repeats the operation on EAGAIN.
       */
 1602 static int
 1603 kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
 1604     enum uio_seg segflag)
 1605 {
 1606         struct nameidata nd;
 1607         struct mount *mp;
 1608         int error;
 1609 
 1610         if (vp->v_type == VDIR) {
 1611                 vrele(vp);
 1612                 return (EPERM);         /* POSIX */
 1613         }
 1614         NDINIT_ATRIGHTS(&nd, CREATE,
 1615             LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd,
 1616             &cap_linkat_target_rights, td);
 1617         if ((error = namei(&nd)) == 0) {
 1618                 if (nd.ni_vp != NULL) {
                              /* The target name already exists. */
 1619                         NDFREE(&nd, NDF_ONLY_PNBUF);
 1620                         if (nd.ni_dvp == nd.ni_vp)
 1621                                 vrele(nd.ni_dvp);
 1622                         else
 1623                                 vput(nd.ni_dvp);
 1624                         vrele(nd.ni_vp);
 1625                         vrele(vp);
 1626                         return (EEXIST);
 1627                 } else if (nd.ni_dvp->v_mount != vp->v_mount) {
 1628                         /*
 1629                          * Cross-device link.  No need to recheck
 1630                          * vp->v_type, since it cannot change, except
 1631                          * to VBAD.
 1632                          */
 1633                         NDFREE(&nd, NDF_ONLY_PNBUF);
 1634                         vput(nd.ni_dvp);
 1635                         vrele(vp);
 1636                         return (EXDEV);
 1637                 } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
                              /* Policy checks run with vp exclusively locked. */
 1638                         error = can_hardlink(vp, td->td_ucred);
 1639 #ifdef MAC
 1640                         if (error == 0)
 1641                                 error = mac_vnode_check_link(td->td_ucred,
 1642                                     nd.ni_dvp, vp, &nd.ni_cnd);
 1643 #endif
 1644                         if (error != 0) {
 1645                                 vput(vp);
 1646                                 vput(nd.ni_dvp);
 1647                                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1648                                 return (error);
 1649                         }
 1650                         error = vn_start_write(vp, &mp, V_NOWAIT);
 1651                         if (error != 0) {
                                      /*
                                       * Release everything, call
                                       * vn_start_write() again with
                                       * V_XSLEEP | PCATCH, then ask the
                                       * caller to start over.
                                       */
 1652                                 vput(vp);
 1653                                 vput(nd.ni_dvp);
 1654                                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1655                                 error = vn_start_write(NULL, &mp,
 1656                                     V_XSLEEP | PCATCH);
 1657                                 if (error != 0)
 1658                                         return (error);
 1659                                 return (EAGAIN);
 1660                         }
 1661                         error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 1662                         VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
 1663                         vn_finished_write(mp);
 1664                         NDFREE(&nd, NDF_ONLY_PNBUF);
                              /* vp was handed to VOP_VPUT_PAIR(); skip the vrele() below. */
 1665                         vp = NULL;
 1666                 } else {
                              /* vn_lock() failed: let the caller retry. */
 1667                         vput(nd.ni_dvp);
 1668                         NDFREE(&nd, NDF_ONLY_PNBUF);
 1669                         vrele(vp);
 1670                         return (EAGAIN);
 1671                 }
 1672         }
              /* Reached with vp != NULL only when namei() failed. */
 1673         if (vp != NULL)
 1674                 vrele(vp);
 1675         return (error);
 1676 }
 1677 
 1678 /*
 1679  * Make a symbolic link.
 1680  */
 1681 #ifndef _SYS_SYSPROTO_H_
 1682 struct symlink_args {
 1683         char    *path;
 1684         char    *link;
 1685 };
 1686 #endif
 1687 int
 1688 sys_symlink(struct thread *td, struct symlink_args *uap)
 1689 {
 1690 
 1691         return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
 1692             UIO_USERSPACE));
 1693 }
 1694 
 1695 #ifndef _SYS_SYSPROTO_H_
 1696 struct symlinkat_args {
 1697         char    *path;
 1698         int     fd;
 1699         char    *path2;
 1700 };
 1701 #endif
 1702 int
 1703 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
 1704 {
 1705 
 1706         return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
 1707             UIO_USERSPACE));
 1708 }
 1709 
      /*
       * Create a symbolic link named 'path2' (resolved relative to 'fd')
       * whose target string is 'path1'.  'segflg' says where both strings
       * live.
       *
       * When the strings are not in kernel space, path1 is first copied into
       * a buffer taken from namei_zone; that buffer is released at 'out',
       * which is why every exit after the allocation goes through that label.
       *
       * Returns 0 on success, EEXIST when the name already exists, or
       * another errno value.
       */
 1710 int
 1711 kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2,
 1712     enum uio_seg segflg)
 1713 {
 1714         struct mount *mp;
 1715         struct vattr vattr;
 1716         const char *syspath;
 1717         char *tmppath;
 1718         struct nameidata nd;
 1719         int error;
 1720 
              /* tmppath is assigned (and later freed) only in the else branch. */
 1721         if (segflg == UIO_SYSSPACE) {
 1722                 syspath = path1;
 1723         } else {
 1724                 tmppath = uma_zalloc(namei_zone, M_WAITOK);
 1725                 if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0)
 1726                         goto out;
 1727                 syspath = tmppath;
 1728         }
 1729         AUDIT_ARG_TEXT(syspath);
 1730         NDPREINIT(&nd);
              /* The lookup is redone from here whenever a retry is required. */
 1731 restart:
 1732         bwillwrite();
 1733         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 1734             NOCACHE, segflg, path2, fd, &cap_symlinkat_rights,
 1735             td);
 1736         if ((error = namei(&nd)) != 0)
 1737                 goto out;
 1738         if (nd.ni_vp) {
                      /* The name already exists: release both vnodes, fail. */
 1739                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1740                 if (nd.ni_vp == nd.ni_dvp)
 1741                         vrele(nd.ni_dvp);
 1742                 else
 1743                         vput(nd.ni_dvp);
 1744                 vrele(nd.ni_vp);
 1745                 nd.ni_vp = NULL;
 1746                 error = EEXIST;
 1747                 goto out;
 1748         }
              /*
               * If vn_start_write() refuses with V_NOWAIT, release the
               * directory, call it again with V_XSLEEP | PCATCH and then redo
               * the lookup from scratch.
               */
 1749         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 1750                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1751                 vput(nd.ni_dvp);
 1752                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 1753                         goto out;
 1754                 goto restart;
 1755         }
 1756         VATTR_NULL(&vattr);
 1757         vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask;
 1758 #ifdef MAC
              /* va_type is set only in MAC kernels, just before the MAC check. */
 1759         vattr.va_type = VLNK;
 1760         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 1761             &vattr);
 1762         if (error != 0)
 1763                 goto out2;
 1764 #endif
 1765         error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 1766 #ifdef MAC
 1767 out2:
 1768 #endif
              /* ni_vp is passed along only when the link was created. */
 1769         VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 1770         vn_finished_write(mp);
 1771         NDFREE(&nd, NDF_ONLY_PNBUF);
 1772         if (error == ERELOOKUP)
 1773                 goto restart;
 1774 out:
 1775         if (segflg != UIO_SYSSPACE)
 1776                 uma_zfree(namei_zone, tmppath);
 1777         return (error);
 1778 }
 1779 
 1780 /*
 1781  * Delete a whiteout from the filesystem.
 1782  */
 1783 #ifndef _SYS_SYSPROTO_H_
 1784 struct undelete_args {
 1785         char *path;
 1786 };
 1787 #endif
 1788 int
 1789 sys_undelete(struct thread *td, struct undelete_args *uap)
 1790 {
 1791         struct mount *mp;
 1792         struct nameidata nd;
 1793         int error;
 1794 
 1795         NDPREINIT(&nd);
 1796 restart:
 1797         bwillwrite();
 1798         NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
 1799             UIO_USERSPACE, uap->path, td);
 1800         error = namei(&nd);
 1801         if (error != 0)
 1802                 return (error);
 1803 
 1804         if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 1805                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1806                 if (nd.ni_vp == nd.ni_dvp)
 1807                         vrele(nd.ni_dvp);
 1808                 else
 1809                         vput(nd.ni_dvp);
 1810                 if (nd.ni_vp)
 1811                         vrele(nd.ni_vp);
 1812                 return (EEXIST);
 1813         }
 1814         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 1815                 NDFREE(&nd, NDF_ONLY_PNBUF);
 1816                 vput(nd.ni_dvp);
 1817                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 1818                         return (error);
 1819                 goto restart;
 1820         }
 1821         error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 1822         NDFREE(&nd, NDF_ONLY_PNBUF);
 1823         vput(nd.ni_dvp);
 1824         vn_finished_write(mp);
 1825         if (error == ERELOOKUP)
 1826                 goto restart;
 1827         return (error);
 1828 }
 1829 
 1830 /*
 1831  * Delete a name from the filesystem.
 1832  */
 1833 #ifndef _SYS_SYSPROTO_H_
 1834 struct unlink_args {
 1835         char    *path;
 1836 };
 1837 #endif
 1838 int
 1839 sys_unlink(struct thread *td, struct unlink_args *uap)
 1840 {
 1841 
 1842         return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 1843             0, 0));
 1844 }
 1845 
 1846 static int
 1847 kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd,
 1848     int flag, enum uio_seg pathseg, ino_t oldinum)
 1849 {
 1850 
 1851         if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0)
 1852                 return (EINVAL);
 1853 
 1854         if ((flag & AT_REMOVEDIR) != 0)
 1855                 return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0));
 1856 
 1857         return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0));
 1858 }
 1859 
 1860 #ifndef _SYS_SYSPROTO_H_
 1861 struct unlinkat_args {
 1862         int     fd;
 1863         char    *path;
 1864         int     flag;
 1865 };
 1866 #endif
 1867 int
 1868 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
 1869 {
 1870 
 1871         return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag,
 1872             UIO_USERSPACE, 0));
 1873 }
 1874 
 1875 #ifndef _SYS_SYSPROTO_H_
 1876 struct funlinkat_args {
 1877         int             dfd;
 1878         const char      *path;
 1879         int             fd;
 1880         int             flag;
 1881 };
 1882 #endif
 1883 int
 1884 sys_funlinkat(struct thread *td, struct funlinkat_args *uap)
 1885 {
 1886 
 1887         return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag,
 1888             UIO_USERSPACE, 0));
 1889 }
 1890 
/*
 * Remove the name "path", looked up relative to dfd.
 *
 * fd       When not FD_NONE, a descriptor resolved with getvnode_path();
 *          the vnode found by the lookup must be the descriptor's vnode,
 *          otherwise the call fails with EBADF (descriptor's vnode is
 *          doomed) or EDEADLK.
 * pathseg  Address space of path.
 * flag     AT_* flags; only AT_RESOLVE_BENEATH is translated into lookup
 *          flags here.
 * oldinum  When non-zero, the inode number reported by VOP_STAT() must
 *          equal it, otherwise the call fails with EIDRM.  When zero,
 *          directories are rejected with EPERM.
 *
 * Returns 0 or an errno value.  An EINVAL from namei() is reported as
 * EPERM, and a vnode flagged VV_ROOT yields EBUSY.
 */
int
kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
    enum uio_seg pathseg, int flag, ino_t oldinum)
{
        struct mount *mp;
        struct file *fp;
        struct vnode *vp;
        struct nameidata nd;
        struct stat sb;
        int error;

        fp = NULL;
        if (fd != FD_NONE) {
                error = getvnode_path(td, fd, &cap_no_rights, &fp);
                if (error != 0)
                        return (error);
        }

        NDPREINIT(&nd);
restart:
        bwillwrite();
        NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
            at2cnpflags(flag, AT_RESOLVE_BENEATH),
            pathseg, path, dfd, &cap_unlinkat_rights, td);
        if ((error = namei(&nd)) != 0) {
                if (error == EINVAL)
                        error = EPERM;
                goto fdout;
        }
        vp = nd.ni_vp;
        /*
         * Pre-removal checks.  error is 0 on entry to this ladder; note
         * that a failing VOP_STAT() leaves its error in place and falls
         * through to the later branches, which may overwrite it.
         */
        if (vp->v_type == VDIR && oldinum == 0) {
                error = EPERM;          /* POSIX */
        } else if (oldinum != 0 &&
            ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
            sb.st_ino != oldinum) {
                error = EIDRM;  /* Identifier removed */
        } else if (fp != NULL && fp->f_vnode != vp) {
                if (VN_IS_DOOMED(fp->f_vnode))
                        error = EBADF;
                else
                        error = EDEADLK;
        } else {
                /*
                 * The root of a mounted filesystem cannot be deleted.
                 *
                 * XXX: can this only be a VDIR case?
                 */
                if (vp->v_vflag & VV_ROOT)
                        error = EBUSY;
        }
        if (error == 0) {
                /*
                 * Try to start the write without sleeping.  On failure
                 * release both vnodes, wait, and redo the lookup.
                 */
                if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
                        NDFREE(&nd, NDF_ONLY_PNBUF);
                        vput(nd.ni_dvp);
                        if (vp == nd.ni_dvp)
                                vrele(vp);
                        else
                                vput(vp);
                        if ((error = vn_start_write(NULL, &mp,
                            V_XSLEEP | PCATCH)) != 0) {
                                goto fdout;
                        }
                        goto restart;
                }
#ifdef MAC
                error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
                    &nd.ni_cnd);
                if (error != 0)
                        goto out;
#endif
                vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
                error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
                vn_finished_write(mp);
        }
        /* Common release path for both the success and the error case. */
        NDFREE(&nd, NDF_ONLY_PNBUF);
        vput(nd.ni_dvp);
        /* vrele() when parent and leaf are the same vnode, else vput(). */
        if (vp == nd.ni_dvp)
                vrele(vp);
        else
                vput(vp);
        /* ERELOOKUP: repeat the whole lookup and remove sequence. */
        if (error == ERELOOKUP)
                goto restart;
fdout:
        /* Drop the descriptor reference taken by getvnode_path(), if any. */
        if (fp != NULL)
                fdrop(fp, td);
        return (error);
}
 1981 
 1982 /*
 1983  * Reposition read/write file offset.
 1984  */
 1985 #ifndef _SYS_SYSPROTO_H_
 1986 struct lseek_args {
 1987         int     fd;
 1988         int     pad;
 1989         off_t   offset;
 1990         int     whence;
 1991 };
 1992 #endif
 1993 int
 1994 sys_lseek(struct thread *td, struct lseek_args *uap)
 1995 {
 1996 
 1997         return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 1998 }
 1999 
 2000 int
 2001 kern_lseek(struct thread *td, int fd, off_t offset, int whence)
 2002 {
 2003         struct file *fp;
 2004         int error;
 2005 
 2006         AUDIT_ARG_FD(fd);
 2007         error = fget(td, fd, &cap_seek_rights, &fp);
 2008         if (error != 0)
 2009                 return (error);
 2010         error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
 2011             fo_seek(fp, offset, whence, td) : ESPIPE;
 2012         fdrop(fp, td);
 2013         return (error);
 2014 }
 2015 
 2016 #if defined(COMPAT_43)
 2017 /*
 2018  * Reposition read/write file offset.
 2019  */
 2020 #ifndef _SYS_SYSPROTO_H_
 2021 struct olseek_args {
 2022         int     fd;
 2023         long    offset;
 2024         int     whence;
 2025 };
 2026 #endif
 2027 int
 2028 olseek(struct thread *td, struct olseek_args *uap)
 2029 {
 2030 
 2031         return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 2032 }
 2033 #endif /* COMPAT_43 */
 2034 
 2035 #if defined(COMPAT_FREEBSD6)
 2036 /* Version with the 'pad' argument */
 2037 int
 2038 freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
 2039 {
 2040 
 2041         return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 2042 }
 2043 #endif
 2044 
 2045 /*
 2046  * Check access permissions using passed credentials.
 2047  */
 2048 static int
 2049 vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
 2050      struct thread *td)
 2051 {
 2052         accmode_t accmode;
 2053         int error;
 2054 
 2055         /* Flags == 0 means only check for existence. */
 2056         if (user_flags == 0)
 2057                 return (0);
 2058 
 2059         accmode = 0;
 2060         if (user_flags & R_OK)
 2061                 accmode |= VREAD;
 2062         if (user_flags & W_OK)
 2063                 accmode |= VWRITE;
 2064         if (user_flags & X_OK)
 2065                 accmode |= VEXEC;
 2066 #ifdef MAC
 2067         error = mac_vnode_check_access(cred, vp, accmode);
 2068         if (error != 0)
 2069                 return (error);
 2070 #endif
 2071         if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 2072                 error = VOP_ACCESS(vp, accmode, cred, td);
 2073         return (error);
 2074 }
 2075 
 2076 /*
 2077  * Check access permissions using "real" credentials.
 2078  */
 2079 #ifndef _SYS_SYSPROTO_H_
 2080 struct access_args {
 2081         char    *path;
 2082         int     amode;
 2083 };
 2084 #endif
 2085 int
 2086 sys_access(struct thread *td, struct access_args *uap)
 2087 {
 2088 
 2089         return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2090             0, uap->amode));
 2091 }
 2092 
 2093 #ifndef _SYS_SYSPROTO_H_
 2094 struct faccessat_args {
 2095         int     dirfd;
 2096         char    *path;
 2097         int     amode;
 2098         int     flag;
 2099 }
 2100 #endif
 2101 int
 2102 sys_faccessat(struct thread *td, struct faccessat_args *uap)
 2103 {
 2104 
 2105         return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 2106             uap->amode));
 2107 }
 2108 
 2109 int
 2110 kern_accessat(struct thread *td, int fd, const char *path,
 2111     enum uio_seg pathseg, int flag, int amode)
 2112 {
 2113         struct ucred *cred, *usecred;
 2114         struct vnode *vp;
 2115         struct nameidata nd;
 2116         int error;
 2117 
 2118         if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0)
 2119                 return (EINVAL);
 2120         if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
 2121                 return (EINVAL);
 2122 
 2123         /*
 2124          * Create and modify a temporary credential instead of one that
 2125          * is potentially shared (if we need one).
 2126          */
 2127         cred = td->td_ucred;
 2128         if ((flag & AT_EACCESS) == 0 &&
 2129             ((cred->cr_uid != cred->cr_ruid ||
 2130             cred->cr_rgid != cred->cr_groups[0]))) {
 2131                 usecred = crdup(cred);
 2132                 usecred->cr_uid = cred->cr_ruid;
 2133                 usecred->cr_groups[0] = cred->cr_rgid;
 2134                 td->td_ucred = usecred;
 2135         } else
 2136                 usecred = cred;
 2137         AUDIT_ARG_VALUE(amode);
 2138         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
 2139             AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH |
 2140             AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td);
 2141         if ((error = namei(&nd)) != 0)
 2142                 goto out;
 2143         vp = nd.ni_vp;
 2144 
 2145         error = vn_access(vp, amode, usecred, td);
 2146         NDFREE_NOTHING(&nd);
 2147         vput(vp);
 2148 out:
 2149         if (usecred != cred) {
 2150                 td->td_ucred = cred;
 2151                 crfree(usecred);
 2152         }
 2153         return (error);
 2154 }
 2155 
 2156 /*
 2157  * Check access permissions using "effective" credentials.
 2158  */
 2159 #ifndef _SYS_SYSPROTO_H_
 2160 struct eaccess_args {
 2161         char    *path;
 2162         int     amode;
 2163 };
 2164 #endif
 2165 int
 2166 sys_eaccess(struct thread *td, struct eaccess_args *uap)
 2167 {
 2168 
 2169         return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2170             AT_EACCESS, uap->amode));
 2171 }
 2172 
 2173 #if defined(COMPAT_43)
 2174 /*
 2175  * Get file status; this version follows links.
 2176  */
 2177 #ifndef _SYS_SYSPROTO_H_
 2178 struct ostat_args {
 2179         char    *path;
 2180         struct ostat *ub;
 2181 };
 2182 #endif
 2183 int
 2184 ostat(struct thread *td, struct ostat_args *uap)
 2185 {
 2186         struct stat sb;
 2187         struct ostat osb;
 2188         int error;
 2189 
 2190         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 2191             &sb, NULL);
 2192         if (error != 0)
 2193                 return (error);
 2194         cvtstat(&sb, &osb);
 2195         return (copyout(&osb, uap->ub, sizeof (osb)));
 2196 }
 2197 
 2198 /*
 2199  * Get file status; this version does not follow links.
 2200  */
 2201 #ifndef _SYS_SYSPROTO_H_
 2202 struct olstat_args {
 2203         char    *path;
 2204         struct ostat *ub;
 2205 };
 2206 #endif
 2207 int
 2208 olstat(struct thread *td, struct olstat_args *uap)
 2209 {
 2210         struct stat sb;
 2211         struct ostat osb;
 2212         int error;
 2213 
 2214         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 2215             UIO_USERSPACE, &sb, NULL);
 2216         if (error != 0)
 2217                 return (error);
 2218         cvtstat(&sb, &osb);
 2219         return (copyout(&osb, uap->ub, sizeof (osb)));
 2220 }
 2221 
 2222 /*
 2223  * Convert from an old to a new stat structure.
 2224  * XXX: many values are blindly truncated.
 2225  */
 2226 void
 2227 cvtstat(struct stat *st, struct ostat *ost)
 2228 {
 2229 
 2230         bzero(ost, sizeof(*ost));
 2231         ost->st_dev = st->st_dev;
 2232         ost->st_ino = st->st_ino;
 2233         ost->st_mode = st->st_mode;
 2234         ost->st_nlink = st->st_nlink;
 2235         ost->st_uid = st->st_uid;
 2236         ost->st_gid = st->st_gid;
 2237         ost->st_rdev = st->st_rdev;
 2238         ost->st_size = MIN(st->st_size, INT32_MAX);
 2239         ost->st_atim = st->st_atim;
 2240         ost->st_mtim = st->st_mtim;
 2241         ost->st_ctim = st->st_ctim;
 2242         ost->st_blksize = st->st_blksize;
 2243         ost->st_blocks = st->st_blocks;
 2244         ost->st_flags = st->st_flags;
 2245         ost->st_gen = st->st_gen;
 2246 }
 2247 #endif /* COMPAT_43 */
 2248 
 2249 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
/*
 * Policy knob, exported read-write as a sysctl under _vfs, consulted by
 * freebsd11_cvtstat() when a value does not fit the old structure:
 * 1 makes the conversion fail with EOVERFLOW, 2 clamps st_ino and
 * st_nlink to their maximum, and any other value lets the truncated
 * value through.
 */
int ino64_trunc_error;
SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
    &ino64_trunc_error, 0,
    "Error on truncation of device, file or inode number, or link count");
 2254 
 2255 int
 2256 freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
 2257 {
 2258 
 2259         ost->st_dev = st->st_dev;
 2260         if (ost->st_dev != st->st_dev) {
 2261                 switch (ino64_trunc_error) {
 2262                 default:
 2263                         /*
 2264                          * Since dev_t is almost raw, don't clamp to the
 2265                          * maximum for case 2, but ignore the error.
 2266                          */
 2267                         break;
 2268                 case 1:
 2269                         return (EOVERFLOW);
 2270                 }
 2271         }
 2272         ost->st_ino = st->st_ino;
 2273         if (ost->st_ino != st->st_ino) {
 2274                 switch (ino64_trunc_error) {
 2275                 default:
 2276                 case 0:
 2277                         break;
 2278                 case 1:
 2279                         return (EOVERFLOW);
 2280                 case 2:
 2281                         ost->st_ino = UINT32_MAX;
 2282                         break;
 2283                 }
 2284         }
 2285         ost->st_mode = st->st_mode;
 2286         ost->st_nlink = st->st_nlink;
 2287         if (ost->st_nlink != st->st_nlink) {
 2288                 switch (ino64_trunc_error) {
 2289                 default:
 2290                 case 0:
 2291                         break;
 2292                 case 1:
 2293                         return (EOVERFLOW);
 2294                 case 2:
 2295                         ost->st_nlink = UINT16_MAX;
 2296                         break;
 2297                 }
 2298         }
 2299         ost->st_uid = st->st_uid;
 2300         ost->st_gid = st->st_gid;
 2301         ost->st_rdev = st->st_rdev;
 2302         if (ost->st_rdev != st->st_rdev) {
 2303                 switch (ino64_trunc_error) {
 2304                 default:
 2305                         break;
 2306                 case 1:
 2307                         return (EOVERFLOW);
 2308                 }
 2309         }
 2310         ost->st_atim = st->st_atim;
 2311         ost->st_mtim = st->st_mtim;
 2312         ost->st_ctim = st->st_ctim;
 2313         ost->st_size = st->st_size;
 2314         ost->st_blocks = st->st_blocks;
 2315         ost->st_blksize = st->st_blksize;
 2316         ost->st_flags = st->st_flags;
 2317         ost->st_gen = st->st_gen;
 2318         ost->st_lspare = 0;
 2319         ost->st_birthtim = st->st_birthtim;
 2320         bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
 2321             sizeof(*ost) - offsetof(struct freebsd11_stat,
 2322             st_birthtim) - sizeof(ost->st_birthtim));
 2323         return (0);
 2324 }
 2325 
 2326 int
 2327 freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
 2328 {
 2329         struct stat sb;
 2330         struct freebsd11_stat osb;
 2331         int error;
 2332 
 2333         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 2334             &sb, NULL);
 2335         if (error != 0)
 2336                 return (error);
 2337         error = freebsd11_cvtstat(&sb, &osb);
 2338         if (error == 0)
 2339                 error = copyout(&osb, uap->ub, sizeof(osb));
 2340         return (error);
 2341 }
 2342 
 2343 int
 2344 freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
 2345 {
 2346         struct stat sb;
 2347         struct freebsd11_stat osb;
 2348         int error;
 2349 
 2350         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 2351             UIO_USERSPACE, &sb, NULL);
 2352         if (error != 0)
 2353                 return (error);
 2354         error = freebsd11_cvtstat(&sb, &osb);
 2355         if (error == 0)
 2356                 error = copyout(&osb, uap->ub, sizeof(osb));
 2357         return (error);
 2358 }
 2359 
 2360 int
 2361 freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
 2362 {
 2363         struct fhandle fh;
 2364         struct stat sb;
 2365         struct freebsd11_stat osb;
 2366         int error;
 2367 
 2368         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 2369         if (error != 0)
 2370                 return (error);
 2371         error = kern_fhstat(td, fh, &sb);
 2372         if (error != 0)
 2373                 return (error);
 2374         error = freebsd11_cvtstat(&sb, &osb);
 2375         if (error == 0)
 2376                 error = copyout(&osb, uap->sb, sizeof(osb));
 2377         return (error);
 2378 }
 2379 
 2380 int
 2381 freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
 2382 {
 2383         struct stat sb;
 2384         struct freebsd11_stat osb;
 2385         int error;
 2386 
 2387         error = kern_statat(td, uap->flag, uap->fd, uap->path,
 2388             UIO_USERSPACE, &sb, NULL);
 2389         if (error != 0)
 2390                 return (error);
 2391         error = freebsd11_cvtstat(&sb, &osb);
 2392         if (error == 0)
 2393                 error = copyout(&osb, uap->buf, sizeof(osb));
 2394         return (error);
 2395 }
 2396 #endif  /* COMPAT_FREEBSD11 */
 2397 
 2398 /*
 2399  * Get file status
 2400  */
 2401 #ifndef _SYS_SYSPROTO_H_
 2402 struct fstatat_args {
 2403         int     fd;
 2404         char    *path;
 2405         struct stat     *buf;
 2406         int     flag;
 2407 }
 2408 #endif
 2409 int
 2410 sys_fstatat(struct thread *td, struct fstatat_args *uap)
 2411 {
 2412         struct stat sb;
 2413         int error;
 2414 
 2415         error = kern_statat(td, uap->flag, uap->fd, uap->path,
 2416             UIO_USERSPACE, &sb, NULL);
 2417         if (error == 0)
 2418                 error = copyout(&sb, uap->buf, sizeof (sb));
 2419         return (error);
 2420 }
 2421 
 2422 int
 2423 kern_statat(struct thread *td, int flag, int fd, const char *path,
 2424     enum uio_seg pathseg, struct stat *sbp,
 2425     void (*hook)(struct vnode *vp, struct stat *sbp))
 2426 {
 2427         struct nameidata nd;
 2428         int error;
 2429 
 2430         if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 2431             AT_EMPTY_PATH)) != 0)
 2432                 return (EINVAL);
 2433 
 2434         NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH |
 2435             AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF |
 2436             AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td);
 2437 
 2438         if ((error = namei(&nd)) != 0) {
 2439                 if (error == ENOTDIR &&
 2440                     (nd.ni_resflags & NIRES_EMPTYPATH) != 0)
 2441                         error = kern_fstat(td, fd, sbp);
 2442                 return (error);
 2443         }
 2444         error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
 2445         if (error == 0) {
 2446                 if (__predict_false(hook != NULL))
 2447                         hook(nd.ni_vp, sbp);
 2448         }
 2449         NDFREE_NOTHING(&nd);
 2450         vput(nd.ni_vp);
 2451 #ifdef __STAT_TIME_T_EXT
 2452         sbp->st_atim_ext = 0;
 2453         sbp->st_mtim_ext = 0;
 2454         sbp->st_ctim_ext = 0;
 2455         sbp->st_btim_ext = 0;
 2456 #endif
 2457 #ifdef KTRACE
 2458         if (KTRPOINT(td, KTR_STRUCT))
 2459                 ktrstat_error(sbp, error);
 2460 #endif
 2461         return (error);
 2462 }
 2463 
 2464 #if defined(COMPAT_FREEBSD11)
 2465 /*
 2466  * Implementation of the NetBSD [l]stat() functions.
 2467  */
 2468 void
 2469 freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
 2470 {
 2471 
 2472         bzero(nsb, sizeof(*nsb));
 2473         nsb->st_dev = sb->st_dev;
 2474         nsb->st_ino = sb->st_ino;
 2475         nsb->st_mode = sb->st_mode;
 2476         nsb->st_nlink = sb->st_nlink;
 2477         nsb->st_uid = sb->st_uid;
 2478         nsb->st_gid = sb->st_gid;
 2479         nsb->st_rdev = sb->st_rdev;
 2480         nsb->st_atim = sb->st_atim;
 2481         nsb->st_mtim = sb->st_mtim;
 2482         nsb->st_ctim = sb->st_ctim;
 2483         nsb->st_size = sb->st_size;
 2484         nsb->st_blocks = sb->st_blocks;
 2485         nsb->st_blksize = sb->st_blksize;
 2486         nsb->st_flags = sb->st_flags;
 2487         nsb->st_gen = sb->st_gen;
 2488         nsb->st_birthtim = sb->st_birthtim;
 2489 }
 2490 
 2491 #ifndef _SYS_SYSPROTO_H_
 2492 struct freebsd11_nstat_args {
 2493         char    *path;
 2494         struct nstat *ub;
 2495 };
 2496 #endif
 2497 int
 2498 freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
 2499 {
 2500         struct stat sb;
 2501         struct nstat nsb;
 2502         int error;
 2503 
 2504         error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 2505             &sb, NULL);
 2506         if (error != 0)
 2507                 return (error);
 2508         freebsd11_cvtnstat(&sb, &nsb);
 2509         return (copyout(&nsb, uap->ub, sizeof (nsb)));
 2510 }
 2511 
 2512 /*
 2513  * NetBSD lstat.  Get file status; this version does not follow links.
 2514  */
 2515 #ifndef _SYS_SYSPROTO_H_
 2516 struct freebsd11_nlstat_args {
 2517         char    *path;
 2518         struct nstat *ub;
 2519 };
 2520 #endif
 2521 int
 2522 freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
 2523 {
 2524         struct stat sb;
 2525         struct nstat nsb;
 2526         int error;
 2527 
 2528         error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 2529             UIO_USERSPACE, &sb, NULL);
 2530         if (error != 0)
 2531                 return (error);
 2532         freebsd11_cvtnstat(&sb, &nsb);
 2533         return (copyout(&nsb, uap->ub, sizeof (nsb)));
 2534 }
 2535 #endif /* COMPAT_FREEBSD11 */
 2536 
 2537 /*
 2538  * Get configurable pathname variables.
 2539  */
 2540 #ifndef _SYS_SYSPROTO_H_
 2541 struct pathconf_args {
 2542         char    *path;
 2543         int     name;
 2544 };
 2545 #endif
 2546 int
 2547 sys_pathconf(struct thread *td, struct pathconf_args *uap)
 2548 {
 2549         long value;
 2550         int error;
 2551 
 2552         error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
 2553             &value);
 2554         if (error == 0)
 2555                 td->td_retval[0] = value;
 2556         return (error);
 2557 }
 2558 
 2559 #ifndef _SYS_SYSPROTO_H_
 2560 struct lpathconf_args {
 2561         char    *path;
 2562         int     name;
 2563 };
 2564 #endif
 2565 int
 2566 sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
 2567 {
 2568         long value;
 2569         int error;
 2570 
 2571         error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
 2572             NOFOLLOW, &value);
 2573         if (error == 0)
 2574                 td->td_retval[0] = value;
 2575         return (error);
 2576 }
 2577 
 2578 int
 2579 kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg,
 2580     int name, u_long flags, long *valuep)
 2581 {
 2582         struct nameidata nd;
 2583         int error;
 2584 
 2585         NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
 2586             pathseg, path, td);
 2587         if ((error = namei(&nd)) != 0)
 2588                 return (error);
 2589         NDFREE_NOTHING(&nd);
 2590 
 2591         error = VOP_PATHCONF(nd.ni_vp, name, valuep);
 2592         vput(nd.ni_vp);
 2593         return (error);
 2594 }
 2595 
 2596 /*
 2597  * Return target name of a symbolic link.
 2598  */
 2599 #ifndef _SYS_SYSPROTO_H_
 2600 struct readlink_args {
 2601         char    *path;
 2602         char    *buf;
 2603         size_t  count;
 2604 };
 2605 #endif
 2606 int
 2607 sys_readlink(struct thread *td, struct readlink_args *uap)
 2608 {
 2609 
 2610         return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2611             uap->buf, UIO_USERSPACE, uap->count));
 2612 }
 2613 #ifndef _SYS_SYSPROTO_H_
 2614 struct readlinkat_args {
 2615         int     fd;
 2616         char    *path;
 2617         char    *buf;
 2618         size_t  bufsize;
 2619 };
 2620 #endif
 2621 int
 2622 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
 2623 {
 2624 
 2625         return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
 2626             uap->buf, UIO_USERSPACE, uap->bufsize));
 2627 }
 2628 
 2629 int
 2630 kern_readlinkat(struct thread *td, int fd, const char *path,
 2631     enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count)
 2632 {
 2633         struct vnode *vp;
 2634         struct nameidata nd;
 2635         int error;
 2636 
 2637         if (count > IOSIZE_MAX)
 2638                 return (EINVAL);
 2639 
 2640         NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
 2641             EMPTYPATH, pathseg, path, fd, td);
 2642 
 2643         if ((error = namei(&nd)) != 0)
 2644                 return (error);
 2645         NDFREE_NOTHING(&nd);
 2646         vp = nd.ni_vp;
 2647 
 2648         error = kern_readlink_vp(vp, buf, bufseg, count, td);
 2649         vput(vp);
 2650 
 2651         return (error);
 2652 }
 2653 
 2654 /*
 2655  * Helper function to readlink from a vnode
 2656  */
 2657 static int
 2658 kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
 2659     struct thread *td)
 2660 {
 2661         struct iovec aiov;
 2662         struct uio auio;
 2663         int error;
 2664 
 2665         ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
 2666 #ifdef MAC
 2667         error = mac_vnode_check_readlink(td->td_ucred, vp);
 2668         if (error != 0)
 2669                 return (error);
 2670 #endif
 2671         if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
 2672                 return (EINVAL);
 2673 
 2674         aiov.iov_base = buf;
 2675         aiov.iov_len = count;
 2676         auio.uio_iov = &aiov;
 2677         auio.uio_iovcnt = 1;
 2678         auio.uio_offset = 0;
 2679         auio.uio_rw = UIO_READ;
 2680         auio.uio_segflg = bufseg;
 2681         auio.uio_td = td;
 2682         auio.uio_resid = count;
 2683         error = VOP_READLINK(vp, &auio, td->td_ucred);
 2684         td->td_retval[0] = count - auio.uio_resid;
 2685         return (error);
 2686 }
 2687 
 2688 /*
 2689  * Common implementation code for chflags() and fchflags().
 2690  */
 2691 static int
 2692 setfflags(struct thread *td, struct vnode *vp, u_long flags)
 2693 {
 2694         struct mount *mp;
 2695         struct vattr vattr;
 2696         int error;
 2697 
 2698         /* We can't support the value matching VNOVAL. */
 2699         if (flags == VNOVAL)
 2700                 return (EOPNOTSUPP);
 2701 
 2702         /*
 2703          * Prevent non-root users from setting flags on devices.  When
 2704          * a device is reused, users can retain ownership of the device
 2705          * if they are allowed to set flags and programs assume that
 2706          * chown can't fail when done as root.
 2707          */
 2708         if (vp->v_type == VCHR || vp->v_type == VBLK) {
 2709                 error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 2710                 if (error != 0)
 2711                         return (error);
 2712         }
 2713 
 2714         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 2715                 return (error);
 2716         VATTR_NULL(&vattr);
 2717         vattr.va_flags = flags;
 2718         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2719 #ifdef MAC
 2720         error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 2721         if (error == 0)
 2722 #endif
 2723                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 2724         VOP_UNLOCK(vp);
 2725         vn_finished_write(mp);
 2726         return (error);
 2727 }
 2728 
 2729 /*
 2730  * Change flags of a file given a path name.
 2731  */
 2732 #ifndef _SYS_SYSPROTO_H_
 2733 struct chflags_args {
 2734         const char *path;
 2735         u_long  flags;
 2736 };
 2737 #endif
 2738 int
 2739 sys_chflags(struct thread *td, struct chflags_args *uap)
 2740 {
 2741 
 2742         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2743             uap->flags, 0));
 2744 }
 2745 
 2746 #ifndef _SYS_SYSPROTO_H_
 2747 struct chflagsat_args {
 2748         int     fd;
 2749         const char *path;
 2750         u_long  flags;
 2751         int     atflag;
 2752 }
 2753 #endif
 2754 int
 2755 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
 2756 {
 2757 
 2758         return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE,
 2759             uap->flags, uap->atflag));
 2760 }
 2761 
 2762 /*
 2763  * Same as chflags() but doesn't follow symlinks.
 2764  */
 2765 #ifndef _SYS_SYSPROTO_H_
 2766 struct lchflags_args {
 2767         const char *path;
 2768         u_long flags;
 2769 };
 2770 #endif
 2771 int
 2772 sys_lchflags(struct thread *td, struct lchflags_args *uap)
 2773 {
 2774 
 2775         return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2776             uap->flags, AT_SYMLINK_NOFOLLOW));
 2777 }
 2778 
 2779 static int
 2780 kern_chflagsat(struct thread *td, int fd, const char *path,
 2781     enum uio_seg pathseg, u_long flags, int atflag)
 2782 {
 2783         struct nameidata nd;
 2784         int error;
 2785 
 2786         if ((atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 2787             AT_EMPTY_PATH)) != 0)
 2788                 return (EINVAL);
 2789 
 2790         AUDIT_ARG_FFLAGS(flags);
 2791         NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW |
 2792             AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 2793             fd, &cap_fchflags_rights, td);
 2794         if ((error = namei(&nd)) != 0)
 2795                 return (error);
 2796         NDFREE_NOTHING(&nd);
 2797         error = setfflags(td, nd.ni_vp, flags);
 2798         vrele(nd.ni_vp);
 2799         return (error);
 2800 }
 2801 
 2802 /*
 2803  * Change flags of a file given a file descriptor.
 2804  */
 2805 #ifndef _SYS_SYSPROTO_H_
 2806 struct fchflags_args {
 2807         int     fd;
 2808         u_long  flags;
 2809 };
 2810 #endif
 2811 int
 2812 sys_fchflags(struct thread *td, struct fchflags_args *uap)
 2813 {
 2814         struct file *fp;
 2815         int error;
 2816 
 2817         AUDIT_ARG_FD(uap->fd);
 2818         AUDIT_ARG_FFLAGS(uap->flags);
 2819         error = getvnode(td, uap->fd, &cap_fchflags_rights,
 2820             &fp);
 2821         if (error != 0)
 2822                 return (error);
 2823 #ifdef AUDIT
 2824         if (AUDITING_TD(td)) {
 2825                 vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 2826                 AUDIT_ARG_VNODE1(fp->f_vnode);
 2827                 VOP_UNLOCK(fp->f_vnode);
 2828         }
 2829 #endif
 2830         error = setfflags(td, fp->f_vnode, uap->flags);
 2831         fdrop(fp, td);
 2832         return (error);
 2833 }
 2834 
 2835 /*
 2836  * Common implementation code for chmod(), lchmod() and fchmod().
 2837  */
 2838 int
 2839 setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
 2840 {
 2841         struct mount *mp;
 2842         struct vattr vattr;
 2843         int error;
 2844 
 2845         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 2846                 return (error);
 2847         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2848         VATTR_NULL(&vattr);
 2849         vattr.va_mode = mode & ALLPERMS;
 2850 #ifdef MAC
 2851         error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
 2852         if (error == 0)
 2853 #endif
 2854                 error = VOP_SETATTR(vp, &vattr, cred);
 2855         VOP_UNLOCK(vp);
 2856         vn_finished_write(mp);
 2857         return (error);
 2858 }
 2859 
 2860 /*
 2861  * Change mode of a file given path name.
 2862  */
 2863 #ifndef _SYS_SYSPROTO_H_
 2864 struct chmod_args {
 2865         char    *path;
 2866         int     mode;
 2867 };
 2868 #endif
 2869 int
 2870 sys_chmod(struct thread *td, struct chmod_args *uap)
 2871 {
 2872 
 2873         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2874             uap->mode, 0));
 2875 }
 2876 
 2877 #ifndef _SYS_SYSPROTO_H_
 2878 struct fchmodat_args {
 2879         int     dirfd;
 2880         char    *path;
 2881         mode_t  mode;
 2882         int     flag;
 2883 }
 2884 #endif
 2885 int
 2886 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
 2887 {
 2888 
 2889         return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE,
 2890             uap->mode, uap->flag));
 2891 }
 2892 
 2893 /*
 2894  * Change mode of a file given path name (don't follow links.)
 2895  */
 2896 #ifndef _SYS_SYSPROTO_H_
 2897 struct lchmod_args {
 2898         char    *path;
 2899         int     mode;
 2900 };
 2901 #endif
 2902 int
 2903 sys_lchmod(struct thread *td, struct lchmod_args *uap)
 2904 {
 2905 
 2906         return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 2907             uap->mode, AT_SYMLINK_NOFOLLOW));
 2908 }
 2909 
 2910 int
 2911 kern_fchmodat(struct thread *td, int fd, const char *path,
 2912     enum uio_seg pathseg, mode_t mode, int flag)
 2913 {
 2914         struct nameidata nd;
 2915         int error;
 2916 
 2917         if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 2918             AT_EMPTY_PATH)) != 0)
 2919                 return (EINVAL);
 2920 
 2921         AUDIT_ARG_MODE(mode);
 2922         NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 2923             AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 2924             fd, &cap_fchmod_rights, td);
 2925         if ((error = namei(&nd)) != 0)
 2926                 return (error);
 2927         NDFREE_NOTHING(&nd);
 2928         error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
 2929         vrele(nd.ni_vp);
 2930         return (error);
 2931 }
 2932 
 2933 /*
 2934  * Change mode of a file given a file descriptor.
 2935  */
 2936 #ifndef _SYS_SYSPROTO_H_
 2937 struct fchmod_args {
 2938         int     fd;
 2939         int     mode;
 2940 };
 2941 #endif
 2942 int
 2943 sys_fchmod(struct thread *td, struct fchmod_args *uap)
 2944 {
 2945         struct file *fp;
 2946         int error;
 2947 
 2948         AUDIT_ARG_FD(uap->fd);
 2949         AUDIT_ARG_MODE(uap->mode);
 2950 
 2951         error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
 2952         if (error != 0)
 2953                 return (error);
 2954         error = fo_chmod(fp, uap->mode, td->td_ucred, td);
 2955         fdrop(fp, td);
 2956         return (error);
 2957 }
 2958 
 2959 /*
 2960  * Common implementation for chown(), lchown(), and fchown()
 2961  */
 2962 int
 2963 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
 2964     gid_t gid)
 2965 {
 2966         struct mount *mp;
 2967         struct vattr vattr;
 2968         int error;
 2969 
 2970         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 2971                 return (error);
 2972         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2973         VATTR_NULL(&vattr);
 2974         vattr.va_uid = uid;
 2975         vattr.va_gid = gid;
 2976 #ifdef MAC
 2977         error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
 2978             vattr.va_gid);
 2979         if (error == 0)
 2980 #endif
 2981                 error = VOP_SETATTR(vp, &vattr, cred);
 2982         VOP_UNLOCK(vp);
 2983         vn_finished_write(mp);
 2984         return (error);
 2985 }
 2986 
 2987 /*
 2988  * Set ownership given a path name.
 2989  */
 2990 #ifndef _SYS_SYSPROTO_H_
 2991 struct chown_args {
 2992         char    *path;
 2993         int     uid;
 2994         int     gid;
 2995 };
 2996 #endif
 2997 int
 2998 sys_chown(struct thread *td, struct chown_args *uap)
 2999 {
 3000 
 3001         return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
 3002             uap->gid, 0));
 3003 }
 3004 
 3005 #ifndef _SYS_SYSPROTO_H_
 3006 struct fchownat_args {
 3007         int fd;
 3008         const char * path;
 3009         uid_t uid;
 3010         gid_t gid;
 3011         int flag;
 3012 };
 3013 #endif
 3014 int
 3015 sys_fchownat(struct thread *td, struct fchownat_args *uap)
 3016 {
 3017 
 3018         return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
 3019             uap->gid, uap->flag));
 3020 }
 3021 
 3022 int
 3023 kern_fchownat(struct thread *td, int fd, const char *path,
 3024     enum uio_seg pathseg, int uid, int gid, int flag)
 3025 {
 3026         struct nameidata nd;
 3027         int error;
 3028 
 3029         if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 3030             AT_EMPTY_PATH)) != 0)
 3031                 return (EINVAL);
 3032 
 3033         AUDIT_ARG_OWNER(uid, gid);
 3034         NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 3035             AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 3036             fd, &cap_fchown_rights, td);
 3037 
 3038         if ((error = namei(&nd)) != 0)
 3039                 return (error);
 3040         NDFREE_NOTHING(&nd);
 3041         error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
 3042         vrele(nd.ni_vp);
 3043         return (error);
 3044 }
 3045 
 3046 /*
 3047  * Set ownership given a path name, do not cross symlinks.
 3048  */
 3049 #ifndef _SYS_SYSPROTO_H_
 3050 struct lchown_args {
 3051         char    *path;
 3052         int     uid;
 3053         int     gid;
 3054 };
 3055 #endif
 3056 int
 3057 sys_lchown(struct thread *td, struct lchown_args *uap)
 3058 {
 3059 
 3060         return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 3061             uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
 3062 }
 3063 
 3064 /*
 3065  * Set ownership given a file descriptor.
 3066  */
 3067 #ifndef _SYS_SYSPROTO_H_
 3068 struct fchown_args {
 3069         int     fd;
 3070         int     uid;
 3071         int     gid;
 3072 };
 3073 #endif
 3074 int
 3075 sys_fchown(struct thread *td, struct fchown_args *uap)
 3076 {
 3077         struct file *fp;
 3078         int error;
 3079 
 3080         AUDIT_ARG_FD(uap->fd);
 3081         AUDIT_ARG_OWNER(uap->uid, uap->gid);
 3082         error = fget(td, uap->fd, &cap_fchown_rights, &fp);
 3083         if (error != 0)
 3084                 return (error);
 3085         error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
 3086         fdrop(fp, td);
 3087         return (error);
 3088 }
 3089 
 3090 /*
 3091  * Common implementation code for utimes(), lutimes(), and futimes().
 3092  */
 3093 static int
 3094 getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
 3095     struct timespec *tsp)
 3096 {
 3097         struct timeval tv[2];
 3098         const struct timeval *tvp;
 3099         int error;
 3100 
 3101         if (usrtvp == NULL) {
 3102                 vfs_timestamp(&tsp[0]);
 3103                 tsp[1] = tsp[0];
 3104         } else {
 3105                 if (tvpseg == UIO_SYSSPACE) {
 3106                         tvp = usrtvp;
 3107                 } else {
 3108                         if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 3109                                 return (error);
 3110                         tvp = tv;
 3111                 }
 3112 
 3113                 if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 3114                     tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 3115                         return (EINVAL);
 3116                 TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 3117                 TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 3118         }
 3119         return (0);
 3120 }
 3121 
 3122 /*
 3123  * Common implementation code for futimens(), utimensat().
 3124  */
 3125 #define UTIMENS_NULL    0x1
 3126 #define UTIMENS_EXIT    0x2
 3127 static int
 3128 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
 3129     struct timespec *tsp, int *retflags)
 3130 {
 3131         struct timespec tsnow;
 3132         int error;
 3133 
 3134         vfs_timestamp(&tsnow);
 3135         *retflags = 0;
 3136         if (usrtsp == NULL) {
 3137                 tsp[0] = tsnow;
 3138                 tsp[1] = tsnow;
 3139                 *retflags |= UTIMENS_NULL;
 3140                 return (0);
 3141         }
 3142         if (tspseg == UIO_SYSSPACE) {
 3143                 tsp[0] = usrtsp[0];
 3144                 tsp[1] = usrtsp[1];
 3145         } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
 3146                 return (error);
 3147         if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
 3148                 *retflags |= UTIMENS_EXIT;
 3149         if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
 3150                 *retflags |= UTIMENS_NULL;
 3151         if (tsp[0].tv_nsec == UTIME_OMIT)
 3152                 tsp[0].tv_sec = VNOVAL;
 3153         else if (tsp[0].tv_nsec == UTIME_NOW)
 3154                 tsp[0] = tsnow;
 3155         else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
 3156                 return (EINVAL);
 3157         if (tsp[1].tv_nsec == UTIME_OMIT)
 3158                 tsp[1].tv_sec = VNOVAL;
 3159         else if (tsp[1].tv_nsec == UTIME_NOW)
 3160                 tsp[1] = tsnow;
 3161         else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
 3162                 return (EINVAL);
 3163 
 3164         return (0);
 3165 }
 3166 
 3167 /*
 3168  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
 3169  * and utimensat().
 3170  */
 3171 static int
 3172 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
 3173     int numtimes, int nullflag)
 3174 {
 3175         struct mount *mp;
 3176         struct vattr vattr;
 3177         int error;
 3178         bool setbirthtime;
 3179 
 3180         setbirthtime = false;
 3181         vattr.va_birthtime.tv_sec = VNOVAL;
 3182         vattr.va_birthtime.tv_nsec = 0;
 3183 
 3184         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 3185                 return (error);
 3186         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 3187         if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 &&
 3188             timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 3189                 setbirthtime = true;
 3190         VATTR_NULL(&vattr);
 3191         vattr.va_atime = ts[0];
 3192         vattr.va_mtime = ts[1];
 3193         if (setbirthtime)
 3194                 vattr.va_birthtime = ts[1];
 3195         if (numtimes > 2)
 3196                 vattr.va_birthtime = ts[2];
 3197         if (nullflag)
 3198                 vattr.va_vaflags |= VA_UTIMES_NULL;
 3199 #ifdef MAC
 3200         error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
 3201             vattr.va_mtime);
 3202 #endif
 3203         if (error == 0)
 3204                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 3205         VOP_UNLOCK(vp);
 3206         vn_finished_write(mp);
 3207         return (error);
 3208 }
 3209 
 3210 /*
 3211  * Set the access and modification times of a file.
 3212  */
 3213 #ifndef _SYS_SYSPROTO_H_
 3214 struct utimes_args {
 3215         char    *path;
 3216         struct  timeval *tptr;
 3217 };
 3218 #endif
 3219 int
 3220 sys_utimes(struct thread *td, struct utimes_args *uap)
 3221 {
 3222 
 3223         return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 3224             uap->tptr, UIO_USERSPACE));
 3225 }
 3226 
 3227 #ifndef _SYS_SYSPROTO_H_
 3228 struct futimesat_args {
 3229         int fd;
 3230         const char * path;
 3231         const struct timeval * times;
 3232 };
 3233 #endif
 3234 int
 3235 sys_futimesat(struct thread *td, struct futimesat_args *uap)
 3236 {
 3237 
 3238         return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 3239             uap->times, UIO_USERSPACE));
 3240 }
 3241 
 3242 int
 3243 kern_utimesat(struct thread *td, int fd, const char *path,
 3244     enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg)
 3245 {
 3246         struct nameidata nd;
 3247         struct timespec ts[2];
 3248         int error;
 3249 
 3250         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 3251                 return (error);
 3252         NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 3253             &cap_futimes_rights, td);
 3254 
 3255         if ((error = namei(&nd)) != 0)
 3256                 return (error);
 3257         NDFREE_NOTHING(&nd);
 3258         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 3259         vrele(nd.ni_vp);
 3260         return (error);
 3261 }
 3262 
 3263 /*
 3264  * Set the access and modification times of a file.
 3265  */
 3266 #ifndef _SYS_SYSPROTO_H_
 3267 struct lutimes_args {
 3268         char    *path;
 3269         struct  timeval *tptr;
 3270 };
 3271 #endif
 3272 int
 3273 sys_lutimes(struct thread *td, struct lutimes_args *uap)
 3274 {
 3275 
 3276         return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 3277             UIO_USERSPACE));
 3278 }
 3279 
 3280 int
 3281 kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg,
 3282     struct timeval *tptr, enum uio_seg tptrseg)
 3283 {
 3284         struct timespec ts[2];
 3285         struct nameidata nd;
 3286         int error;
 3287 
 3288         if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 3289                 return (error);
 3290         NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
 3291         if ((error = namei(&nd)) != 0)
 3292                 return (error);
 3293         NDFREE_NOTHING(&nd);
 3294         error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 3295         vrele(nd.ni_vp);
 3296         return (error);
 3297 }
 3298 
 3299 /*
 3300  * Set the access and modification times of a file.
 3301  */
 3302 #ifndef _SYS_SYSPROTO_H_
 3303 struct futimes_args {
 3304         int     fd;
 3305         struct  timeval *tptr;
 3306 };
 3307 #endif
 3308 int
 3309 sys_futimes(struct thread *td, struct futimes_args *uap)
 3310 {
 3311 
 3312         return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
 3313 }
 3314 
 3315 int
 3316 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
 3317     enum uio_seg tptrseg)
 3318 {
 3319         struct timespec ts[2];
 3320         struct file *fp;
 3321         int error;
 3322 
 3323         AUDIT_ARG_FD(fd);
 3324         error = getutimes(tptr, tptrseg, ts);
 3325         if (error != 0)
 3326                 return (error);
 3327         error = getvnode(td, fd, &cap_futimes_rights, &fp);
 3328         if (error != 0)
 3329                 return (error);
 3330 #ifdef AUDIT
 3331         if (AUDITING_TD(td)) {
 3332                 vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 3333                 AUDIT_ARG_VNODE1(fp->f_vnode);
 3334                 VOP_UNLOCK(fp->f_vnode);
 3335         }
 3336 #endif
 3337         error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 3338         fdrop(fp, td);
 3339         return (error);
 3340 }
 3341 
 3342 int
 3343 sys_futimens(struct thread *td, struct futimens_args *uap)
 3344 {
 3345 
 3346         return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
 3347 }
 3348 
 3349 int
 3350 kern_futimens(struct thread *td, int fd, struct timespec *tptr,
 3351     enum uio_seg tptrseg)
 3352 {
 3353         struct timespec ts[2];
 3354         struct file *fp;
 3355         int error, flags;
 3356 
 3357         AUDIT_ARG_FD(fd);
 3358         error = getutimens(tptr, tptrseg, ts, &flags);
 3359         if (error != 0)
 3360                 return (error);
 3361         if (flags & UTIMENS_EXIT)
 3362                 return (0);
 3363         error = getvnode(td, fd, &cap_futimes_rights, &fp);
 3364         if (error != 0)
 3365                 return (error);
 3366 #ifdef AUDIT
 3367         if (AUDITING_TD(td)) {
 3368                 vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 3369                 AUDIT_ARG_VNODE1(fp->f_vnode);
 3370                 VOP_UNLOCK(fp->f_vnode);
 3371         }
 3372 #endif
 3373         error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
 3374         fdrop(fp, td);
 3375         return (error);
 3376 }
 3377 
 3378 int
 3379 sys_utimensat(struct thread *td, struct utimensat_args *uap)
 3380 {
 3381 
 3382         return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 3383             uap->times, UIO_USERSPACE, uap->flag));
 3384 }
 3385 
 3386 int
 3387 kern_utimensat(struct thread *td, int fd, const char *path,
 3388     enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
 3389     int flag)
 3390 {
 3391         struct nameidata nd;
 3392         struct timespec ts[2];
 3393         int error, flags;
 3394 
 3395         if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 3396             AT_EMPTY_PATH)) != 0)
 3397                 return (EINVAL);
 3398 
 3399         if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
 3400                 return (error);
 3401         NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 3402             AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1,
 3403             pathseg, path, fd, &cap_futimes_rights, td);
 3404         if ((error = namei(&nd)) != 0)
 3405                 return (error);
 3406         /*
 3407          * We are allowed to call namei() regardless of 2xUTIME_OMIT.
 3408          * POSIX states:
 3409          * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
 3410          * "Search permission is denied by a component of the path prefix."
 3411          */
 3412         NDFREE_NOTHING(&nd);
 3413         if ((flags & UTIMENS_EXIT) == 0)
 3414                 error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
 3415         vrele(nd.ni_vp);
 3416         return (error);
 3417 }
 3418 
 3419 /*
 3420  * Truncate a file given its path name.
 3421  */
 3422 #ifndef _SYS_SYSPROTO_H_
 3423 struct truncate_args {
 3424         char    *path;
 3425         int     pad;
 3426         off_t   length;
 3427 };
 3428 #endif
 3429 int
 3430 sys_truncate(struct thread *td, struct truncate_args *uap)
 3431 {
 3432 
 3433         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 3434 }
 3435 
 3436 int
 3437 kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg,
 3438     off_t length)
 3439 {
 3440         struct mount *mp;
 3441         struct vnode *vp;
 3442         void *rl_cookie;
 3443         struct vattr vattr;
 3444         struct nameidata nd;
 3445         int error;
 3446 
 3447         if (length < 0)
 3448                 return (EINVAL);
 3449         NDPREINIT(&nd);
 3450 retry:
 3451         NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 3452         if ((error = namei(&nd)) != 0)
 3453                 return (error);
 3454         vp = nd.ni_vp;
 3455         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 3456         if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 3457                 vn_rangelock_unlock(vp, rl_cookie);
 3458                 vrele(vp);
 3459                 return (error);
 3460         }
 3461         NDFREE(&nd, NDF_ONLY_PNBUF);
 3462         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 3463         if (vp->v_type == VDIR)
 3464                 error = EISDIR;
 3465 #ifdef MAC
 3466         else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
 3467         }
 3468 #endif
 3469         else if ((error = vn_writechk(vp)) == 0 &&
 3470             (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 3471                 VATTR_NULL(&vattr);
 3472                 vattr.va_size = length;
 3473                 error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 3474         }
 3475         VOP_UNLOCK(vp);
 3476         vn_finished_write(mp);
 3477         vn_rangelock_unlock(vp, rl_cookie);
 3478         vrele(vp);
 3479         if (error == ERELOOKUP)
 3480                 goto retry;
 3481         return (error);
 3482 }
 3483 
 3484 #if defined(COMPAT_43)
 3485 /*
 3486  * Truncate a file given its path name.
 3487  */
 3488 #ifndef _SYS_SYSPROTO_H_
 3489 struct otruncate_args {
 3490         char    *path;
 3491         long    length;
 3492 };
 3493 #endif
 3494 int
 3495 otruncate(struct thread *td, struct otruncate_args *uap)
 3496 {
 3497 
 3498         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 3499 }
 3500 #endif /* COMPAT_43 */
 3501 
 3502 #if defined(COMPAT_FREEBSD6)
 3503 /* Versions with the pad argument */
 3504 int
 3505 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
 3506 {
 3507 
 3508         return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 3509 }
 3510 
 3511 int
 3512 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
 3513 {
 3514 
 3515         return (kern_ftruncate(td, uap->fd, uap->length));
 3516 }
 3517 #endif
 3518 
 3519 int
 3520 kern_fsync(struct thread *td, int fd, bool fullsync)
 3521 {
 3522         struct vnode *vp;
 3523         struct mount *mp;
 3524         struct file *fp;
 3525         int error;
 3526 
 3527         AUDIT_ARG_FD(fd);
 3528         error = getvnode(td, fd, &cap_fsync_rights, &fp);
 3529         if (error != 0)
 3530                 return (error);
 3531         vp = fp->f_vnode;
 3532 #if 0
 3533         if (!fullsync)
 3534                 /* XXXKIB: compete outstanding aio writes */;
 3535 #endif
 3536 retry:
 3537         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 3538         if (error != 0)
 3539                 goto drop;
 3540         vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 3541         AUDIT_ARG_VNODE1(vp);
 3542         if (vp->v_object != NULL) {
 3543                 VM_OBJECT_WLOCK(vp->v_object);
 3544                 vm_object_page_clean(vp->v_object, 0, 0, 0);
 3545                 VM_OBJECT_WUNLOCK(vp->v_object);
 3546         }
 3547         error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
 3548         VOP_UNLOCK(vp);
 3549         vn_finished_write(mp);
 3550         if (error == ERELOOKUP)
 3551                 goto retry;
 3552 drop:
 3553         fdrop(fp, td);
 3554         return (error);
 3555 }
 3556 
 3557 /*
 3558  * Sync an open file.
 3559  */
 3560 #ifndef _SYS_SYSPROTO_H_
 3561 struct fsync_args {
 3562         int     fd;
 3563 };
 3564 #endif
 3565 int
 3566 sys_fsync(struct thread *td, struct fsync_args *uap)
 3567 {
 3568 
 3569         return (kern_fsync(td, uap->fd, true));
 3570 }
 3571 
 3572 int
 3573 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
 3574 {
 3575 
 3576         return (kern_fsync(td, uap->fd, false));
 3577 }
 3578 
 3579 /*
 3580  * Rename files.  Source and destination must either both be directories, or
 3581  * both not be directories.  If target is a directory, it must be empty.
 3582  */
 3583 #ifndef _SYS_SYSPROTO_H_
 3584 struct rename_args {
 3585         char    *from;
 3586         char    *to;
 3587 };
 3588 #endif
 3589 int
 3590 sys_rename(struct thread *td, struct rename_args *uap)
 3591 {
 3592 
 3593         return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
 3594             uap->to, UIO_USERSPACE));
 3595 }
 3596 
 3597 #ifndef _SYS_SYSPROTO_H_
 3598 struct renameat_args {
 3599         int     oldfd;
 3600         char    *old;
 3601         int     newfd;
 3602         char    *new;
 3603 };
 3604 #endif
 3605 int
 3606 sys_renameat(struct thread *td, struct renameat_args *uap)
 3607 {
 3608 
 3609         return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
 3610             UIO_USERSPACE));
 3611 }
 3612 
 3613 #ifdef MAC
 3614 static int
 3615 kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd,
 3616     const char *new, enum uio_seg pathseg, struct nameidata *fromnd)
 3617 {
 3618         int error;
 3619 
 3620         NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
 3621             AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td);
 3622         if ((error = namei(fromnd)) != 0)
 3623                 return (error);
 3624         error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp,
 3625             fromnd->ni_vp, &fromnd->ni_cnd);
 3626         VOP_UNLOCK(fromnd->ni_dvp);
 3627         if (fromnd->ni_dvp != fromnd->ni_vp)
 3628                 VOP_UNLOCK(fromnd->ni_vp);
 3629         if (error != 0) {
 3630                 NDFREE(fromnd, NDF_ONLY_PNBUF);
 3631                 vrele(fromnd->ni_dvp);
 3632                 vrele(fromnd->ni_vp);
 3633                 if (fromnd->ni_startdir)
 3634                         vrele(fromnd->ni_startdir);
 3635         }
 3636         return (error);
 3637 }
 3638 #endif
 3639 
 3640 int
 3641 kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
 3642     const char *new, enum uio_seg pathseg)
 3643 {
 3644         struct mount *mp = NULL;
 3645         struct vnode *tvp, *fvp, *tdvp;
 3646         struct nameidata fromnd, tond;
 3647         u_int64_t tondflags;
 3648         int error;
 3649 
 3650 again:
 3651         bwillwrite();
 3652 #ifdef MAC
 3653         if (mac_vnode_check_rename_from_enabled()) {
 3654                 error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg,
 3655                     &fromnd);
 3656                 if (error != 0)
 3657                         return (error);
 3658         } else {
 3659 #endif
 3660         NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
 3661             pathseg, old, oldfd, &cap_renameat_source_rights, td);
 3662         if ((error = namei(&fromnd)) != 0)
 3663                 return (error);
 3664 #ifdef MAC
 3665         }
 3666 #endif
 3667         fvp = fromnd.ni_vp;
 3668         tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2;
 3669         if (fromnd.ni_vp->v_type == VDIR)
 3670                 tondflags |= WILLBEDIR;
 3671         NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd,
 3672             &cap_renameat_target_rights, td);
 3673         if ((error = namei(&tond)) != 0) {
 3674                 /* Translate error code for rename("dir1", "dir2/."). */
 3675                 if (error == EISDIR && fvp->v_type == VDIR)
 3676                         error = EINVAL;
 3677                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
 3678                 vrele(fromnd.ni_dvp);
 3679                 vrele(fvp);
 3680                 goto out1;
 3681         }
 3682         tdvp = tond.ni_dvp;
 3683         tvp = tond.ni_vp;
 3684         error = vn_start_write(fvp, &mp, V_NOWAIT);
 3685         if (error != 0) {
 3686                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
 3687                 NDFREE(&tond, NDF_ONLY_PNBUF);
 3688                 if (tvp != NULL)
 3689                         vput(tvp);
 3690                 if (tdvp == tvp)
 3691                         vrele(tdvp);
 3692                 else
 3693                         vput(tdvp);
 3694                 vrele(fromnd.ni_dvp);
 3695                 vrele(fvp);
 3696                 vrele(tond.ni_startdir);
 3697                 if (fromnd.ni_startdir != NULL)
 3698                         vrele(fromnd.ni_startdir);
 3699                 error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 3700                 if (error != 0)
 3701                         return (error);
 3702                 goto again;
 3703         }
 3704         if (tvp != NULL) {
 3705                 if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 3706                         error = ENOTDIR;
 3707                         goto out;
 3708                 } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 3709                         error = EISDIR;
 3710                         goto out;
 3711                 }
 3712 #ifdef CAPABILITIES
 3713                 if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
 3714                         /*
 3715                          * If the target already exists we require CAP_UNLINKAT
 3716                          * from 'newfd', when newfd was used for the lookup.
 3717                          */
 3718                         error = cap_check(&tond.ni_filecaps.fc_rights,
 3719                             &cap_unlinkat_rights);
 3720                         if (error != 0)
 3721                                 goto out;
 3722                 }
 3723 #endif
 3724         }
 3725         if (fvp == tdvp) {
 3726                 error = EINVAL;
 3727                 goto out;
 3728         }
 3729         /*
 3730          * If the source is the same as the destination (that is, if they
 3731          * are links to the same vnode), then there is nothing to do.
 3732          */
 3733         if (fvp == tvp)
 3734                 error = ERESTART;
 3735 #ifdef MAC
 3736         else
 3737                 error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
 3738                     tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 3739 #endif
 3740 out:
 3741         if (error == 0) {
 3742                 error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 3743                     tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 3744                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
 3745                 NDFREE(&tond, NDF_ONLY_PNBUF);
 3746         } else {
 3747                 NDFREE(&fromnd, NDF_ONLY_PNBUF);
 3748                 NDFREE(&tond, NDF_ONLY_PNBUF);
 3749                 if (tvp != NULL)
 3750                         vput(tvp);
 3751                 if (tdvp == tvp)
 3752                         vrele(tdvp);
 3753                 else
 3754                         vput(tdvp);
 3755                 vrele(fromnd.ni_dvp);
 3756                 vrele(fvp);
 3757         }
 3758         vrele(tond.ni_startdir);
 3759         vn_finished_write(mp);
 3760 out1:
 3761         if (fromnd.ni_startdir)
 3762                 vrele(fromnd.ni_startdir);
 3763         if (error == ERESTART)
 3764                 return (0);
 3765         if (error == ERELOOKUP)
 3766                 goto again;
 3767         return (error);
 3768 }
 3769 
 3770 /*
 3771  * Make a directory file.
 3772  */
 3773 #ifndef _SYS_SYSPROTO_H_
 3774 struct mkdir_args {
 3775         char    *path;
 3776         int     mode;
 3777 };
 3778 #endif
 3779 int
 3780 sys_mkdir(struct thread *td, struct mkdir_args *uap)
 3781 {
 3782 
 3783         return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 3784             uap->mode));
 3785 }
 3786 
 3787 #ifndef _SYS_SYSPROTO_H_
 3788 struct mkdirat_args {
 3789         int     fd;
 3790         char    *path;
 3791         mode_t  mode;
 3792 };
 3793 #endif
 3794 int
 3795 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
 3796 {
 3797 
 3798         return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
 3799 }
 3800 
 3801 int
 3802 kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg,
 3803     int mode)
 3804 {
 3805         struct mount *mp;
 3806         struct vattr vattr;
 3807         struct nameidata nd;
 3808         int error;
 3809 
 3810         AUDIT_ARG_MODE(mode);
 3811         NDPREINIT(&nd);
 3812 restart:
 3813         bwillwrite();
 3814         NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 3815             NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR,
 3816             segflg, path, fd, &cap_mkdirat_rights, td);
 3817         if ((error = namei(&nd)) != 0)
 3818                 return (error);
 3819         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 3820                 NDFREE(&nd, NDF_ONLY_PNBUF);
 3821                 vput(nd.ni_dvp);
 3822                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 3823                         return (error);
 3824                 goto restart;
 3825         }
 3826         VATTR_NULL(&vattr);
 3827         vattr.va_type = VDIR;
 3828         vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask;
 3829 #ifdef MAC
 3830         error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 3831             &vattr);
 3832         if (error != 0)
 3833                 goto out;
 3834 #endif
 3835         error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 3836 #ifdef MAC
 3837 out:
 3838 #endif
 3839         NDFREE(&nd, NDF_ONLY_PNBUF);
 3840         VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 3841         vn_finished_write(mp);
 3842         if (error == ERELOOKUP)
 3843                 goto restart;
 3844         return (error);
 3845 }
 3846 
 3847 /*
 3848  * Remove a directory file.
 3849  */
 3850 #ifndef _SYS_SYSPROTO_H_
 3851 struct rmdir_args {
 3852         char    *path;
 3853 };
 3854 #endif
 3855 int
 3856 sys_rmdir(struct thread *td, struct rmdir_args *uap)
 3857 {
 3858 
 3859         return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 3860             0));
 3861 }
 3862 
 3863 int
 3864 kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,
 3865     enum uio_seg pathseg, int flag)
 3866 {
 3867         struct mount *mp;
 3868         struct vnode *vp;
 3869         struct file *fp;
 3870         struct nameidata nd;
 3871         cap_rights_t rights;
 3872         int error;
 3873 
 3874         fp = NULL;
 3875         if (fd != FD_NONE) {
 3876                 error = getvnode(td, fd, cap_rights_init_one(&rights,
 3877                     CAP_LOOKUP), &fp);
 3878                 if (error != 0)
 3879                         return (error);
 3880         }
 3881 
 3882         NDPREINIT(&nd);
 3883 restart:
 3884         bwillwrite();
 3885         NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
 3886             at2cnpflags(flag, AT_RESOLVE_BENEATH),
 3887             pathseg, path, dfd, &cap_unlinkat_rights, td);
 3888         if ((error = namei(&nd)) != 0)
 3889                 goto fdout;
 3890         vp = nd.ni_vp;
 3891         if (vp->v_type != VDIR) {
 3892                 error = ENOTDIR;
 3893                 goto out;
 3894         }
 3895         /*
 3896          * No rmdir "." please.
 3897          */
 3898         if (nd.ni_dvp == vp) {
 3899                 error = EINVAL;
 3900                 goto out;
 3901         }
 3902         /*
 3903          * The root of a mounted filesystem cannot be deleted.
 3904          */
 3905         if (vp->v_vflag & VV_ROOT) {
 3906                 error = EBUSY;
 3907                 goto out;
 3908         }
 3909 
 3910         if (fp != NULL && fp->f_vnode != vp) {
 3911                 if (VN_IS_DOOMED(fp->f_vnode))
 3912                         error = EBADF;
 3913                 else
 3914                         error = EDEADLK;
 3915                 goto out;
 3916         }
 3917 
 3918 #ifdef MAC
 3919         error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 3920             &nd.ni_cnd);
 3921         if (error != 0)
 3922                 goto out;
 3923 #endif
 3924         if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 3925                 NDFREE(&nd, NDF_ONLY_PNBUF);
 3926                 vput(vp);
 3927                 if (nd.ni_dvp == vp)
 3928                         vrele(nd.ni_dvp);
 3929                 else
 3930                         vput(nd.ni_dvp);
 3931                 if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 3932                         goto fdout;
 3933                 goto restart;
 3934         }
 3935         vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 3936         error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 3937         vn_finished_write(mp);
 3938 out:
 3939         NDFREE(&nd, NDF_ONLY_PNBUF);
 3940         vput(vp);
 3941         if (nd.ni_dvp == vp)
 3942                 vrele(nd.ni_dvp);
 3943         else
 3944                 vput(nd.ni_dvp);
 3945         if (error == ERELOOKUP)
 3946                 goto restart;
 3947 fdout:
 3948         if (fp != NULL)
 3949                 fdrop(fp, td);
 3950         return (error);
 3951 }
 3952 
 3953 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
 3954 int
 3955 freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
 3956     long *basep, void (*func)(struct freebsd11_dirent *))
 3957 {
 3958         struct freebsd11_dirent dstdp;
 3959         struct dirent *dp, *edp;
 3960         char *dirbuf;
 3961         off_t base;
 3962         ssize_t resid, ucount;
 3963         int error;
 3964 
 3965         /* XXX arbitrary sanity limit on `count'. */
 3966         count = min(count, 64 * 1024);
 3967 
 3968         dirbuf = malloc(count, M_TEMP, M_WAITOK);
 3969 
 3970         error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
 3971             UIO_SYSSPACE);
 3972         if (error != 0)
 3973                 goto done;
 3974         if (basep != NULL)
 3975                 *basep = base;
 3976 
 3977         ucount = 0;
 3978         for (dp = (struct dirent *)dirbuf,
 3979             edp = (struct dirent *)&dirbuf[count - resid];
 3980             ucount < count && dp < edp; ) {
 3981                 if (dp->d_reclen == 0)
 3982                         break;
 3983                 MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
 3984                 if (dp->d_namlen >= sizeof(dstdp.d_name))
 3985                         continue;
 3986                 dstdp.d_type = dp->d_type;
 3987                 dstdp.d_namlen = dp->d_namlen;
 3988                 dstdp.d_fileno = dp->d_fileno;          /* truncate */
 3989                 if (dstdp.d_fileno != dp->d_fileno) {
 3990                         switch (ino64_trunc_error) {
 3991                         default:
 3992                         case 0:
 3993                                 break;
 3994                         case 1:
 3995                                 error = EOVERFLOW;
 3996                                 goto done;
 3997                         case 2:
 3998                                 dstdp.d_fileno = UINT32_MAX;
 3999                                 break;
 4000                         }
 4001                 }
 4002                 dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
 4003                     ((dp->d_namlen + 1 + 3) &~ 3);
 4004                 bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 4005                 bzero(dstdp.d_name + dstdp.d_namlen,
 4006                     dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
 4007                     dstdp.d_namlen);
 4008                 MPASS(dstdp.d_reclen <= dp->d_reclen);
 4009                 MPASS(ucount + dstdp.d_reclen <= count);
 4010                 if (func != NULL)
 4011                         func(&dstdp);
 4012                 error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
 4013                 if (error != 0)
 4014                         break;
 4015                 dp = (struct dirent *)((char *)dp + dp->d_reclen);
 4016                 ucount += dstdp.d_reclen;
 4017         }
 4018 
 4019 done:
 4020         free(dirbuf, M_TEMP);
 4021         if (error == 0)
 4022                 td->td_retval[0] = ucount;
 4023         return (error);
 4024 }
 4025 #endif /* COMPAT */
 4026 
 4027 #ifdef COMPAT_43
 4028 static void
 4029 ogetdirentries_cvt(struct freebsd11_dirent *dp)
 4030 {
 4031 #if (BYTE_ORDER == LITTLE_ENDIAN)
 4032         /*
 4033          * The expected low byte of dp->d_namlen is our dp->d_type.
 4034          * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
 4035          */
 4036         dp->d_type = dp->d_namlen;
 4037         dp->d_namlen = 0;
 4038 #else
 4039         /*
 4040          * The dp->d_type is the high byte of the expected dp->d_namlen,
 4041          * so must be zero'ed.
 4042          */
 4043         dp->d_type = 0;
 4044 #endif
 4045 }
 4046 
 4047 /*
 4048  * Read a block of directory entries in a filesystem independent format.
 4049  */
 4050 #ifndef _SYS_SYSPROTO_H_
 4051 struct ogetdirentries_args {
 4052         int     fd;
 4053         char    *buf;
 4054         u_int   count;
 4055         long    *basep;
 4056 };
 4057 #endif
 4058 int
 4059 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
 4060 {
 4061         long loff;
 4062         int error;
 4063 
 4064         error = kern_ogetdirentries(td, uap, &loff);
 4065         if (error == 0)
 4066                 error = copyout(&loff, uap->basep, sizeof(long));
 4067         return (error);
 4068 }
 4069 
 4070 int
 4071 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
 4072     long *ploff)
 4073 {
 4074         long base;
 4075         int error;
 4076 
 4077         /* XXX arbitrary sanity limit on `count'. */
 4078         if (uap->count > 64 * 1024)
 4079                 return (EINVAL);
 4080 
 4081         error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 4082             &base, ogetdirentries_cvt);
 4083 
 4084         if (error == 0 && uap->basep != NULL)
 4085                 error = copyout(&base, uap->basep, sizeof(long));
 4086 
 4087         return (error);
 4088 }
 4089 #endif /* COMPAT_43 */
 4090 
 4091 #if defined(COMPAT_FREEBSD11)
 4092 #ifndef _SYS_SYSPROTO_H_
 4093 struct freebsd11_getdirentries_args {
 4094         int     fd;
 4095         char    *buf;
 4096         u_int   count;
 4097         long    *basep;
 4098 };
 4099 #endif
 4100 int
 4101 freebsd11_getdirentries(struct thread *td,
 4102     struct freebsd11_getdirentries_args *uap)
 4103 {
 4104         long base;
 4105         int error;
 4106 
 4107         error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 4108             &base, NULL);
 4109 
 4110         if (error == 0 && uap->basep != NULL)
 4111                 error = copyout(&base, uap->basep, sizeof(long));
 4112         return (error);
 4113 }
 4114 
 4115 int
 4116 freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
 4117 {
 4118         struct freebsd11_getdirentries_args ap;
 4119 
 4120         ap.fd = uap->fd;
 4121         ap.buf = uap->buf;
 4122         ap.count = uap->count;
 4123         ap.basep = NULL;
 4124         return (freebsd11_getdirentries(td, &ap));
 4125 }
 4126 #endif /* COMPAT_FREEBSD11 */
 4127 
 4128 /*
 4129  * Read a block of directory entries in a filesystem independent format.
 4130  */
 4131 int
 4132 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
 4133 {
 4134         off_t base;
 4135         int error;
 4136 
 4137         error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
 4138             NULL, UIO_USERSPACE);
 4139         if (error != 0)
 4140                 return (error);
 4141         if (uap->basep != NULL)
 4142                 error = copyout(&base, uap->basep, sizeof(off_t));
 4143         return (error);
 4144 }
 4145 
 4146 int
 4147 kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
 4148     off_t *basep, ssize_t *residp, enum uio_seg bufseg)
 4149 {
 4150         struct vnode *vp;
 4151         struct file *fp;
 4152         struct uio auio;
 4153         struct iovec aiov;
 4154         off_t loff;
 4155         int error, eofflag;
 4156         off_t foffset;
 4157 
 4158         AUDIT_ARG_FD(fd);
 4159         if (count > IOSIZE_MAX)
 4160                 return (EINVAL);
 4161         auio.uio_resid = count;
 4162         error = getvnode(td, fd, &cap_read_rights, &fp);
 4163         if (error != 0)
 4164                 return (error);
 4165         if ((fp->f_flag & FREAD) == 0) {
 4166                 fdrop(fp, td);
 4167                 return (EBADF);
 4168         }
 4169         vp = fp->f_vnode;
 4170         foffset = foffset_lock(fp, 0);
 4171 unionread:
 4172         if (vp->v_type != VDIR) {
 4173                 error = EINVAL;
 4174                 goto fail;
 4175         }
 4176         if (__predict_false((vp->v_vflag & VV_UNLINKED) != 0)) {
 4177                 error = ENOENT;
 4178                 goto fail;
 4179         }
 4180         aiov.iov_base = buf;
 4181         aiov.iov_len = count;
 4182         auio.uio_iov = &aiov;
 4183         auio.uio_iovcnt = 1;
 4184         auio.uio_rw = UIO_READ;
 4185         auio.uio_segflg = bufseg;
 4186         auio.uio_td = td;
 4187         vn_lock(vp, LK_SHARED | LK_RETRY);
 4188         AUDIT_ARG_VNODE1(vp);
 4189         loff = auio.uio_offset = foffset;
 4190 #ifdef MAC
 4191         error = mac_vnode_check_readdir(td->td_ucred, vp);
 4192         if (error == 0)
 4193 #endif
 4194                 error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 4195                     NULL);
 4196         foffset = auio.uio_offset;
 4197         if (error != 0) {
 4198                 VOP_UNLOCK(vp);
 4199                 goto fail;
 4200         }
 4201         if (count == auio.uio_resid &&
 4202             (vp->v_vflag & VV_ROOT) &&
 4203             (vp->v_mount->mnt_flag & MNT_UNION)) {
 4204                 struct vnode *tvp = vp;
 4205 
 4206                 vp = vp->v_mount->mnt_vnodecovered;
 4207                 VREF(vp);
 4208                 fp->f_vnode = vp;
 4209                 foffset = 0;
 4210                 vput(tvp);
 4211                 goto unionread;
 4212         }
 4213         VOP_UNLOCK(vp);
 4214         *basep = loff;
 4215         if (residp != NULL)
 4216                 *residp = auio.uio_resid;
 4217         td->td_retval[0] = count - auio.uio_resid;
 4218 fail:
 4219         foffset_unlock(fp, foffset, 0);
 4220         fdrop(fp, td);
 4221         return (error);
 4222 }
 4223 
 4224 /*
 4225  * Set the mode mask for creation of filesystem nodes.
 4226  */
 4227 #ifndef _SYS_SYSPROTO_H_
 4228 struct umask_args {
 4229         int     newmask;
 4230 };
 4231 #endif
 4232 int
 4233 sys_umask(struct thread *td, struct umask_args *uap)
 4234 {
 4235         struct pwddesc *pdp;
 4236 
 4237         pdp = td->td_proc->p_pd;
 4238         PWDDESC_XLOCK(pdp);
 4239         td->td_retval[0] = pdp->pd_cmask;
 4240         pdp->pd_cmask = uap->newmask & ALLPERMS;
 4241         PWDDESC_XUNLOCK(pdp);
 4242         return (0);
 4243 }
 4244 
 4245 /*
 4246  * Void all references to file by ripping underlying filesystem away from
 4247  * vnode.
 4248  */
 4249 #ifndef _SYS_SYSPROTO_H_
 4250 struct revoke_args {
 4251         char    *path;
 4252 };
 4253 #endif
 4254 int
 4255 sys_revoke(struct thread *td, struct revoke_args *uap)
 4256 {
 4257         struct vnode *vp;
 4258         struct vattr vattr;
 4259         struct nameidata nd;
 4260         int error;
 4261 
 4262         NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 4263             uap->path, td);
 4264         if ((error = namei(&nd)) != 0)
 4265                 return (error);
 4266         vp = nd.ni_vp;
 4267         NDFREE_NOTHING(&nd);
 4268         if (vp->v_type != VCHR || vp->v_rdev == NULL) {
 4269                 error = EINVAL;
 4270                 goto out;
 4271         }
 4272 #ifdef MAC
 4273         error = mac_vnode_check_revoke(td->td_ucred, vp);
 4274         if (error != 0)
 4275                 goto out;
 4276 #endif
 4277         error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 4278         if (error != 0)
 4279                 goto out;
 4280         if (td->td_ucred->cr_uid != vattr.va_uid) {
 4281                 error = priv_check(td, PRIV_VFS_ADMIN);
 4282                 if (error != 0)
 4283                         goto out;
 4284         }
 4285         if (devfs_usecount(vp) > 0)
 4286                 VOP_REVOKE(vp, REVOKEALL);
 4287 out:
 4288         vput(vp);
 4289         return (error);
 4290 }
 4291 
 4292 /*
 4293  * This variant of getvnode() allows O_PATH files.  Caller should
 4294  * ensure that returned file and vnode are only used for compatible
 4295  * semantics.
 4296  */
 4297 int
 4298 getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
 4299     struct file **fpp)
 4300 {
 4301         struct file *fp;
 4302         int error;
 4303 
 4304         error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp);
 4305         if (error != 0)
 4306                 return (error);
 4307 
 4308         /*
 4309          * The file could be not of the vnode type, or it may be not
 4310          * yet fully initialized, in which case the f_vnode pointer
 4311          * may be set, but f_ops is still badfileops.  E.g.,
 4312          * devfs_open() transiently create such situation to
 4313          * facilitate csw d_fdopen().
 4314          *
 4315          * Dupfdopen() handling in kern_openat() installs the
 4316          * half-baked file into the process descriptor table, allowing
 4317          * other thread to dereference it. Guard against the race by
 4318          * checking f_ops.
 4319          */
 4320         if (fp->f_vnode == NULL || fp->f_ops == &badfileops) {
 4321                 fdrop(fp, td);
 4322                 *fpp = NULL;
 4323                 return (EINVAL);
 4324         }
 4325 
 4326         *fpp = fp;
 4327         return (0);
 4328 }
 4329 
 4330 /*
 4331  * Convert a user file descriptor to a kernel file entry and check
 4332  * that, if it is a capability, the correct rights are present.
 4333  * A reference on the file entry is held upon returning.
 4334  */
 4335 int
 4336 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 4337 {
 4338         int error;
 4339 
 4340         error = getvnode_path(td, fd, rightsp, fpp);
 4341 
 4342         /*
 4343          * Filter out O_PATH file descriptors, most getvnode() callers
 4344          * do not call fo_ methods.
 4345          */
 4346         if (error == 0 && (*fpp)->f_ops == &path_fileops) {
 4347                 fdrop(*fpp, td);
 4348                 *fpp = NULL;
 4349                 error = EBADF;
 4350         }
 4351 
 4352         return (error);
 4353 }
 4354 
 4355 /*
 4356  * Get an (NFS) file handle.
 4357  */
 4358 #ifndef _SYS_SYSPROTO_H_
 4359 struct lgetfh_args {
 4360         char *fname;
 4361         fhandle_t *fhp;
 4362 };
 4363 #endif
 4364 int
 4365 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
 4366 {
 4367 
 4368         return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
 4369             UIO_USERSPACE, uap->fhp, UIO_USERSPACE));
 4370 }
 4371 
 4372 #ifndef _SYS_SYSPROTO_H_
 4373 struct getfh_args {
 4374         char *fname;
 4375         fhandle_t *fhp;
 4376 };
 4377 #endif
 4378 int
 4379 sys_getfh(struct thread *td, struct getfh_args *uap)
 4380 {
 4381 
 4382         return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
 4383             uap->fhp, UIO_USERSPACE));
 4384 }
 4385 
 4386 /*
 4387  * syscall for the rpc.lockd to use to translate an open descriptor into
 4388  * a NFS file handle.
 4389  *
 4390  * warning: do not remove the priv_check() call or this becomes one giant
 4391  * security hole.
 4392  */
 4393 #ifndef _SYS_SYSPROTO_H_
 4394 struct getfhat_args {
 4395         int fd;
 4396         char *path;
 4397         fhandle_t *fhp;
 4398         int flags;
 4399 };
 4400 #endif
 4401 int
 4402 sys_getfhat(struct thread *td, struct getfhat_args *uap)
 4403 {
 4404 
 4405         return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
 4406             uap->fhp, UIO_USERSPACE));
 4407 }
 4408 
 4409 int
 4410 kern_getfhat(struct thread *td, int flags, int fd, const char *path,
 4411     enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg)
 4412 {
 4413         struct nameidata nd;
 4414         fhandle_t fh;
 4415         struct vnode *vp;
 4416         int error;
 4417 
 4418         if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 4419                 return (EINVAL);
 4420         error = priv_check(td, PRIV_VFS_GETFH);
 4421         if (error != 0)
 4422                 return (error);
 4423         NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW |
 4424             AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path,
 4425             fd, td);
 4426         error = namei(&nd);
 4427         if (error != 0)
 4428                 return (error);
 4429         NDFREE_NOTHING(&nd);
 4430         vp = nd.ni_vp;
 4431         bzero(&fh, sizeof(fh));
 4432         fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 4433         error = VOP_VPTOFH(vp, &fh.fh_fid);
 4434         vput(vp);
 4435         if (error == 0) {
 4436                 if (fhseg == UIO_USERSPACE)
 4437                         error = copyout(&fh, fhp, sizeof (fh));
 4438                 else
 4439                         memcpy(fhp, &fh, sizeof(fh));
 4440         }
 4441         return (error);
 4442 }
 4443 
 4444 #ifndef _SYS_SYSPROTO_H_
 4445 struct fhlink_args {
 4446         fhandle_t *fhp;
 4447         const char *to;
 4448 };
 4449 #endif
 4450 int
 4451 sys_fhlink(struct thread *td, struct fhlink_args *uap)
 4452 {
 4453 
 4454         return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
 4455 }
 4456 
 4457 #ifndef _SYS_SYSPROTO_H_
 4458 struct fhlinkat_args {
 4459         fhandle_t *fhp;
 4460         int tofd;
 4461         const char *to;
 4462 };
 4463 #endif
 4464 int
 4465 sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
 4466 {
 4467 
 4468         return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
 4469 }
 4470 
 4471 static int
 4472 kern_fhlinkat(struct thread *td, int fd, const char *path,
 4473     enum uio_seg pathseg, fhandle_t *fhp)
 4474 {
 4475         fhandle_t fh;
 4476         struct mount *mp;
 4477         struct vnode *vp;
 4478         int error;
 4479 
 4480         error = priv_check(td, PRIV_VFS_GETFH);
 4481         if (error != 0)
 4482                 return (error);
 4483         error = copyin(fhp, &fh, sizeof(fh));
 4484         if (error != 0)
 4485                 return (error);
 4486         do {
 4487                 bwillwrite();
 4488                 if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 4489                         return (ESTALE);
 4490                 error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 4491                 vfs_unbusy(mp);
 4492                 if (error != 0)
 4493                         return (error);
 4494                 VOP_UNLOCK(vp);
 4495                 error = kern_linkat_vp(td, vp, fd, path, pathseg);
 4496         } while (error == EAGAIN || error == ERELOOKUP);
 4497         return (error);
 4498 }
 4499 
 4500 #ifndef _SYS_SYSPROTO_H_
 4501 struct fhreadlink_args {
 4502         fhandle_t *fhp;
 4503         char *buf;
 4504         size_t bufsize;
 4505 };
 4506 #endif
 4507 int
 4508 sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
 4509 {
 4510         fhandle_t fh;
 4511         struct mount *mp;
 4512         struct vnode *vp;
 4513         int error;
 4514 
 4515         error = priv_check(td, PRIV_VFS_GETFH);
 4516         if (error != 0)
 4517                 return (error);
 4518         if (uap->bufsize > IOSIZE_MAX)
 4519                 return (EINVAL);
 4520         error = copyin(uap->fhp, &fh, sizeof(fh));
 4521         if (error != 0)
 4522                 return (error);
 4523         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 4524                 return (ESTALE);
 4525         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 4526         vfs_unbusy(mp);
 4527         if (error != 0)
 4528                 return (error);
 4529         error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
 4530         vput(vp);
 4531         return (error);
 4532 }
 4533 
 4534 /*
 4535  * syscall for the rpc.lockd to use to translate a NFS file handle into an
 4536  * open descriptor.
 4537  *
 4538  * warning: do not remove the priv_check() call or this becomes one giant
 4539  * security hole.
 4540  */
 4541 #ifndef _SYS_SYSPROTO_H_
 4542 struct fhopen_args {
 4543         const struct fhandle *u_fhp;
 4544         int flags;
 4545 };
 4546 #endif
 4547 int
 4548 sys_fhopen(struct thread *td, struct fhopen_args *uap)
 4549 {
 4550         return (kern_fhopen(td, uap->u_fhp, uap->flags));
 4551 }
 4552 
 4553 int
 4554 kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags)
 4555 {
 4556         struct mount *mp;
 4557         struct vnode *vp;
 4558         struct fhandle fhp;
 4559         struct file *fp;
 4560         int fmode, error;
 4561         int indx;
 4562 
 4563         error = priv_check(td, PRIV_VFS_FHOPEN);
 4564         if (error != 0)
 4565                 return (error);
 4566         indx = -1;
 4567         fmode = FFLAGS(flags);
 4568         /* why not allow a non-read/write open for our lockd? */
 4569         if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 4570                 return (EINVAL);
 4571         error = copyin(u_fhp, &fhp, sizeof(fhp));
 4572         if (error != 0)
 4573                 return(error);
 4574         /* find the mount point */
 4575         mp = vfs_busyfs(&fhp.fh_fsid);
 4576         if (mp == NULL)
 4577                 return (ESTALE);
 4578         /* now give me my vnode, it gets returned to me locked */
 4579         error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 4580         vfs_unbusy(mp);
 4581         if (error != 0)
 4582                 return (error);
 4583 
 4584         error = falloc_noinstall(td, &fp);
 4585         if (error != 0) {
 4586                 vput(vp);
 4587                 return (error);
 4588         }
 4589         /*
 4590          * An extra reference on `fp' has been held for us by
 4591          * falloc_noinstall().
 4592          */
 4593 
 4594 #ifdef INVARIANTS
 4595         td->td_dupfd = -1;
 4596 #endif
 4597         error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
 4598         if (error != 0) {
 4599                 KASSERT(fp->f_ops == &badfileops,
 4600                     ("VOP_OPEN in fhopen() set f_ops"));
 4601                 KASSERT(td->td_dupfd < 0,
 4602                     ("fhopen() encountered fdopen()"));
 4603 
 4604                 vput(vp);
 4605                 goto bad;
 4606         }
 4607 #ifdef INVARIANTS
 4608         td->td_dupfd = 0;
 4609 #endif
 4610         fp->f_vnode = vp;
 4611         finit_vnode(fp, fmode, NULL, &vnops);
 4612         VOP_UNLOCK(vp);
 4613         if ((fmode & O_TRUNC) != 0) {
 4614                 error = fo_truncate(fp, 0, td->td_ucred, td);
 4615                 if (error != 0)
 4616                         goto bad;
 4617         }
 4618 
 4619         error = finstall(td, fp, &indx, fmode, NULL);
 4620 bad:
 4621         fdrop(fp, td);
 4622         td->td_retval[0] = indx;
 4623         return (error);
 4624 }
 4625 
 4626 /*
 4627  * Stat an (NFS) file handle.
 4628  */
 4629 #ifndef _SYS_SYSPROTO_H_
 4630 struct fhstat_args {
 4631         struct fhandle *u_fhp;
 4632         struct stat *sb;
 4633 };
 4634 #endif
 4635 int
 4636 sys_fhstat(struct thread *td, struct fhstat_args *uap)
 4637 {
 4638         struct stat sb;
 4639         struct fhandle fh;
 4640         int error;
 4641 
 4642         error = copyin(uap->u_fhp, &fh, sizeof(fh));
 4643         if (error != 0)
 4644                 return (error);
 4645         error = kern_fhstat(td, fh, &sb);
 4646         if (error == 0)
 4647                 error = copyout(&sb, uap->sb, sizeof(sb));
 4648         return (error);
 4649 }
 4650 
 4651 int
 4652 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
 4653 {
 4654         struct mount *mp;
 4655         struct vnode *vp;
 4656         int error;
 4657 
 4658         error = priv_check(td, PRIV_VFS_FHSTAT);
 4659         if (error != 0)
 4660                 return (error);
 4661         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 4662                 return (ESTALE);
 4663         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 4664         vfs_unbusy(mp);
 4665         if (error != 0)
 4666                 return (error);
 4667         error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
 4668         vput(vp);
 4669         return (error);
 4670 }
 4671 
 4672 /*
 4673  * Implement fstatfs() for (NFS) file handles.
 4674  */
 4675 #ifndef _SYS_SYSPROTO_H_
 4676 struct fhstatfs_args {
 4677         struct fhandle *u_fhp;
 4678         struct statfs *buf;
 4679 };
 4680 #endif
 4681 int
 4682 sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
 4683 {
 4684         struct statfs *sfp;
 4685         fhandle_t fh;
 4686         int error;
 4687 
 4688         error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 4689         if (error != 0)
 4690                 return (error);
 4691         sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 4692         error = kern_fhstatfs(td, fh, sfp);
 4693         if (error == 0)
 4694                 error = copyout(sfp, uap->buf, sizeof(*sfp));
 4695         free(sfp, M_STATFS);
 4696         return (error);
 4697 }
 4698 
 4699 int
 4700 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 4701 {
 4702         struct mount *mp;
 4703         struct vnode *vp;
 4704         int error;
 4705 
 4706         error = priv_check(td, PRIV_VFS_FHSTATFS);
 4707         if (error != 0)
 4708                 return (error);
 4709         if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 4710                 return (ESTALE);
 4711         error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 4712         if (error != 0) {
 4713                 vfs_unbusy(mp);
 4714                 return (error);
 4715         }
 4716         vput(vp);
 4717         error = prison_canseemount(td->td_ucred, mp);
 4718         if (error != 0)
 4719                 goto out;
 4720 #ifdef MAC
 4721         error = mac_mount_check_stat(td->td_ucred, mp);
 4722         if (error != 0)
 4723                 goto out;
 4724 #endif
 4725         error = VFS_STATFS(mp, buf);
 4726 out:
 4727         vfs_unbusy(mp);
 4728         return (error);
 4729 }
 4730 
 4731 /*
 4732  * Unlike madvise(2), we do not make a best effort to remember every
 4733  * possible caching hint.  Instead, we remember the last setting with
 4734  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
 4735  * region of any current setting.
 4736  */
 4737 int
 4738 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
 4739     int advice)
 4740 {
 4741         struct fadvise_info *fa, *new;
 4742         struct file *fp;
 4743         struct vnode *vp;
 4744         off_t end;
 4745         int error;
 4746 
 4747         if (offset < 0 || len < 0 || offset > OFF_MAX - len)
 4748                 return (EINVAL);
 4749         AUDIT_ARG_VALUE(advice);
 4750         switch (advice) {
 4751         case POSIX_FADV_SEQUENTIAL:
 4752         case POSIX_FADV_RANDOM:
 4753         case POSIX_FADV_NOREUSE:
 4754                 new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
 4755                 break;
 4756         case POSIX_FADV_NORMAL:
 4757         case POSIX_FADV_WILLNEED:
 4758         case POSIX_FADV_DONTNEED:
 4759                 new = NULL;
 4760                 break;
 4761         default:
 4762                 return (EINVAL);
 4763         }
 4764         /* XXX: CAP_POSIX_FADVISE? */
 4765         AUDIT_ARG_FD(fd);
 4766         error = fget(td, fd, &cap_no_rights, &fp);
 4767         if (error != 0)
 4768                 goto out;
 4769         AUDIT_ARG_FILE(td->td_proc, fp);
 4770         if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 4771                 error = ESPIPE;
 4772                 goto out;
 4773         }
 4774         if (fp->f_type != DTYPE_VNODE) {
 4775                 error = ENODEV;
 4776                 goto out;
 4777         }
 4778         vp = fp->f_vnode;
 4779         if (vp->v_type != VREG) {
 4780                 error = ENODEV;
 4781                 goto out;
 4782         }
 4783         if (len == 0)
 4784                 end = OFF_MAX;
 4785         else
 4786                 end = offset + len - 1;
 4787         switch (advice) {
 4788         case POSIX_FADV_SEQUENTIAL:
 4789         case POSIX_FADV_RANDOM:
 4790         case POSIX_FADV_NOREUSE:
 4791                 /*
 4792                  * Try to merge any existing non-standard region with
 4793                  * this new region if possible, otherwise create a new
 4794                  * non-standard region for this request.
 4795                  */
 4796                 mtx_pool_lock(mtxpool_sleep, fp);
 4797                 fa = fp->f_advice;
 4798                 if (fa != NULL && fa->fa_advice == advice &&
 4799                     ((fa->fa_start <= end && fa->fa_end >= offset) ||
 4800                     (end != OFF_MAX && fa->fa_start == end + 1) ||
 4801                     (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
 4802                         if (offset < fa->fa_start)
 4803                                 fa->fa_start = offset;
 4804                         if (end > fa->fa_end)
 4805                                 fa->fa_end = end;
 4806                 } else {
 4807                         new->fa_advice = advice;
 4808                         new->fa_start = offset;
 4809                         new->fa_end = end;
 4810                         fp->f_advice = new;
 4811                         new = fa;
 4812                 }
 4813                 mtx_pool_unlock(mtxpool_sleep, fp);
 4814                 break;
 4815         case POSIX_FADV_NORMAL:
 4816                 /*
 4817                  * If a the "normal" region overlaps with an existing
 4818                  * non-standard region, trim or remove the
 4819                  * non-standard region.
 4820                  */
 4821                 mtx_pool_lock(mtxpool_sleep, fp);
 4822                 fa = fp->f_advice;
 4823                 if (fa != NULL) {
 4824                         if (offset <= fa->fa_start && end >= fa->fa_end) {
 4825                                 new = fa;
 4826                                 fp->f_advice = NULL;
 4827                         } else if (offset <= fa->fa_start &&
 4828                             end >= fa->fa_start)
 4829                                 fa->fa_start = end + 1;
 4830                         else if (offset <= fa->fa_end && end >= fa->fa_end)
 4831                                 fa->fa_end = offset - 1;
 4832                         else if (offset >= fa->fa_start && end <= fa->fa_end) {
 4833                                 /*
 4834                                  * If the "normal" region is a middle
 4835                                  * portion of the existing
 4836                                  * non-standard region, just remove
 4837                                  * the whole thing rather than picking
 4838                                  * one side or the other to
 4839                                  * preserve.
 4840                                  */
 4841                                 new = fa;
 4842                                 fp->f_advice = NULL;
 4843                         }
 4844                 }
 4845                 mtx_pool_unlock(mtxpool_sleep, fp);
 4846                 break;
 4847         case POSIX_FADV_WILLNEED:
 4848         case POSIX_FADV_DONTNEED:
 4849                 error = VOP_ADVISE(vp, offset, end, advice);
 4850                 break;
 4851         }
 4852 out:
 4853         if (fp != NULL)
 4854                 fdrop(fp, td);
 4855         free(new, M_FADVISE);
 4856         return (error);
 4857 }
 4858 
 4859 int
 4860 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
 4861 {
 4862         int error;
 4863 
 4864         error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
 4865             uap->advice);
 4866         return (kern_posix_error(td, error));
 4867 }
 4868 
 4869 int
 4870 kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
 4871     off_t *outoffp, size_t len, unsigned int flags)
 4872 {
 4873         struct file *infp, *outfp;
 4874         struct vnode *invp, *outvp;
 4875         int error;
 4876         size_t retlen;
 4877         void *rl_rcookie, *rl_wcookie;
 4878         off_t savinoff, savoutoff;
 4879 
 4880         infp = outfp = NULL;
 4881         rl_rcookie = rl_wcookie = NULL;
 4882         savinoff = -1;
 4883         error = 0;
 4884         retlen = 0;
 4885 
 4886         if (flags != 0) {
 4887                 error = EINVAL;
 4888                 goto out;
 4889         }
 4890         if (len > SSIZE_MAX)
 4891                 /*
 4892                  * Although the len argument is size_t, the return argument
 4893                  * is ssize_t (which is signed).  Therefore a size that won't
 4894                  * fit in ssize_t can't be returned.
 4895                  */
 4896                 len = SSIZE_MAX;
 4897 
 4898         /* Get the file structures for the file descriptors. */
 4899         error = fget_read(td, infd, &cap_read_rights, &infp);
 4900         if (error != 0)
 4901                 goto out;
 4902         if (infp->f_ops == &badfileops) {
 4903                 error = EBADF;
 4904                 goto out;
 4905         }
 4906         if (infp->f_vnode == NULL) {
 4907                 error = EINVAL;
 4908                 goto out;
 4909         }
 4910         error = fget_write(td, outfd, &cap_write_rights, &outfp);
 4911         if (error != 0)
 4912                 goto out;
 4913         if (outfp->f_ops == &badfileops) {
 4914                 error = EBADF;
 4915                 goto out;
 4916         }
 4917         if (outfp->f_vnode == NULL) {
 4918                 error = EINVAL;
 4919                 goto out;
 4920         }
 4921 
 4922         /* Set the offset pointers to the correct place. */
 4923         if (inoffp == NULL)
 4924                 inoffp = &infp->f_offset;
 4925         if (outoffp == NULL)
 4926                 outoffp = &outfp->f_offset;
 4927         savinoff = *inoffp;
 4928         savoutoff = *outoffp;
 4929 
 4930         invp = infp->f_vnode;
 4931         outvp = outfp->f_vnode;
 4932         /* Sanity check the f_flag bits. */
 4933         if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
 4934             (infp->f_flag & FREAD) == 0) {
 4935                 error = EBADF;
 4936                 goto out;
 4937         }
 4938 
 4939         /* If len == 0, just return 0. */
 4940         if (len == 0)
 4941                 goto out;
 4942 
 4943         /*
 4944          * If infp and outfp refer to the same file, the byte ranges cannot
 4945          * overlap.
 4946          */
 4947         if (invp == outvp && ((savinoff <= savoutoff && savinoff + len >
 4948             savoutoff) || (savinoff > savoutoff && savoutoff + len >
 4949             savinoff))) {
 4950                 error = EINVAL;
 4951                 goto out;
 4952         }
 4953 
 4954         /* Range lock the byte ranges for both invp and outvp. */
 4955         for (;;) {
 4956                 rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
 4957                     len);
 4958                 rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
 4959                     len);
 4960                 if (rl_rcookie != NULL)
 4961                         break;
 4962                 vn_rangelock_unlock(outvp, rl_wcookie);
 4963                 rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
 4964                 vn_rangelock_unlock(invp, rl_rcookie);
 4965         }
 4966 
 4967         retlen = len;
 4968         error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
 4969             flags, infp->f_cred, outfp->f_cred, td);
 4970 out:
 4971         if (rl_rcookie != NULL)
 4972                 vn_rangelock_unlock(invp, rl_rcookie);
 4973         if (rl_wcookie != NULL)
 4974                 vn_rangelock_unlock(outvp, rl_wcookie);
 4975         if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
 4976                 *inoffp = savinoff;
 4977                 *outoffp = savoutoff;
 4978         }
 4979         if (outfp != NULL)
 4980                 fdrop(outfp, td);
 4981         if (infp != NULL)
 4982                 fdrop(infp, td);
 4983         td->td_retval[0] = retlen;
 4984         return (error);
 4985 }
 4986 
 4987 int
 4988 sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
 4989 {
 4990         off_t inoff, outoff, *inoffp, *outoffp;
 4991         int error;
 4992 
 4993         inoffp = outoffp = NULL;
 4994         if (uap->inoffp != NULL) {
 4995                 error = copyin(uap->inoffp, &inoff, sizeof(off_t));
 4996                 if (error != 0)
 4997                         return (error);
 4998                 inoffp = &inoff;
 4999         }
 5000         if (uap->outoffp != NULL) {
 5001                 error = copyin(uap->outoffp, &outoff, sizeof(off_t));
 5002                 if (error != 0)
 5003                         return (error);
 5004                 outoffp = &outoff;
 5005         }
 5006         error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
 5007             outoffp, uap->len, uap->flags);
 5008         if (error == 0 && uap->inoffp != NULL)
 5009                 error = copyout(inoffp, uap->inoffp, sizeof(off_t));
 5010         if (error == 0 && uap->outoffp != NULL)
 5011                 error = copyout(outoffp, uap->outoffp, sizeof(off_t));
 5012         return (error);
 5013 }

Cache object: 02fcd14d3c654140bb644e642221deed


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.