The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/bsd/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
    3  *
    4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
    5  * 
    6  * This file contains Original Code and/or Modifications of Original Code
    7  * as defined in and that are subject to the Apple Public Source License
    8  * Version 2.0 (the 'License'). You may not use this file except in
    9  * compliance with the License. The rights granted to you under the License
   10  * may not be used to create, or enable the creation or redistribution of,
   11  * unlawful or unlicensed copies of an Apple operating system, or to
   12  * circumvent, violate, or enable the circumvention or violation of, any
   13  * terms of an Apple operating system software license agreement.
   14  * 
   15  * Please obtain a copy of the License at
   16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
   17  * 
   18  * The Original Code and all software distributed under the License are
   19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   23  * Please see the License for the specific language governing rights and
   24  * limitations under the License.
   25  * 
   26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   27  */
   28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
   29 /*
   30  * Copyright (c) 1982, 1986, 1989, 1993
   31  *      The Regents of the University of California.  All rights reserved.
   32  * (c) UNIX System Laboratories, Inc.
   33  * All or some portions of this file are derived from material licensed
   34  * to the University of California by American Telephone and Telegraph
   35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   36  * the permission of UNIX System Laboratories, Inc.
   37  *
   38  * Redistribution and use in source and binary forms, with or without
   39  * modification, are permitted provided that the following conditions
   40  * are met:
   41  * 1. Redistributions of source code must retain the above copyright
   42  *    notice, this list of conditions and the following disclaimer.
   43  * 2. Redistributions in binary form must reproduce the above copyright
   44  *    notice, this list of conditions and the following disclaimer in the
   45  *    documentation and/or other materials provided with the distribution.
   46  * 3. All advertising materials mentioning features or use of this software
   47  *    must display the following acknowledgement:
   48  *      This product includes software developed by the University of
   49  *      California, Berkeley and its contributors.
   50  * 4. Neither the name of the University nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   64  * SUCH DAMAGE.
   65  *
   66  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
   67  */
   68 /*
   69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
   70  * support for mandatory and extensible security protections.  This notice
   71  * is included in support of clause 2.2 (b) of the Apple Public License,
   72  * Version 2.0.
   73  */
   74 
   75 #include <sys/param.h>
   76 #include <sys/systm.h>
   77 #include <sys/filedesc.h>
   78 #include <sys/ioctl.h>
   79 #include <sys/file_internal.h>
   80 #include <sys/proc_internal.h>
   81 #include <sys/socketvar.h>
   82 #include <sys/uio_internal.h>
   83 #include <sys/kernel.h>
   84 #include <sys/stat.h>
   85 #include <sys/malloc.h>
   86 #include <sys/sysproto.h>
   87 
   88 #include <sys/mount_internal.h>
   89 #include <sys/protosw.h>
   90 #include <sys/ev.h>
   91 #include <sys/user.h>
   92 #include <sys/kdebug.h>
   93 #include <sys/poll.h>
   94 #include <sys/event.h>
   95 #include <sys/eventvar.h>
   96 #include <sys/proc.h>
   97 
   98 #include <mach/mach_types.h>
   99 #include <kern/kern_types.h>
  100 #include <kern/assert.h>
  101 #include <kern/kalloc.h>
  102 #include <kern/thread.h>
  103 #include <kern/clock.h>
  104 #include <kern/ledger.h>
  105 #include <kern/task.h>
  106 
  107 #include <sys/mbuf.h>
  108 #include <sys/socket.h>
  109 #include <sys/socketvar.h>
  110 #include <sys/errno.h>
  111 #include <sys/syscall.h>
  112 #include <sys/pipe.h>
  113 
  114 #include <security/audit/audit.h>
  115 
  116 #include <net/if.h>
  117 #include <net/route.h>
  118 
  119 #include <netinet/in.h>
  120 #include <netinet/in_systm.h>
  121 #include <netinet/ip.h>
  122 #include <netinet/in_pcb.h>
  123 #include <netinet/ip_var.h>
  124 #include <netinet/ip6.h>
  125 #include <netinet/tcp.h>
  126 #include <netinet/tcp_fsm.h>
  127 #include <netinet/tcp_seq.h>
  128 #include <netinet/tcp_timer.h>
  129 #include <netinet/tcp_var.h>
  130 #include <netinet/tcpip.h>
  131 #include <netinet/tcp_debug.h>
  132 /* for wait queue based select */
  133 #include <kern/wait_queue.h>
  134 #include <kern/kalloc.h>
  135 #include <sys/vnode_internal.h>
  136 
  137 /* XXX should be in a header file somewhere */
  138 void evsofree(struct socket *);
  139 void evpipefree(struct pipe *);
  140 void postpipeevent(struct pipe *, int);
  141 void postevent(struct socket *, struct sockbuf *, int);
  142 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
  143 
  144 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
  145 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
  146 extern void     *get_bsduthreadarg(thread_t);
  147 extern int      *get_bsduthreadrval(thread_t);
  148 
  149 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
  150                                                                    user_addr_t bufp, user_size_t nbyte, 
  151                                                                    off_t offset, int flags, user_ssize_t *retval);
  152 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
  153                                                                         user_addr_t bufp, user_size_t nbyte, 
  154                                                                         off_t offset, int flags, user_ssize_t *retval);
  155 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
  156 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
  157 
  158 
  159 /* Conflict wait queue for when selects collide (opaque type) */
  160 struct wait_queue select_conflict_queue;
  161 
  162 /*
  163  * Init routine called from bsd_init.c
  164  */
/* Prototype kept local: the only external caller is bsd_init.c. */
void select_wait_queue_init(void);
void
select_wait_queue_init(void)
{
        /* Initialize the shared conflict queue with a FIFO sync policy. */
        wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO);
}
  171 
  172 
  173 #if NETAT
  174 extern int appletalk_inited;
  175 #endif /* NETAT */
  176 
  177 #define f_flag f_fglob->fg_flag
  178 #define f_type f_fglob->fg_type
  179 #define f_msgcount f_fglob->fg_msgcount
  180 #define f_cred f_fglob->fg_cred
  181 #define f_ops f_fglob->fg_ops
  182 #define f_offset f_fglob->fg_offset
  183 #define f_data f_fglob->fg_data
  184 
  185 /*
  186  * Read system call.
  187  *
  188  * Returns:     0                       Success
  189  *      preparefileread:EBADF
  190  *      preparefileread:ESPIPE
  191  *      preparefileread:ENXIO
  192  *      preparefileread:EBADF
  193  *      dofileread:???
  194  */
  195 int
  196 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
  197 {
  198         __pthread_testcancel(1);
  199         return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
  200 }
  201 
  202 int
  203 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
  204 {
  205         struct fileproc *fp;
  206         int error;
  207         int fd = uap->fd;
  208         struct vfs_context context;
  209 
  210         if ( (error = preparefileread(p, &fp, fd, 0)) )
  211                 return (error);
  212 
  213         context = *(vfs_context_current());
  214         context.vc_ucred = fp->f_fglob->fg_cred;
  215 
  216         error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
  217                            (off_t)-1, 0, retval);
  218 
  219         donefileread(p, fp, fd);
  220 
  221         return (error);
  222 }
  223 
  224 /* 
  225  * Pread system call
  226  *
  227  * Returns:     0                       Success
  228  *      preparefileread:EBADF
  229  *      preparefileread:ESPIPE
  230  *      preparefileread:ENXIO
  231  *      preparefileread:EBADF
  232  *      dofileread:???
  233  */
  234 int
  235 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
  236 {
  237         __pthread_testcancel(1);
  238         return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
  239 }
  240 
  241 int
  242 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
  243 {
  244         struct fileproc *fp = NULL;     /* fp set by preparefileread() */
  245         int fd = uap->fd;
  246         int error;
  247         struct vfs_context context;
  248 
  249         if ( (error = preparefileread(p, &fp, fd, 1)) )
  250                 goto out;
  251 
  252         context = *(vfs_context_current());
  253         context.vc_ucred = fp->f_fglob->fg_cred;
  254 
  255         error = dofileread(&context, fp, uap->buf, uap->nbyte,
  256                         uap->offset, FOF_OFFSET, retval);
  257         
  258         donefileread(p, fp, fd);
  259 
  260         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
  261               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
  262 
  263 out:
  264         return (error);
  265 }
  266 
  267 /*
  268  * Code common for read and pread
  269  */
  270 
/*
 * donefileread
 *
 * Balance preparefileread(): clear the char-device-read marker that
 * preparefileread() may have set, and release the fileproc reference
 * it took.  Runs under the fd spin lock; fp_drop()'s final argument
 * of 1 reflects that the lock is held.
 */
void
donefileread(struct proc *p, struct fileproc *fp, int fd)
{
        proc_fdlock_spin(p);

        /* Clear marker set for VCHR vnodes in preparefileread(). */
        fp->f_flags &= ~FP_INCHRREAD;

        fp_drop(p, fd, fp, 1);
        proc_fdunlock(p);
}
  281 
  282 /*
  283  * Returns:     0                       Success
  284  *              EBADF
  285  *              ESPIPE
  286  *              ENXIO
  287  *      fp_lookup:EBADF
  288  *      fo_read:???
  289  */
/*
 * preparefileread
 *
 * Look up the fileproc for fd, take an I/O reference on it, and verify
 * that it is open for reading.  When check_for_pread is set, also
 * reject objects with no notion of a file offset (non-vnodes, FIFOs)
 * and controlling terminals.  On success, *fp_ret holds the referenced
 * fileproc and the fd lock has been released; the caller must balance
 * with donefileread().
 */
int
preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
{
        vnode_t vp;
        int     error;
        struct fileproc *fp;

        AUDIT_ARG(fd, fd);

        proc_fdlock_spin(p);

        /* Take an I/O reference; called with the fd lock held (arg 1). */
        error = fp_lookup(p, fd, &fp, 1);

        if (error) {
                proc_fdunlock(p);
                return (error);
        }
        /* Descriptor must have been opened for reading. */
        if ((fp->f_flag & FREAD) == 0) {
                error = EBADF;
                goto out;
        }
        /* pread() requires a seekable object, i.e. a vnode. */
        if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
                error = ESPIPE;
                goto out;
        }
        if (fp->f_type == DTYPE_VNODE) {
                vp = (struct vnode *)fp->f_fglob->fg_data;

                /* FIFOs have no file offset either. */
                if (check_for_pread && (vnode_isfifo(vp))) {
                        error = ESPIPE;
                        goto out;
                } 
                /* ...nor do controlling terminals. */
                if (check_for_pread && (vp->v_flag & VISTTY)) {
                        error = ENXIO;
                        goto out;
                }
                /* Mark char-device reads; cleared in donefileread(). */
                if (vp->v_type == VCHR)
                        fp->f_flags |= FP_INCHRREAD;
        }

        *fp_ret = fp;

        proc_fdunlock(p);
        return (0);

out:
        /* Error path: drop the reference taken above (fd lock held). */
        fp_drop(p, fd, fp, 1);
        proc_fdunlock(p);
        return (error);
}
  340 
  341 
  342 /*
  343  * Returns:     0                       Success
  344  *              EINVAL
  345  *      fo_read:???
  346  */
/*
 * dofileread
 *
 * Common back end for read()/pread(): build a single-iovec uio on the
 * stack describing (bufp, nbyte) and push it through fo_read().  The
 * byte count actually transferred is returned via *retval.
 */
__private_extern__ int
dofileread(vfs_context_t ctx, struct fileproc *fp,
           user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
           user_ssize_t *retval)
{
        uio_t auio;
        user_ssize_t bytecnt;
        long error = 0;
        /* Stack storage for a one-iovec uio; avoids a heap allocation. */
        char uio_buf[ UIO_SIZEOF(1) ];

        /* Transfers are capped at INT_MAX bytes. */
        if (nbyte > INT_MAX)
                return (EINVAL);

        /* Iovec address width depends on the requesting process's ABI. */
        if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ, 
                                                                          &uio_buf[0], sizeof(uio_buf));
        } else {
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ, 
                                                                          &uio_buf[0], sizeof(uio_buf));
        }
        uio_addiov(auio, bufp, nbyte);

        bytecnt = nbyte;

        if ((error = fo_read(fp, auio, flags, ctx))) {
                /* If data moved before an interruption, report the
                 * partial read instead of the error. */
                if (uio_resid(auio) != bytecnt && (error == ERESTART ||
                        error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        /* Bytes actually transferred = requested - residual. */
        bytecnt -= uio_resid(auio);

        *retval = bytecnt;

        return (error);
}
  382 
  383 /*      
  384  * Scatter read system call.
  385  *
  386  * Returns:     0                       Success
  387  *              EINVAL
  388  *              ENOMEM
  389  *      copyin:EFAULT
  390  *      rd_uio:???
  391  */
  392 int
  393 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
  394 {
  395         __pthread_testcancel(1);
  396         return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
  397 }
  398 
  399 int
  400 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
  401 {
  402         uio_t auio = NULL;
  403         int error;
  404         struct user_iovec *iovp;
  405 
  406         /* Verify range bedfore calling uio_create() */
  407         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
  408                 return (EINVAL);
  409 
  410         /* allocate a uio large enough to hold the number of iovecs passed */
  411         auio = uio_create(uap->iovcnt, 0,
  412                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
  413                                   UIO_READ);
  414                                   
  415         /* get location of iovecs within the uio.  then copyin the iovecs from
  416          * user space.
  417          */
  418         iovp = uio_iovsaddr(auio);
  419         if (iovp == NULL) {
  420                 error = ENOMEM;
  421                 goto ExitThisRoutine;
  422         }
  423         error = copyin_user_iovec_array(uap->iovp,
  424                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
  425                 uap->iovcnt, iovp);
  426         if (error) {
  427                 goto ExitThisRoutine;
  428         }
  429         
  430         /* finalize uio_t for use and do the IO 
  431          */
  432         uio_calculateresid(auio);
  433         error = rd_uio(p, uap->fd, auio, retval);
  434 
  435 ExitThisRoutine:
  436         if (auio != NULL) {
  437                 uio_free(auio);
  438         }
  439         return (error);
  440 }
  441 
  442 /*
  443  * Write system call
  444  *
  445  * Returns:     0                       Success
  446  *              EBADF
  447  *      fp_lookup:EBADF
  448  *      dofilewrite:???
  449  */
  450 int
  451 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
  452 {
  453         __pthread_testcancel(1);
  454         return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
  455 
  456 }
  457 
  458 int
  459 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
  460 {
  461         struct fileproc *fp;
  462         int error;      
  463         int fd = uap->fd;
  464 
  465         AUDIT_ARG(fd, fd);
  466 
  467         error = fp_lookup(p,fd,&fp,0);
  468         if (error)
  469                 return(error);
  470         if ((fp->f_flag & FWRITE) == 0) {
  471                 error = EBADF;
  472         } else {
  473                 struct vfs_context context = *(vfs_context_current());
  474                 context.vc_ucred = fp->f_fglob->fg_cred;
  475 
  476                 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
  477                         (off_t)-1, 0, retval);
  478         }
  479         if (error == 0)
  480                 fp_drop_written(p, fd, fp);
  481         else
  482                 fp_drop(p, fd, fp, 0);
  483         return(error);  
  484 }
  485 
  486 /*                          
  487  * pwrite system call
  488  *
  489  * Returns:     0                       Success
  490  *              EBADF
  491  *              ESPIPE
  492  *              ENXIO
  493  *              EINVAL
  494  *      fp_lookup:EBADF
  495  *      dofilewrite:???
  496  */
  497 int
  498 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
  499 {
  500         __pthread_testcancel(1);
  501         return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
  502 }
  503 
/*
 * pwrite_nocancel
 *
 * Positioned write: requires a writable, seekable vnode (not a FIFO,
 * not a tty) and an explicit, non-negative offset.  The transfer is
 * performed by dofilewrite() with FOF_OFFSET.
 */
int
pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
{
        struct fileproc *fp;
        int error; 
        int fd = uap->fd;
        vnode_t vp  = (vnode_t)0;

        AUDIT_ARG(fd, fd);

        error = fp_lookup(p,fd,&fp,0);
        if (error)
                return(error);

        /* Descriptor must have been opened for writing. */
        if ((fp->f_flag & FWRITE) == 0) {
                error = EBADF;
        } else {
                struct vfs_context context = *vfs_context_current();
                /* Write with the credential the file was opened with. */
                context.vc_ucred = fp->f_fglob->fg_cred;

                /* pwrite() requires a seekable object, i.e. a vnode. */
                if (fp->f_type != DTYPE_VNODE) {
                        error = ESPIPE;
                        goto errout;
                }
                vp = (vnode_t)fp->f_fglob->fg_data;
                /* FIFOs have no file offset either. */
                if (vnode_isfifo(vp)) {
                        error = ESPIPE;
                        goto errout;
                } 
                /* ...nor do controlling terminals. */
                if ((vp->v_flag & VISTTY)) {
                        error = ENXIO;
                        goto errout;
                }
                /* (off_t)-1 is used elsewhere in this file to mean "no
                 * explicit offset", so reject it as a pwrite offset. */
                if (uap->offset == (off_t)-1) {
                        error = EINVAL;
                        goto errout;
                }

                error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
                        uap->offset, FOF_OFFSET, retval);
        }
errout:
        /* Release the reference; use the "written" variant on success. */
        if (error == 0)
                fp_drop_written(p, fd, fp);
        else
                fp_drop(p, fd, fp, 0);

        /* Trace the call for kdebug consumers (offset split into halves). */
        KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
              uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
        
        return(error);
}
  556 
  557 /*
  558  * Returns:     0                       Success
  559  *              EINVAL
  560  *      <fo_write>:EPIPE
  561  *      <fo_write>:???                  [indirect through struct fileops]
  562  */
/*
 * dofilewrite
 *
 * Common back end for write()/pwrite(): build a single-iovec uio on
 * the stack describing (bufp, nbyte) and push it through fo_write().
 * Posts SIGPIPE on EPIPE for non-sockets unless the fileglob has
 * FG_NOSIGPIPE set.  The transferred byte count goes out via *retval.
 */
__private_extern__ int                  
dofilewrite(vfs_context_t ctx, struct fileproc *fp,
            user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
            user_ssize_t *retval)
{       
        uio_t auio;
        long error = 0;
        user_ssize_t bytecnt;
        /* Stack storage for a one-iovec uio; avoids a heap allocation. */
        char uio_buf[ UIO_SIZEOF(1) ];

        /* Transfers are capped at INT_MAX bytes. */
        if (nbyte > INT_MAX)   
                return (EINVAL);

        /* Iovec address width depends on the requesting process's ABI. */
        if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE, 
                                                                          &uio_buf[0], sizeof(uio_buf));
        } else {
                auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE, 
                                                                          &uio_buf[0], sizeof(uio_buf));
        }
        uio_addiov(auio, bufp, nbyte);

        bytecnt = nbyte; 
        if ((error = fo_write(fp, auio, flags, ctx))) {
                /* If data moved before an interruption, report the
                 * partial write instead of the error. */
                if (uio_resid(auio) != bytecnt && (error == ERESTART ||
                        error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                /* The socket layer handles SIGPIPE */
                if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
                    (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
                        /* XXX Raise the signal on the thread? */
                        psignal(vfs_context_proc(ctx), SIGPIPE);
                }
        }
        /* Bytes actually transferred = requested - residual. */
        bytecnt -= uio_resid(auio);
        *retval = bytecnt;

        return (error); 
}
  602         
  603 /*      
  604  * Gather write system call  
  605  */     
  606 int
  607 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
  608 {
  609         __pthread_testcancel(1);
  610         return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
  611 }
  612 
  613 int
  614 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
  615 {
  616         uio_t auio = NULL;
  617         int error;
  618         struct user_iovec *iovp;
  619 
  620         AUDIT_ARG(fd, uap->fd);
  621 
  622         /* Verify range bedfore calling uio_create() */
  623         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
  624                 return (EINVAL);
  625 
  626         /* allocate a uio large enough to hold the number of iovecs passed */
  627         auio = uio_create(uap->iovcnt, 0,
  628                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
  629                                   UIO_WRITE);
  630                                   
  631         /* get location of iovecs within the uio.  then copyin the iovecs from
  632          * user space.
  633          */
  634         iovp = uio_iovsaddr(auio);
  635         if (iovp == NULL) {
  636                 error = ENOMEM;
  637                 goto ExitThisRoutine;
  638         }
  639         error = copyin_user_iovec_array(uap->iovp,
  640                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
  641                 uap->iovcnt, iovp);
  642         if (error) {
  643                 goto ExitThisRoutine;
  644         }
  645         
  646         /* finalize uio_t for use and do the IO 
  647          */
  648         uio_calculateresid(auio);
  649         error = wr_uio(p, uap->fd, auio, retval);
  650 
  651 ExitThisRoutine:
  652         if (auio != NULL) {
  653                 uio_free(auio);
  654         }
  655         return (error);
  656 }
  657 
  658 
/*
 * wr_uio
 *
 * Common back end for writev(): look up and validate the descriptor
 * for writing, push the caller-built uio through fo_write(), and
 * return the transferred byte count via *retval.  Mirrors the
 * partial-transfer and SIGPIPE handling of dofilewrite().
 */
int
wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
{
        struct fileproc *fp;
        int error;
        user_ssize_t count;
        struct vfs_context context = *vfs_context_current();

        error = fp_lookup(p,fdes,&fp,0);
        if (error)
                return(error);

        /* Descriptor must have been opened for writing. */
        if ((fp->f_flag & FWRITE) == 0) {
                error = EBADF;
                goto out;
        }
        /* Remember the requested size to compute the transfer later. */
        count = uio_resid(uio);

        /* Write with the credential the file was opened with. */
        context.vc_ucred = fp->f_cred;
        error = fo_write(fp, uio, 0, &context);
        if (error) {
                /* If data moved before an interruption, report the
                 * partial write instead of the error. */
                if (uio_resid(uio) != count && (error == ERESTART ||
                                                error == EINTR || error == EWOULDBLOCK))
                        error = 0;
                /* The socket layer handles SIGPIPE */
                if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
                    (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
                        psignal(p, SIGPIPE);
        }
        *retval = count - uio_resid(uio);

out:
        /* Release the reference; use the "written" variant on success. */
        if (error == 0)
                fp_drop_written(p, fdes, fp);
        else
                fp_drop(p, fdes, fp, 0);
        return(error);
}
  697 
  698 
/*
 * rd_uio
 *
 * Common back end for readv(): validate the descriptor for reading via
 * preparefileread(), push the caller-built uio through fo_read(), and
 * return the transferred byte count via *retval.
 */
int
rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
{
        struct fileproc *fp;
        int error;
        user_ssize_t count;
        struct vfs_context context = *vfs_context_current();

        /* Take a reference on the fileproc and verify it is readable. */
        if ( (error = preparefileread(p, &fp, fdes, 0)) )
                return (error);

        /* Remember the requested size to compute the transfer later. */
        count = uio_resid(uio);

        /* Read with the credential the file was opened with. */
        context.vc_ucred = fp->f_cred;

        error = fo_read(fp, uio, 0, &context);

        if (error) {
                /* If data moved before an interruption, report the
                 * partial read instead of the error. */
                if (uio_resid(uio) != count && (error == ERESTART ||
                                                error == EINTR || error == EWOULDBLOCK))
                        error = 0;
        }
        *retval = count - uio_resid(uio);

        /* Balance preparefileread(): clear flags, drop the reference. */
        donefileread(p, fp, fdes);

        return (error);
}
  727 
  728 /*
  729  * Ioctl system call
  730  *
  731  * Returns:     0                       Success
  732  *              EBADF
  733  *              ENOTTY
  734  *              ENOMEM
  735  *              ESRCH
  736  *      copyin:EFAULT
   737  *      copyout:EFAULT
  738  *      fp_lookup:EBADF                 Bad file descriptor
  739  *      fo_ioctl:???
  740  */
  741 int
  742 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
  743 {
  744         struct fileproc *fp;
  745         u_long com;
  746         int error = 0;
  747         u_int size;
  748         caddr_t datap, memp;
  749         boolean_t is64bit;
  750         int tmp;
  751 #define STK_PARAMS      128
  752         char stkbuf[STK_PARAMS];
  753         int fd = uap->fd;
  754         struct vfs_context context = *vfs_context_current();
  755 
  756         AUDIT_ARG(fd, uap->fd);
  757         AUDIT_ARG(addr, uap->data);
  758 
  759         is64bit = proc_is64bit(p);
  760 #if CONFIG_AUDIT
  761         if (is64bit)
  762                 AUDIT_ARG(value64, uap->com);
  763         else
  764                 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, uap->com));
  765 #endif /* CONFIG_AUDIT */
  766 
  767         proc_fdlock(p);
  768         error = fp_lookup(p,fd,&fp,1);
  769         if (error)  {
  770                 proc_fdunlock(p);
  771                 return(error);
  772         }
  773 
  774         AUDIT_ARG(file, p, fp);
  775 
  776         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  777                         error = EBADF;
  778                         goto out;
  779         }
  780 
  781         context.vc_ucred = fp->f_fglob->fg_cred;
  782 
  783 #if CONFIG_MACF
  784         error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, uap->com);
  785         if (error)
  786                 goto out;
  787 #endif
  788                 
  789 #if NETAT
  790         /*
  791          * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
  792          * while implementing an ATioctl system call
  793          */
  794         {
  795                 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
  796                         u_long  fixed_command;
  797 
  798 #ifdef APPLETALK_DEBUG
  799                         kprintf("ioctl: special AppleTalk \n");
  800 #endif
  801                         datap = &stkbuf[0];
  802                         *(user_addr_t *)datap = uap->data;
  803                         fixed_command = _IOW(0, 0xff99, uap->data);
  804                         error = fo_ioctl(fp, fixed_command, datap, &context);
  805                         goto out;
  806                 }
  807         }
  808 
  809 #endif /* NETAT */
  810 
  811 
  812         switch (com = uap->com) {
  813         case FIONCLEX:
  814                 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
  815                 error =0;
  816                 goto out;
  817         case FIOCLEX:
  818                 *fdflags(p, uap->fd) |= UF_EXCLOSE;
  819                 error =0;
  820                 goto out;
  821         }
  822 
  823         /*
  824          * Interpret high order word to find amount of data to be
  825          * copied to/from the user's address space.
  826          */
  827         size = IOCPARM_LEN(com);
  828         if (size > IOCPARM_MAX) {
  829                         error = ENOTTY;
  830                         goto out;
  831         }
  832         memp = NULL;
  833         if (size > sizeof (stkbuf)) {
  834                 proc_fdunlock(p);
  835                 if ((memp = (caddr_t)kalloc(size)) == 0) {
  836                         proc_fdlock(p);
  837                         error = ENOMEM;
  838                         goto out;
  839                 }
  840                 proc_fdlock(p);
  841                 datap = memp;
  842         } else
  843                 datap = &stkbuf[0];
  844         if (com&IOC_IN) {
  845                 if (size) {
  846                         proc_fdunlock(p);
  847                         error = copyin(uap->data, datap, size);
  848                         if (error) {
  849                                 if (memp)
  850                                         kfree(memp, size);
  851                                 proc_fdlock(p);
  852                                 goto out;
  853                         }
  854                         proc_fdlock(p);
  855                 } else {
  856                         /* XXX - IOC_IN and no size?  we should proably return an error here!! */
  857                         if (is64bit) {
  858                                 *(user_addr_t *)datap = uap->data;
  859                         }
  860                         else {
  861                                 *(uint32_t *)datap = (uint32_t)uap->data;
  862                         }
  863                 }
  864         } else if ((com&IOC_OUT) && size)
  865                 /*
  866                  * Zero the buffer so the user always
  867                  * gets back something deterministic.
  868                  */
  869                 bzero(datap, size);
  870         else if (com&IOC_VOID) {
  871                 /* XXX - this is odd since IOC_VOID means no parameters */
  872                 if (is64bit) {
  873                         *(user_addr_t *)datap = uap->data;
  874                 }
  875                 else {
  876                         *(uint32_t *)datap = (uint32_t)uap->data;
  877                 }
  878         }
  879 
  880         switch (com) {
  881 
  882         case FIONBIO:
  883                 if ( (tmp = *(int *)datap) )
  884                         fp->f_flag |= FNONBLOCK;
  885                 else
  886                         fp->f_flag &= ~FNONBLOCK;
  887                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
  888                 break;
  889 
  890         case FIOASYNC:
  891                 if ( (tmp = *(int *)datap) )
  892                         fp->f_flag |= FASYNC;
  893                 else
  894                         fp->f_flag &= ~FASYNC;
  895                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
  896                 break;
  897 
  898         case FIOSETOWN:
  899                 tmp = *(int *)datap;
  900                 if (fp->f_type == DTYPE_SOCKET) {
  901                         ((struct socket *)fp->f_data)->so_pgid = tmp;
  902                         error = 0;
  903                         break;
  904                 }
  905                 if (fp->f_type == DTYPE_PIPE) {
  906                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
  907                         break;
  908                 }
  909                 if (tmp <= 0) {
  910                         tmp = -tmp;
  911                 } else {
  912                         struct proc *p1 = proc_find(tmp);
  913                         if (p1 == 0) {
  914                                 error = ESRCH;
  915                                 break;
  916                         }
  917                         tmp = p1->p_pgrpid;
  918                         proc_rele(p1);
  919                 }
  920                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
  921                 break;
  922 
  923         case FIOGETOWN:
  924                 if (fp->f_type == DTYPE_SOCKET) {
  925                         error = 0;
  926                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
  927                         break;
  928                 }
  929                 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
  930                 *(int *)datap = -*(int *)datap;
  931                 break;
  932 
  933         default:
  934                 error = fo_ioctl(fp, com, datap, &context);
  935                 /*
  936                  * Copy any data to user, size was
  937                  * already set and checked above.
  938                  */
  939                 if (error == 0 && (com&IOC_OUT) && size)
  940                         error = copyout(datap, uap->data, (u_int)size);
  941                 break;
  942         }
  943         proc_fdunlock(p);
  944         if (memp)
  945                 kfree(memp, size);
  946         proc_fdlock(p);
  947 out:
  948         fp_drop(p, fd, fp, 1);
  949         proc_fdunlock(p);
  950         return(error);
  951 }
  952 
/* Global select wakeup channel and collision counter (see selwakeup). */
int     selwait, nselcoll;
/* Pass identifiers for selprocess(): initial scan vs. post-wakeup rescan. */
#define SEL_FIRSTPASS 1
#define SEL_SECONDPASS 2
/* Continuation entry points used with tsleep1(). */
extern int selcontinue(int error);
extern int selprocess(int error, int sel_pass);
/* Scan descriptors in the bit vectors, registering/deregistering wait queues. */
static int selscan(struct proc *p, struct _select * sel,
                        int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub);
/* Count set bits and take f_iocount references on the referenced fps. */
static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
/* Release the references taken by selcount(); _locked assumes proc_fdlock held. */
static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
  963 
  964 /*
  965  * Select system call.
  966  *
  967  * Returns:     0                       Success
  968  *              EINVAL                  Invalid argument
  969  *              EAGAIN                  Nonconformant error if allocation fails
  970  *      selprocess:???
  971  */
  972 int
  973 select(struct proc *p, struct select_args *uap, int32_t *retval)
  974 {
  975         __pthread_testcancel(1);
  976         return(select_nocancel(p, (struct select_nocancel_args *)uap, retval));
  977 }
  978 
/*
 * select_nocancel
 *
 * Set up the per-thread select state (bit vectors, timeout deadline,
 * f_iocount references via selcount(), and the wait queue set), then
 * enter the selprocess() state machine on its first pass.  On any
 * setup error we jump to "continuation" and return the error; selcount()
 * cleans up after itself, so no unwinding is needed here.
 */
int
select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
{
        int error = 0;
        u_int ni, nw, size;
        thread_t th_act;
        struct uthread  *uth;
        struct _select *sel;
        int needzerofill = 1;
        int count = 0;

        th_act = current_thread();
        uth = get_bsdthread_info(th_act);
        sel = &uth->uu_select;
        /* Use the uthread's return-value area, not the incoming parameter. */
        retval = (int *)get_bsduthreadrval(th_act);
        *retval = 0;

        if (uap->nd < 0) {
                return (EINVAL);
        }

        /* select on thread of process that already called proc_exit() */
        if (p->p_fd == NULL) {
                return (EBADF);
        }

        if (uap->nd > p->p_fd->fd_nfiles)
                uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */

        /* ni = bytes per fd_set; three sets (in/ou/ex) are stored contiguously */
        nw = howmany(uap->nd, NFDBITS);
        ni = nw * sizeof(fd_mask);

        /*
         * if the previously allocated space for the bits is smaller than
         * what is requested or no space has yet been allocated for this
         * thread, allocate enough space now.
         *
         * Note: If this allocation fails, select() will return EAGAIN; this
         * is the same thing poll() returns in a no-memory situation, but
         * it is not a POSIX compliant error code for select().
         */
        if (sel->nbytes < (3 * ni)) {
                int nbytes = 3 * ni;

                /* Free previous allocation, if any */
                if (sel->ibits != NULL)
                        FREE(sel->ibits, M_TEMP);
                if (sel->obits != NULL) {
                        FREE(sel->obits, M_TEMP);
                        /* NULL out; subsequent ibits allocation may fail */
                        sel->obits = NULL;
                }

                MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
                if (sel->ibits == NULL)
                        return (EAGAIN);
                MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
                if (sel->obits == NULL) {
                        FREE(sel->ibits, M_TEMP);
                        sel->ibits = NULL;
                        return (EAGAIN);
                }
                sel->nbytes = nbytes;
                /* Fresh M_ZERO allocation: skip the explicit bzero below. */
                needzerofill = 0;
        }

        if (needzerofill) {
                bzero((caddr_t)sel->ibits, sel->nbytes);
                bzero((caddr_t)sel->obits, sel->nbytes);
        }

        /*
         * get the bits from the user address space
         */
#define getbits(name, x) \
        do { \
                if (uap->name && (error = copyin(uap->name, \
                        (caddr_t)&sel->ibits[(x) * nw], ni))) \
                        goto continuation; \
        } while (0)

        getbits(in, 0);
        getbits(ou, 1);
        getbits(ex, 2);
#undef  getbits

        if (uap->tv) {
                struct timeval atv;
                if (IS_64BIT_PROCESS(p)) {
                        struct user64_timeval atv64;
                        error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
                        /* Loses resolution - assume timeout < 68 years */
                        atv.tv_sec = atv64.tv_sec;
                        atv.tv_usec = atv64.tv_usec;
                } else {
                        struct user32_timeval atv32;
                        error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
                        atv.tv_sec = atv32.tv_sec;
                        atv.tv_usec = atv32.tv_usec;
                }
                if (error)
                        goto continuation;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto continuation;
                }

                /* Convert the relative timeval to an absolute deadline. */
                clock_absolutetime_interval_to_deadline(
                                                                                tvtoabstime(&atv), &sel->abstime);
        }
        else
                sel->abstime = 0;

        /* Takes an f_iocount reference on each fd named in the bit vectors. */
        if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
                        goto continuation;
        }

        /*
         * Size the per-thread wait queue set: one set header plus one
         * link per selected descriptor; grow (never shrink) the cached
         * allocation.
         */
        sel->count = count;
        size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
        if (uth->uu_allocsize) {
                if (uth->uu_wqset == 0)
                        panic("select: wql memory smashed");
                /* needed for the select now */
                if (size > uth->uu_allocsize) {
                        kfree(uth->uu_wqset,  uth->uu_allocsize);
                        uth->uu_allocsize = size;
                        uth->uu_wqset = (wait_queue_set_t)kalloc(size);
                        if (uth->uu_wqset == (wait_queue_set_t)NULL)
                                panic("failed to allocate memory for waitqueue\n");
                }
        } else {
                uth->uu_allocsize = size;
                uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize);
                if (uth->uu_wqset == (wait_queue_set_t)NULL)
                        panic("failed to allocate memory for waitqueue\n");
        }
        bzero(uth->uu_wqset, size);
        /* The wait queue links live immediately after the set header. */
        sel->wql = (char *)uth->uu_wqset + SIZEOF_WAITQUEUE_SET;
        wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));

continuation:

        if (error) {
                /*
                 * We have already cleaned up any state we established,
                 * either locally or as a result of selcount().  We don't
                 * need to wait_subqueue_unlink_all(), since we haven't set
                 * anything at this point.
                 */
                return (error);
        }

        return selprocess(0, SEL_FIRSTPASS);
}
 1133 
 1134 int
 1135 selcontinue(int error)
 1136 {
 1137         return selprocess(error, SEL_SECONDPASS);
 1138 }
 1139 
 1140 
 1141 /*
 1142  * selprocess
 1143  *
 1144  * Parameters:  error                   The error code from our caller
 1145  *              sel_pass                The pass we are on
 1146  */
 1147 int
 1148 selprocess(int error, int sel_pass)
 1149 {
 1150         int ncoll;
 1151         u_int ni, nw;
 1152         thread_t th_act;
 1153         struct uthread  *uth;
 1154         struct proc *p;
 1155         struct select_args *uap;
 1156         int *retval;
 1157         struct _select *sel;
 1158         int unwind = 1;
 1159         int prepost = 0;
 1160         int somewakeup = 0;
 1161         int doretry = 0;
 1162         wait_result_t wait_result;
 1163 
 1164         p = current_proc();
 1165         th_act = current_thread();
 1166         uap = (struct select_args *)get_bsduthreadarg(th_act);
 1167         retval = (int *)get_bsduthreadrval(th_act);
 1168         uth = get_bsdthread_info(th_act);
 1169         sel = &uth->uu_select;
 1170 
 1171         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
 1172                         unwind = 0;
 1173         if (sel->count == 0)
 1174                         unwind = 0;
 1175 retry:
 1176         if (error != 0) {
 1177                 sel_pass = SEL_FIRSTPASS;       /* Reset for seldrop */
 1178                 goto done;
 1179         }
 1180 
 1181         ncoll = nselcoll;
 1182         OSBitOrAtomic(P_SELECT, &p->p_flag);
 1183         /* skip scans if the select is just for timeouts */
 1184         if (sel->count) {
 1185                 /*
 1186                  * Clear out any dangling refs from prior calls; technically
 1187                  * there should not be any.
 1188                  */
 1189                 if (sel_pass == SEL_FIRSTPASS)
 1190                         wait_queue_sub_clearrefs(uth->uu_wqset);
 1191 
 1192                 error = selscan(p, sel, uap->nd, retval, sel_pass, (wait_queue_sub_t)uth->uu_wqset);
 1193                 if (error || *retval) {
 1194                         goto done;
 1195                 }
 1196                 if (prepost) {
 1197                         /* if the select of log, then we canwakeup and discover some one
 1198                         * else already read the data; go toselct again if time permits
 1199                         */
 1200                         prepost = 0;
 1201                         doretry = 1;
 1202                 }
 1203                 if (somewakeup) {
 1204                         somewakeup = 0;
 1205                         doretry = 1;
 1206                 }
 1207         }
 1208 
 1209         if (uap->tv) {
 1210                 uint64_t        now;
 1211 
 1212                 clock_get_uptime(&now);
 1213                 if (now >= sel->abstime)
 1214                         goto done;
 1215         }
 1216 
 1217         if (doretry) {
 1218                 /* cleanup obits and try again */
 1219                 doretry = 0;
 1220                 sel_pass = SEL_FIRSTPASS;
 1221                 goto retry;
 1222         }
 1223 
 1224         /*
 1225          * To effect a poll, the timeout argument should be
 1226          * non-nil, pointing to a zero-valued timeval structure.
 1227          */
 1228         if (uap->tv && sel->abstime == 0) {
 1229                 goto done;
 1230         }
 1231 
 1232         /* No spurious wakeups due to colls,no need to check for them */
 1233          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
 1234                 sel_pass = SEL_FIRSTPASS;
 1235                 goto retry;
 1236         }
 1237 
 1238         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
 1239 
 1240         /* if the select is just for timeout skip check */
 1241         if (sel->count &&(sel_pass == SEL_SECONDPASS))
 1242                 panic("selprocess: 2nd pass assertwaiting");
 1243 
 1244         /* Wait Queue Subordinate has waitqueue as first element */
 1245         wait_result = wait_queue_assert_wait((wait_queue_t)uth->uu_wqset,
 1246                                              NULL, THREAD_ABORTSAFE, sel->abstime);
 1247         if (wait_result != THREAD_AWAKENED) {
 1248                 /* there are no preposted events */
 1249                 error = tsleep1(NULL, PSOCK | PCATCH,
 1250                                 "select", 0, selcontinue);
 1251         } else  {
 1252                 prepost = 1;
 1253                 error = 0;
 1254         }
 1255 
 1256         if (error == 0) {
 1257                 sel_pass = SEL_SECONDPASS;
 1258                 if (!prepost)
 1259                         somewakeup = 1;
 1260                 goto retry;
 1261         }
 1262 done:
 1263         if (unwind) {
 1264                 wait_subqueue_unlink_all(uth->uu_wqset);
 1265                 seldrop(p, sel->ibits, uap->nd);
 1266         }
 1267         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
 1268         /* select is not restarted after signals... */
 1269         if (error == ERESTART)
 1270                 error = EINTR;
 1271         if (error == EWOULDBLOCK)
 1272                 error = 0;
 1273         nw = howmany(uap->nd, NFDBITS);
 1274         ni = nw * sizeof(fd_mask);
 1275 
 1276 #define putbits(name, x) \
 1277         do { \
 1278                 if (uap->name && (error2 = \
 1279                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
 1280                         error = error2; \
 1281         } while (0)
 1282 
 1283         if (error == 0) {
 1284                 int error2;
 1285 
 1286                 putbits(in, 0);
 1287                 putbits(ou, 1);
 1288                 putbits(ex, 2);
 1289 #undef putbits
 1290         }
 1291         return(error);
 1292 }
 1293 
 1294 
 1295 /*
 1296  * selscan
 1297  *
 1298  * Parameters:  p                       Process performing the select
 1299  *              sel                     The per-thread select context structure
 1300  *              nfd                     The number of file descriptors to scan
 1301  *              retval                  The per thread system call return area
 1302  *              sel_pass                Which pass this is; allowed values are
 1303  *                                              SEL_FIRSTPASS and SEL_SECONDPASS
 1304  *              wqsub                   The per thread wait queue set
 1305  *
 1306  * Returns:     0                       Success
 1307  *              EIO                     Invalid p->p_fd field XXX Obsolete?
 1308  *              EBADF                   One of the files in the bit vector is
 1309  *                                              invalid.
 1310  */
 1311 static int
 1312 selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval,
 1313         int sel_pass, wait_queue_sub_t wqsub)
 1314 {
 1315         struct filedesc *fdp = p->p_fd;
 1316         int msk, i, j, fd;
 1317         u_int32_t bits;
 1318         struct fileproc *fp;
 1319         int n = 0;              /* count of bits */
 1320         int nc = 0;             /* bit vector offset (nc'th bit) */
 1321         static int flag[3] = { FREAD, FWRITE, 0 };
 1322         u_int32_t *iptr, *optr;
 1323         u_int nw;
 1324         u_int32_t *ibits, *obits;
 1325         char * wql;
 1326         char * wql_ptr;
 1327         int count;
 1328         struct vfs_context context = *vfs_context_current();
 1329 
 1330         /*
 1331          * Problems when reboot; due to MacOSX signal probs
 1332          * in Beaker1C ; verify that the p->p_fd is valid
 1333          */
 1334         if (fdp == NULL) {
 1335                 *retval=0;
 1336                 return(EIO);
 1337         }
 1338         ibits = sel->ibits;
 1339         obits = sel->obits;
 1340         wql = sel->wql;
 1341 
 1342         nw = howmany(nfd, NFDBITS);
 1343 
 1344         count = sel->count;
 1345 
 1346         nc = 0;
 1347         if (count) {
 1348                 proc_fdlock(p);
 1349                 for (msk = 0; msk < 3; msk++) {
 1350                         iptr = (u_int32_t *)&ibits[msk * nw];
 1351                         optr = (u_int32_t *)&obits[msk * nw];
 1352 
 1353                         for (i = 0; i < nfd; i += NFDBITS) {
 1354                                 bits = iptr[i/NFDBITS];
 1355 
 1356                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1357                                         bits &= ~(1 << j);
 1358                                         fp = fdp->fd_ofiles[fd];
 1359 
 1360                                         if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1361                                                 /*
 1362                                                  * If we abort because of a bad
 1363                                                  * fd, let the caller unwind...
 1364                                                  */
 1365                                                 proc_fdunlock(p);
 1366                                                 return(EBADF);
 1367                                         }
 1368                                         if (sel_pass == SEL_SECONDPASS) {
 1369                                                 wql_ptr = (char *)0;
 1370                                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) {
 1371                                                         fp->f_flags &= ~FP_INSELECT;
 1372                                                         fp->f_waddr = (void *)0;
 1373                                                 }
 1374                                         } else {
 1375                                                 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
 1376                                                 if (fp->f_flags & FP_INSELECT) {
 1377                                                         /* someone is already in select on this fp */
 1378                                                         fp->f_flags |= FP_SELCONFLICT;
 1379                                                         wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub);
 1380                                                 } else {
 1381                                                         fp->f_flags |= FP_INSELECT;
 1382                                                         fp->f_waddr = (void *)wqsub;
 1383                                                 }
 1384                                         }
 1385 
 1386                                         context.vc_ucred = fp->f_cred;
 1387 
 1388                                         /* The select; set the bit, if true */
 1389                                         if (fp->f_ops
 1390                                                 && fo_select(fp, flag[msk], wql_ptr, &context)) {
 1391                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
 1392                                                 n++;
 1393                                         }
 1394                                         nc++;
 1395                                 }
 1396                         }
 1397                 }
 1398                 proc_fdunlock(p);
 1399         }
 1400         *retval = n;
 1401         return (0);
 1402 }
 1403 
int poll_callback(struct kqueue *, struct kevent64_s *, void *);

/* Continuation state carried through kqueue_scan() for poll(2). */
struct poll_continue_args {
        user_addr_t pca_fds;    /* user address of the pollfd array (uap->fds) */
        u_int pca_nfds;         /* number of entries in that array */
        u_int pca_rfds;         /* running count of descriptors with events */
};
 1411 
 1412 int
 1413 poll(struct proc *p, struct poll_args *uap, int32_t *retval)
 1414 {
 1415         __pthread_testcancel(1);
 1416         return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
 1417 }
 1418 
 1419 
/*
 * poll_nocancel
 *
 * Implement poll(2) by translating each pollfd entry into kqueue
 * kevents registered on a temporary kqueue, then scanning (possibly
 * blocking) via kqueue_scan() with poll_callback() converting fired
 * events back into revents bits.
 */
int
poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
{
        struct poll_continue_args *cont;
        struct pollfd *fds;
        struct kqueue *kq;
        struct timeval atv;
        int ncoll, error = 0;
        u_int nfds = uap->nfds;
        u_int rfds = 0;
        u_int i;
        size_t ni;

        /*
         * This is kinda bogus.  We have fd limits, but that is not
         * really related to the size of the pollfd array.  Make sure
         * we let the process use at least FD_SETSIZE entries and at
         * least enough for the current limits.  We want to be reasonably
         * safe, but not overly restrictive.
         */
        if (nfds > OPEN_MAX ||
            (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
                return (EINVAL);

        kq = kqueue_alloc(p);
        if (kq == NULL)
                return (EAGAIN);

        /* Single allocation: continuation args header followed by the pollfd array. */
        ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
        MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
        if (NULL == cont) {
                error = EAGAIN;
                goto out;
        }
        
        fds = (struct pollfd *)&cont[1];
        error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
        if (error)
                goto out;

        if (uap->timeout != -1) {
                struct timeval rtv;

                /* Convert the millisecond timeout into an absolute uptime deadline. */
                atv.tv_sec = uap->timeout / 1000;
                atv.tv_usec = (uap->timeout % 1000) * 1000;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto out;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                /* timeout == -1: zero atv — presumably "no deadline" to kqueue_scan; verify */
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }

        /* JMM - all this P_SELECT stuff is bogus */
        ncoll = nselcoll;
        OSBitOrAtomic(P_SELECT, &p->p_flag);
        for (i = 0; i < nfds; i++) {
                short events = fds[i].events;
                struct kevent64_s kev;
                int kerror = 0;

                /* per spec, ignore fd values below zero */
                if (fds[i].fd < 0) {
                        fds[i].revents = 0;
                        continue;
                }

                /* convert the poll event into a kqueue kevent */
                kev.ident = fds[i].fd;
                kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
                /* udata carries the kernel pollfd slot back to poll_callback() */
                kev.udata = CAST_USER_ADDR_T(&fds[i]);
                kev.fflags = 0;
                kev.data = 0;
                kev.ext[0] = 0;
                kev.ext[1] = 0;

                /* Handle input events */
                if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
                        kev.filter = EVFILT_READ;
                        /* only out-of-band data wanted: flag the filter accordingly */
                        if (!(events & ( POLLIN | POLLRDNORM )))
                                kev.flags |= EV_OOBAND;
                        kerror = kevent_register(kq, &kev, p);
                }

                /* Handle output events */
                if (kerror == 0 &&
                    events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
                        kev.filter = EVFILT_WRITE;
                        kerror = kevent_register(kq, &kev, p);
                }

                /* Handle BSD extension vnode events */
                if (kerror == 0 &&
                    events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
                        kev.filter = EVFILT_VNODE;
                        kev.fflags = 0;
                        if (events & POLLEXTEND)
                                kev.fflags |= NOTE_EXTEND;
                        if (events & POLLATTRIB)
                                kev.fflags |= NOTE_ATTRIB;
                        if (events & POLLNLINK)
                                kev.fflags |= NOTE_LINK;
                        if (events & POLLWRITE)
                                kev.fflags |= NOTE_WRITE;
                        kerror = kevent_register(kq, &kev, p);
                }

                /* A registration failure is reported as POLLNVAL on that entry. */
                if (kerror != 0) {
                        fds[i].revents = POLLNVAL;
                        rfds++;
                } else
                        fds[i].revents = 0;
        }

        /* Did we have any trouble registering? */
        if (rfds > 0)
                goto done;

        /* scan for, and possibly wait for, the kevents to trigger */
        cont->pca_fds = uap->fds;
        cont->pca_nfds = nfds;
        cont->pca_rfds = rfds;
        error = kqueue_scan(kq, poll_callback, NULL, cont, &atv, p);
        rfds = cont->pca_rfds;

 done:
        OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
        /* poll is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        if (error == EWOULDBLOCK)
                error = 0;
        if (error == 0) {
                /* Copy the updated revents back out to the user's array. */
                error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
                *retval = rfds;
        }

 out:
        if (NULL != cont)
                FREE(cont, M_TEMP);

        kqueue_dealloc(kq);
        return (error);
}
 1567 
 1568 int
 1569 poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
 1570 {
 1571         struct poll_continue_args *cont = (struct poll_continue_args *)data;
 1572         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
 1573         short prev_revents = fds->revents;
 1574         short mask;
 1575 
 1576         /* convert the results back into revents */
 1577         if (kevp->flags & EV_EOF)
 1578                 fds->revents |= POLLHUP;
 1579         if (kevp->flags & EV_ERROR)
 1580                 fds->revents |= POLLERR;
 1581 
 1582         switch (kevp->filter) {
 1583         case EVFILT_READ:
 1584                 if (fds->revents & POLLHUP)
 1585                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
 1586                 else {
 1587                         mask = 0;
 1588                         if (kevp->data != 0)
 1589                                 mask |= (POLLIN | POLLRDNORM );
 1590                         if (kevp->flags & EV_OOBAND)
 1591                                 mask |= ( POLLPRI | POLLRDBAND );
 1592                 }
 1593                 fds->revents |= (fds->events & mask);
 1594                 break;
 1595 
 1596         case EVFILT_WRITE:
 1597                 if (!(fds->revents & POLLHUP))
 1598                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
 1599                 break;
 1600 
 1601         case EVFILT_VNODE:
 1602                 if (kevp->fflags & NOTE_EXTEND)
 1603                         fds->revents |= (fds->events & POLLEXTEND);
 1604                 if (kevp->fflags & NOTE_ATTRIB)
 1605                         fds->revents |= (fds->events & POLLATTRIB);
 1606                 if (kevp->fflags & NOTE_LINK)
 1607                         fds->revents |= (fds->events & POLLNLINK);
 1608                 if (kevp->fflags & NOTE_WRITE)
 1609                         fds->revents |= (fds->events & POLLWRITE);
 1610                 break;
 1611         }
 1612 
 1613         if (fds->revents != 0 && prev_revents == 0)
 1614                 cont->pca_rfds++;
 1615 
 1616         return 0;
 1617 }
 1618         
 1619 int
 1620 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
 1621 {
 1622 
 1623         return (1);
 1624 }
 1625 
 1626 /*
 1627  * selcount
 1628  *
 1629  * Count the number of bits set in the input bit vector, and establish an
 1630  * outstanding fp->f_iocount for each of the descriptors which will be in
 1631  * use in the select operation.
 1632  *
 1633  * Parameters:  p                       The process doing the select
 1634  *              ibits                   The input bit vector
 1635  *              nfd                     The number of fd's in the vector
 1636  *              countp                  Pointer to where to store the bit count
 1637  *
 1638  * Returns:     0                       Success
 1639  *              EIO                     Bad per process open file table
 1640  *              EBADF                   One of the bits in the input bit vector
 1641  *                                              references an invalid fd
 1642  *
 1643  * Implicit:    *countp (modified)      Count of fd's
 1644  *
 1645  * Notes:       This function is the first pass under the proc_fdlock() that
 1646  *              permits us to recognize invalid descriptors in the bit vector;
 1647  *              the may, however, not remain valid through the drop and
 1648  *              later reacquisition of the proc_fdlock().
 1649  */
 1650 static int
 1651 selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
 1652 {
 1653         struct filedesc *fdp = p->p_fd;
 1654         int msk, i, j, fd;
 1655         u_int32_t bits;
 1656         struct fileproc *fp;
 1657         int n = 0;
 1658         u_int32_t *iptr;
 1659         u_int nw;
 1660         int error=0; 
 1661         int dropcount;
 1662         int need_wakeup = 0;
 1663 
 1664         /*
 1665          * Problems when reboot; due to MacOSX signal probs
 1666          * in Beaker1C ; verify that the p->p_fd is valid
 1667          */
 1668         if (fdp == NULL) {
 1669                 *countp = 0;
 1670                 return(EIO);
 1671         }
 1672         nw = howmany(nfd, NFDBITS);
 1673 
 1674         proc_fdlock(p);
 1675         for (msk = 0; msk < 3; msk++) {
 1676                 iptr = (u_int32_t *)&ibits[msk * nw];
 1677                 for (i = 0; i < nfd; i += NFDBITS) {
 1678                         bits = iptr[i/NFDBITS];
 1679                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1680                                 bits &= ~(1 << j);
 1681                                 fp = fdp->fd_ofiles[fd];
 1682                                 if (fp == NULL ||
 1683                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1684                                                 *countp = 0;
 1685                                                 error = EBADF;
 1686                                                 goto bad;
 1687                                 }
 1688                                 fp->f_iocount++;
 1689                                 n++;
 1690                         }
 1691                 }
 1692         }
 1693         proc_fdunlock(p);
 1694 
 1695         *countp = n;
 1696         return (0);
 1697 
 1698 bad:
 1699         dropcount = 0;
 1700         
 1701         if (n== 0)
 1702                 goto out;
 1703         /* Ignore error return; it's already EBADF */
 1704         (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
 1705 
 1706 out:
 1707         proc_fdunlock(p);
 1708         if (need_wakeup) {
 1709                 wakeup(&p->p_fpdrainwait);
 1710         }
 1711         return(error);
 1712 }
 1713 
 1714 
 1715 /*
 1716  * seldrop_locked
 1717  *
 1718  * Drop outstanding wait queue references set up during selscan(); drop the
 1719  * outstanding per fileproc f_iocount() picked up during the selcount().
 1720  *
 1721  * Parameters:  p                       Process performing the select
 1722  *              ibits                   Input pit bector of fd's
 1723  *              nfd                     Number of fd's
 1724  *              lim                     Limit to number of vector entries to
 1725  *                                              consider, or -1 for "all"
 1726  *              inselect                True if
 1727  *              need_wakeup             Pointer to flag to set to do a wakeup
 1728  *                                      if f_iocont on any descriptor goes to 0
 1729  *
 1730  * Returns:     0                       Success
 1731  *              EBADF                   One or more fds in the bit vector
 1732  *                                              were invalid, but the rest
 1733  *                                              were successfully dropped
 1734  *
 1735  * Notes:       An fd make become bad while the proc_fdlock() is not held,
 1736  *              if a multithreaded application closes the fd out from under
 1737  *              the in progress select.  In this case, we still have to
 1738  *              clean up after the set up on the remaining fds.
 1739  */
 1740 static int
 1741 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
 1742 {
 1743         struct filedesc *fdp = p->p_fd;
 1744         int msk, i, j, fd;
 1745         u_int32_t bits;
 1746         struct fileproc *fp;
 1747         u_int32_t *iptr;
 1748         u_int nw;
 1749         int error = 0;
 1750         int dropcount = 0;
 1751         uthread_t uth = get_bsdthread_info(current_thread());
 1752 
 1753         *need_wakeup = 0;
 1754 
 1755         /*
 1756          * Problems when reboot; due to MacOSX signal probs
 1757          * in Beaker1C ; verify that the p->p_fd is valid
 1758          */
 1759         if (fdp == NULL) {
 1760                 return(EIO);
 1761         }
 1762 
 1763         nw = howmany(nfd, NFDBITS);
 1764 
 1765         for (msk = 0; msk < 3; msk++) {
 1766                 iptr = (u_int32_t *)&ibits[msk * nw];
 1767                 for (i = 0; i < nfd; i += NFDBITS) {
 1768                         bits = iptr[i/NFDBITS];
 1769                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1770                                 bits &= ~(1 << j);
 1771                                 fp = fdp->fd_ofiles[fd];
 1772                                 /*
 1773                                  * If we've already dropped as many as were
 1774                                  * counted/scanned, then we are done.  
 1775                                  */
 1776                                 if ((fromselcount != 0) && (++dropcount > lim))
 1777                                         goto done;
 1778 
 1779                                 if (fp == NULL) {
 1780                                         /* skip (now) bad fds */
 1781                                         error = EBADF;
 1782                                         continue;
 1783                                 }
 1784                                 /*
 1785                                  * Only clear the flag if we set it.  We'll
 1786                                  * only find that we set it if we had made
 1787                                  * at least one [partial] pass through selscan().
 1788                                  */
 1789                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) {
 1790                                         fp->f_flags &= ~FP_INSELECT;
 1791                                         fp->f_waddr = (void *)0;
 1792                                 }
 1793 
 1794                                 fp->f_iocount--;
 1795                                 if (fp->f_iocount < 0)
 1796                                         panic("f_iocount overdecrement!");
 1797 
 1798                                 if (fp->f_iocount == 0) {
 1799                                         /*
 1800                                          * The last iocount is responsible for clearing
 1801                                          * selconfict flag - even if we didn't set it -
 1802                                          * and is also responsible for waking up anyone
 1803                                          * waiting on iocounts to drain.
 1804                                          */
 1805                                         if (fp->f_flags & FP_SELCONFLICT)
 1806                                                 fp->f_flags &= ~FP_SELCONFLICT;
 1807                                         if (p->p_fpdrainwait) {
 1808                                                 p->p_fpdrainwait = 0;
 1809                                                 *need_wakeup = 1;
 1810                                         }
 1811                                 }
 1812                         }
 1813                 }
 1814         }
 1815 done:
 1816         return (error);
 1817 }
 1818 
 1819 
 1820 static int
 1821 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
 1822 {
 1823         int error;
 1824         int need_wakeup = 0;
 1825 
 1826         proc_fdlock(p);
 1827         error =  seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
 1828         proc_fdunlock(p);
 1829         if (need_wakeup) {
 1830                 wakeup(&p->p_fpdrainwait);
 1831         }
 1832         return (error);
 1833 }
 1834 
 1835 /*
 1836  * Record a select request.
 1837  */
 1838 void
 1839 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
 1840 {
 1841         thread_t        cur_act = current_thread();
 1842         struct uthread * ut = get_bsdthread_info(cur_act);
 1843 
 1844         /* need to look at collisions */
 1845 
 1846         /*do not record if this is second pass of select */
 1847         if(p_wql == (void *)0) {
 1848                 return;
 1849         }
 1850 
 1851         if ((sip->si_flags & SI_INITED) == 0) {
 1852                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
 1853                 sip->si_flags |= SI_INITED;
 1854                 sip->si_flags &= ~SI_CLEAR;
 1855         }
 1856 
 1857         if (sip->si_flags & SI_RECORDED) {
 1858                 sip->si_flags |= SI_COLL;
 1859         } else
 1860                 sip->si_flags &= ~SI_COLL;
 1861 
 1862         sip->si_flags |= SI_RECORDED;
 1863         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqset))
 1864                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqset,
 1865                                         (wait_queue_link_t)p_wql);
 1866 
 1867         return;
 1868 }
 1869 
 1870 void
 1871 selwakeup(struct selinfo *sip)
 1872 {
 1873         
 1874         if ((sip->si_flags & SI_INITED) == 0) {
 1875                 return;
 1876         }
 1877 
 1878         if (sip->si_flags & SI_COLL) {
 1879                 nselcoll++;
 1880                 sip->si_flags &= ~SI_COLL;
 1881 #if 0
 1882                 /* will not  support */
 1883                 //wakeup((caddr_t)&selwait);
 1884 #endif
 1885         }
 1886 
 1887         if (sip->si_flags & SI_RECORDED) {
 1888                 wait_queue_wakeup_all(&sip->si_wait_queue, NULL, THREAD_AWAKENED);
 1889                 sip->si_flags &= ~SI_RECORDED;
 1890         }
 1891 
 1892 }
 1893 
 1894 void 
 1895 selthreadclear(struct selinfo *sip)
 1896 {
 1897 
 1898         if ((sip->si_flags & SI_INITED) == 0) {
 1899                 return;
 1900         }
 1901         if (sip->si_flags & SI_RECORDED) {
 1902                         selwakeup(sip);
 1903                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
 1904         }
 1905         sip->si_flags |= SI_CLEAR;
 1906         wait_queue_unlink_all(&sip->si_wait_queue);
 1907 }
 1908 
 1909 
 1910 
 1911 
 1912 #define DBG_POST        0x10
 1913 #define DBG_WATCH       0x11
 1914 #define DBG_WAIT        0x12
 1915 #define DBG_MOD         0x13
 1916 #define DBG_EWAKEUP     0x14
 1917 #define DBG_ENQUEUE     0x15
 1918 #define DBG_DEQUEUE     0x16
 1919 
 1920 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
 1921 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
 1922 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
 1923 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
 1924 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
 1925 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
 1926 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
 1927 
 1928 
 1929 #define EVPROCDEQUE(p, evq)     do {                            \
 1930         proc_lock(p);                                           \
 1931         if (evq->ee_flags & EV_QUEUED) {                        \
 1932                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);      \
 1933                 evq->ee_flags &= ~EV_QUEUED;                    \
 1934         }                                                       \
 1935         proc_unlock(p);                                         \
 1936 } while (0);
 1937 
 1938 
 1939 /*
 1940  * called upon socket close. deque and free all events for
 1941  * the socket...  socket must be locked by caller.
 1942  */
 1943 void
 1944 evsofree(struct socket *sp)
 1945 {
 1946         struct eventqelt *evq, *next;
 1947         proc_t  p;
 1948 
 1949         if (sp == NULL)
 1950                 return;
 1951 
 1952         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
 1953                 next = evq->ee_slist.tqe_next;
 1954                 p = evq->ee_proc;
 1955 
 1956                 if (evq->ee_flags & EV_QUEUED) {
 1957                         EVPROCDEQUE(p, evq);
 1958                 }
 1959                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
 1960                 FREE(evq, M_TEMP);
 1961         }
 1962 }
 1963 
 1964 
 1965 /*
 1966  * called upon pipe close. deque and free all events for
 1967  * the pipe... pipe must be locked by caller
 1968  */
 1969 void
 1970 evpipefree(struct pipe *cpipe)
 1971 {
 1972         struct eventqelt *evq, *next;
 1973         proc_t  p;
 1974 
 1975         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
 1976                 next = evq->ee_slist.tqe_next;
 1977                 p = evq->ee_proc;
 1978 
 1979                 EVPROCDEQUE(p, evq);
 1980 
 1981                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
 1982                 FREE(evq, M_TEMP);
 1983         }
 1984 }
 1985 
 1986 
 1987 /*
 1988  * enqueue this event if it's not already queued. wakeup
 1989  * the proc if we do queue this event to it...
 1990  * entered with proc lock held... we drop it before
 1991  * doing the wakeup and return in that state
 1992  */
 1993 static void
 1994 evprocenque(struct eventqelt *evq)
 1995 {
 1996         proc_t  p;
 1997 
 1998         assert(evq);
 1999         p = evq->ee_proc;
 2000 
 2001         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
 2002 
 2003         proc_lock(p);
 2004 
 2005         if (evq->ee_flags & EV_QUEUED) {
 2006                 proc_unlock(p);
 2007 
 2008                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 2009                 return;
 2010         }
 2011         evq->ee_flags |= EV_QUEUED;
 2012 
 2013         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
 2014 
 2015         proc_unlock(p);
 2016 
 2017         wakeup(&p->p_evlist);
 2018 
 2019         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 2020 }
 2021 
 2022 
 2023 /*
 2024  * pipe lock must be taken by the caller
 2025  */
 2026 void
 2027 postpipeevent(struct pipe *pipep, int event)
 2028 {
 2029         int     mask;
 2030         struct eventqelt *evq;
 2031 
 2032         if (pipep == NULL)
 2033                 return;
 2034         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
 2035 
 2036         for (evq = pipep->pipe_evlist.tqh_first;
 2037              evq != NULL; evq = evq->ee_slist.tqe_next) {
 2038 
 2039                 if (evq->ee_eventmask == 0)
 2040                         continue;
 2041                 mask = 0;
 2042 
 2043                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
 2044 
 2045                 case EV_RWBYTES:
 2046                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
 2047                           mask |= EV_RE;
 2048                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
 2049                   }
 2050                   if ((evq->ee_eventmask & EV_WR) && 
 2051                       (MAX(pipep->pipe_buffer.size,PIPE_SIZE) - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
 2052 
 2053                           if (pipep->pipe_state & PIPE_EOF) {
 2054                                   mask |= EV_WR|EV_RESET;
 2055                                   break;
 2056                           }
 2057                           mask |= EV_WR;
 2058                           evq->ee_req.er_wcnt = MAX(pipep->pipe_buffer.size, PIPE_SIZE) - pipep->pipe_buffer.cnt;
 2059                   }
 2060                   break;
 2061 
 2062                 case EV_WCLOSED:
 2063                 case EV_RCLOSED:
 2064                   if ((evq->ee_eventmask & EV_RE)) {
 2065                           mask |= EV_RE|EV_RCLOSED;
 2066                   }
 2067                   if ((evq->ee_eventmask & EV_WR)) {
 2068                           mask |= EV_WR|EV_WCLOSED;
 2069                   }
 2070                   break;
 2071 
 2072                 default:
 2073                   return;
 2074                 }
 2075                 if (mask) {
 2076                         /*
 2077                          * disarm... postevents are nops until this event is 'read' via
 2078                          * waitevent and then re-armed via modwatch
 2079                          */
 2080                         evq->ee_eventmask = 0;
 2081 
 2082                         /*
 2083                          * since events are disarmed until after the waitevent
 2084                          * the ee_req.er_xxxx fields can't change once we've
 2085                          * inserted this event into the proc queue...
 2086                          * therefore, the waitevent will see a 'consistent'
 2087                          * snapshot of the event, even though it won't hold
 2088                          * the pipe lock, and we're updating the event outside
 2089                          * of the proc lock, which it will hold
 2090                          */
 2091                         evq->ee_req.er_eventbits |= mask;
 2092 
 2093                         KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
 2094 
 2095                         evprocenque(evq);
 2096                 }
 2097         }
 2098         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
 2099 }
 2100 
#if SOCKETS
/*
 * postevent
 *
 * Given either a sockbuf or a socket, run down the socket's
 * event list and queue ready events found.
 * The socket must be locked by the caller.
 */
void
postevent(struct socket *sp, struct sockbuf *sb, int event)
{
	int	mask;
	struct	eventqelt *evq;
	struct	tcpcb *tp;

	/* a sockbuf, when given, identifies the socket */
	if (sb)
		sp = sb->sb_so;
	if (sp == NULL)
		return;

	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);

	for (evq = sp->so_evlist.tqh_first;
	     evq != NULL; evq = evq->ee_slist.tqe_next) {

		/* eventmask == 0 means disarmed until the next modwatch */
		if (evq->ee_eventmask == 0)
			continue;
		mask = 0;

		/* ready for reading:
		   - byte cnt >= receive low water mark
		   - read-half of conn closed
		   - conn pending for listening sock
		   - socket error pending

		   ready for writing
		   - byte cnt avail >= send low water mark
		   - write half of conn closed
		   - socket error pending
		   - non-blocking conn completed successfully

		   exception pending
		   - out of band data
		   - sock at out of band mark
		*/

		switch (event & EV_DMASK) {

		case EV_OOB:
		  if ((evq->ee_eventmask & EV_EX)) {
			  if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
				  mask |= EV_EX|EV_OOB;
		  }
		  break;

		case EV_RWBYTES|EV_OOB:
		  if ((evq->ee_eventmask & EV_EX)) {
			  if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
				  mask |= EV_EX|EV_OOB;
		  }
		  /*
		   * fall into the next case
		   */
		case EV_RWBYTES:
		  if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
			  /* a dead TCP connection reports RESET, not readable */
			  if (sp->so_error) {
				  if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
					  if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
					      (tp->t_state == TCPS_CLOSED)) {
						  mask |= EV_RE|EV_RESET;
						  break;
					  }
				  }
			  }
			  mask |= EV_RE;
			  evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;

			  if (sp->so_state & SS_CANTRCVMORE) {
				  /* peer shut down its write side */
				  mask |= EV_FIN;
				  break;
			  }
		  }
		  if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
			  /* same dead-connection check on the write side */
			  if (sp->so_error) {
				  if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
					  if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
					      (tp->t_state == TCPS_CLOSED)) {
						  mask |= EV_WR|EV_RESET;
						  break;
					  }
				  }
			  }
			  mask |= EV_WR;
			  evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
		  }
		  break;

		case EV_RCONN:
		  /* inbound connection pending on a listening socket */
		  if ((evq->ee_eventmask & EV_RE)) {
			  mask |= EV_RE|EV_RCONN;
			  evq->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
		  }
		  break;

		case EV_WCONN:
		  /* non-blocking connect completed */
		  if ((evq->ee_eventmask & EV_WR)) {
			  mask |= EV_WR|EV_WCONN;
		  }
		  break;

		case EV_RCLOSED:
		  if ((evq->ee_eventmask & EV_RE)) {
			  mask |= EV_RE|EV_RCLOSED;
		  }
		  break;

		case EV_WCLOSED:
		  if ((evq->ee_eventmask & EV_WR)) {
			  mask |= EV_WR|EV_WCLOSED;
		  }
		  break;

		case EV_FIN:
		  if (evq->ee_eventmask & EV_RE) {
			  mask |= EV_RE|EV_FIN;
		  }
		  break;

		case EV_RESET:
		case EV_TIMEOUT:
		  /* report on both sides, whichever is armed */
		  if (evq->ee_eventmask & EV_RE) {
			  mask |= EV_RE | event;
		  } 
		  if (evq->ee_eventmask & EV_WR) {
			  mask |= EV_WR | event;
		  }
		  break;

		default:
		  KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
		  return;
		} /* switch */

		KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);

		if (mask) {
			/*
			 * disarm... postevents are nops until this event is 'read' via
			 * waitevent and then re-armed via modwatch
			 */
			evq->ee_eventmask = 0;

			/*
			 * since events are disarmed until after the waitevent
			 * the ee_req.er_xxxx fields can't change once we've
			 * inserted this event into the proc queue...
			 * since waitevent can't see this event until we 
			 * enqueue it, waitevent will see a 'consistent'
			 * snapshot of the event, even though it won't hold
			 * the socket lock, and we're updating the event outside
			 * of the proc lock, which it will hold
			 */
			evq->ee_req.er_eventbits |= mask;

			evprocenque(evq);
		}
	}
	KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
}
#endif /* SOCKETS */
 2269 
 2270 
 2271 /*
 2272  * watchevent system call. user passes us an event to watch
 2273  * for. we malloc an event object, initialize it, and queue
 2274  * it to the open socket. when the event occurs, postevent()
 2275  * will enque it back to our proc where we can retrieve it
 2276  * via waitevent().
 2277  *
 2278  * should this prevent duplicate events on same socket?
 2279  *
 2280  * Returns:
 2281  *              ENOMEM                  No memory for operation
 2282  *      copyin:EFAULT
 2283  */
 2284 int
 2285 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
 2286 {
 2287         struct eventqelt *evq = (struct eventqelt *)0;
 2288         struct eventqelt *np = NULL;
 2289         struct eventreq64 *erp;
 2290         struct fileproc *fp = NULL;
 2291         int error;
 2292 
 2293         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
 2294 
 2295         // get a qelt and fill with users req
 2296         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
 2297 
 2298         if (evq == NULL)
 2299                 return (ENOMEM);
 2300         erp = &evq->ee_req;
 2301 
 2302         // get users request pkt
 2303 
 2304         if (IS_64BIT_PROCESS(p)) {
 2305                 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
 2306         } else {
 2307                 struct eventreq32 er32;
 2308 
 2309                 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
 2310                 if (error == 0) {
 2311                        /*
 2312                         * the user only passes in the
 2313                         * er_type, er_handle and er_data...
 2314                         * the other fields are initialized
 2315                         * below, so don't bother to copy
 2316                         */
 2317                         erp->er_type = er32.er_type;
 2318                         erp->er_handle = er32.er_handle;
 2319                         erp->er_data = (user_addr_t)er32.er_data;
 2320                 }
 2321         }
 2322         if (error) {
 2323                 FREE(evq, M_TEMP);
 2324                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 2325 
 2326                 return(error);          
 2327         }
 2328         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
 2329 
 2330         // validate, freeing qelt if errors
 2331         error = 0;
 2332         proc_fdlock(p);
 2333 
 2334         if (erp->er_type != EV_FD) {
 2335                 error = EINVAL;
 2336         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
 2337                 error = EBADF;
 2338 #if SOCKETS
 2339         } else if (fp->f_type == DTYPE_SOCKET) {
 2340                 socket_lock((struct socket *)fp->f_data, 1);
 2341                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
 2342 #endif /* SOCKETS */
 2343         } else if (fp->f_type == DTYPE_PIPE) {
 2344                 PIPE_LOCK((struct pipe *)fp->f_data);
 2345                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
 2346         } else {
 2347                 fp_drop(p, erp->er_handle, fp, 1);
 2348                 error = EINVAL;
 2349         }
 2350         proc_fdunlock(p);
 2351 
 2352         if (error) {
 2353                 FREE(evq, M_TEMP);
 2354 
 2355                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 2356                 return(error);
 2357         }
 2358                 
 2359         /*
 2360          * only allow one watch per file per proc
 2361          */
 2362         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
 2363                 if (np->ee_proc == p) {
 2364 #if SOCKETS
 2365                         if (fp->f_type == DTYPE_SOCKET)
 2366                                 socket_unlock((struct socket *)fp->f_data, 1);
 2367                         else 
 2368 #endif /* SOCKETS */
 2369                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2370                         fp_drop(p, erp->er_handle, fp, 0);
 2371                         FREE(evq, M_TEMP);
 2372                         
 2373                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
 2374                         return(EINVAL);
 2375                 }
 2376         }
 2377         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
 2378         evq->ee_proc = p;
 2379         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
 2380         evq->ee_flags = 0;
 2381 
 2382 #if SOCKETS
 2383         if (fp->f_type == DTYPE_SOCKET) {
 2384                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
 2385                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
 2386 
 2387                 socket_unlock((struct socket *)fp->f_data, 1);
 2388         } else
 2389 #endif /* SOCKETS */
 2390         {
 2391                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
 2392                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
 2393 
 2394                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2395         }
 2396         fp_drop_event(p, erp->er_handle, fp);
 2397 
 2398         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
 2399         return(0);
 2400 }
 2401 
 2402 
 2403 
/*
 * waitevent system call.
 * grabs the next waiting event for this proc and returns
 * it. if no events, user can request to sleep with timeout
 * or without or poll mode
 *    ((tv != NULL && interval == 0) || tv == -1)
 *
 * Returns:     0               Success (event copied out, or *retval = 1
 *                              when polling found nothing)
 *              EINTR           Sleep interrupted by a signal
 *              EINVAL          Bad timeout from itimerfix()
 *              copyin/copyout  errno from user address faults
 */
int
waitevent(proc_t p, struct waitevent_args *uap, int *retval)
{
        int error = 0;
        struct eventqelt *evq;
        struct eventreq64 *erp;
        uint64_t abstime, interval;     /* interval == 0 means wait forever (or poll) */
        boolean_t fast_poll = FALSE;
        union {
                /* local snapshot of the event, sized for the caller's ABI */
                struct eventreq64 er64;
                struct eventreq32 er32;
        } uer;

        interval = 0;

        if (uap->tv) {
                struct timeval atv;
                /*
                 * check for fast poll method
                 * tv == (user_addr_t)-1 is a sentinel (not a real pointer)
                 * requesting a lock-free peek at the event queue
                 */
                if (IS_64BIT_PROCESS(p)) {
                        if (uap->tv == (user_addr_t)-1)
                                fast_poll = TRUE;
                } else if (uap->tv == (user_addr_t)((uint32_t)-1))
                        fast_poll = TRUE;

                if (fast_poll == TRUE) {
                        /* unlocked peek; harmless race since we only report emptiness */
                        if (p->p_evlist.tqh_first == NULL) {
                                KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
                                /*
                                 * poll failed
                                 */
                                *retval = 1;
                                return (0);
                        }
                        proc_lock(p);
                        goto retry;
                }
                /* copy in the timeout using the layout matching the caller's ABI */
                if (IS_64BIT_PROCESS(p)) {
                        struct user64_timeval atv64;
                        error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
                        /* Loses resolution - assume timeout < 68 years */
                        atv.tv_sec = atv64.tv_sec;
                        atv.tv_usec = atv64.tv_usec;
                } else {
                        struct user32_timeval atv32;
                        error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
                        atv.tv_sec = atv32.tv_sec;
                        atv.tv_usec = atv32.tv_usec;
                }

                if (error)
                        return(error);
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        return(error);
                }
                interval = tvtoabstime(&atv);
        }
        KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);

        proc_lock(p);
retry:
        if ((evq = p->p_evlist.tqh_first) != NULL) {
                /*
                 * found one... make a local copy while it's still on the queue
                 * to prevent it from changing while in the midst of copying
                 * don't want to hold the proc lock across a copyout because
                 * it might block on a page fault at the target in user space
                 */
                erp = &evq->ee_req;

                if (IS_64BIT_PROCESS(p))
                        bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
                else {
                        /* field-by-field narrowing copy for 32-bit callers */
                        uer.er32.er_type  = erp->er_type;
                        uer.er32.er_handle  = erp->er_handle;
                        uer.er32.er_data  = (uint32_t)erp->er_data;
                        uer.er32.er_ecnt  = erp->er_ecnt;
                        uer.er32.er_rcnt  = erp->er_rcnt;
                        uer.er32.er_wcnt  = erp->er_wcnt;
                        uer.er32.er_eventbits = erp->er_eventbits;
                }
                TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);

                evq->ee_flags &= ~EV_QUEUED;

                proc_unlock(p);

                if (IS_64BIT_PROCESS(p))
                        error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
                else
                        error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));

                /*
                 * NOTE(review): evq is dereferenced here after proc_unlock();
                 * if another thread performs modwatch(EV_RM) in the window it
                 * could free evq — presumably benign in practice since this is
                 * debug tracing only, but worth confirming.
                 */
                KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
                             evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
                return (error);
        }
        else {
                /* queue empty: poll mode (tv given, zero interval) fails immediately */
                if (uap->tv && interval == 0) {
                        proc_unlock(p);
                        *retval = 1;  // poll failed

                        KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
                        return (error);
                }
                if (interval != 0)
                        clock_absolutetime_interval_to_deadline(interval, &abstime);
                else
                        abstime = 0;    /* no deadline: sleep until woken or signalled */

                KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);

                /* sleep on the event list; proc lock is dropped while asleep */
                error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);

                KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);

                if (error == 0)
                        goto retry;     /* woken: recheck the queue */
                if (error == ERESTART)
                        error = EINTR;
                if (error == EWOULDBLOCK) {
                        /* timed out: report as a failed poll, not an error */
                        *retval = 1;
                        error = 0;
                }
        }
        proc_unlock(p);

        KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
        return (error);
}
 2542 
 2543 
/*
 * modwatch system call. user passes in event to modify.
 * if we find it we reset the event bits and que/deque event
 * it needed.
 *
 * Lock ordering: proc_fdlock is taken first, then the per-object
 * socket/pipe lock; the fdlock is dropped before walking the event
 * list, and the socket/pipe lock is held until the final post/unlock.
 *
 * Returns:     0               Success
 *              EINVAL          Not an EV_FD request, unsupported file type,
 *                              no watch registered, or bad eventmask
 *              EBADF           er_handle is not an open descriptor
 *              copyin errno    Bad user request pointer
 */
int
modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
{
        struct eventreq64 er;
        struct eventreq64 *erp = &er;
        struct eventqelt *evq = NULL;   /* protected by error return */
        int error;
        struct fileproc *fp;
        int flag;

        KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);

        /*
         * get user's request pkt
         * just need the er_type and er_handle which sit above the
         * problematic er_data (32/64 issue)... so only copy in
         * those 2 fields
         */
        if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
                KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
                return(error);
        }
        proc_fdlock(p);

        /* validate the request and take the lock of the watched object */
        if (erp->er_type != EV_FD) {
                error = EINVAL;
        } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
                error = EBADF;
#if SOCKETS
        } else if (fp->f_type == DTYPE_SOCKET) {
                socket_lock((struct socket *)fp->f_data, 1);
                evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
#endif /* SOCKETS */
        } else if (fp->f_type == DTYPE_PIPE) {
                PIPE_LOCK((struct pipe *)fp->f_data);
                evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
        } else {
                fp_drop(p, erp->er_handle, fp, 1);
                error = EINVAL;
        }

        if (error) {
                proc_fdunlock(p);
                KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
                return(error);
        }

        /* removing the watch also clears the close-notification flag on the fd */
        if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
                fp->f_flags &= ~FP_WAITEVENT;
        }
        proc_fdunlock(p);

        // locate event if possible
        for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
                if (evq->ee_proc == p)
                        break;
        }
        if (evq == NULL) {
                /* no watch registered by this proc on this object */
#if SOCKETS
                if (fp->f_type == DTYPE_SOCKET) 
                        socket_unlock((struct socket *)fp->f_data, 1);
                else
#endif /* SOCKETS */
                        PIPE_UNLOCK((struct pipe *)fp->f_data);
                fp_drop(p, erp->er_handle, fp, 0);
                KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
                return(EINVAL);
        }
        KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);

        if (uap->u_eventmask == EV_RM) {
                /* tear down the watch: dequeue, unlink from the object, free */
                EVPROCDEQUE(p, evq);

#if SOCKETS
                if (fp->f_type == DTYPE_SOCKET) {
                        TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
                        socket_unlock((struct socket *)fp->f_data, 1);
                } else
#endif /* SOCKETS */
                {
                        TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
                        PIPE_UNLOCK((struct pipe *)fp->f_data);
                }
                fp_drop(p, erp->er_handle, fp, 0);
                FREE(evq, M_TEMP);
                KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
                return(0);
        }
        /* translate the user-visible mask into the internal post flags */
        switch (uap->u_eventmask & EV_MASK) {
 
        case 0:
                flag = 0;
                break;

        case EV_RE:
        case EV_WR:
        case EV_RE|EV_WR:
                flag = EV_RWBYTES;
                break;

        case EV_EX:
                flag = EV_OOB;
                break;

        case EV_EX|EV_RE:
        case EV_EX|EV_WR:
        case EV_EX|EV_RE|EV_WR:
                flag = EV_OOB|EV_RWBYTES;
                break;

        default:
#if SOCKETS
                if (fp->f_type == DTYPE_SOCKET) 
                        socket_unlock((struct socket *)fp->f_data, 1);
                else 
#endif /* SOCKETS */
                        PIPE_UNLOCK((struct pipe *)fp->f_data);
                fp_drop(p, erp->er_handle, fp, 0);
                KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
                return(EINVAL);
        }
        /*
         * since we're holding the socket/pipe lock, the event
         * cannot go from the unqueued state to the queued state
         * however, it can go from the queued state to the unqueued state
         * since that direction is protected by the proc_lock...
         * so do a quick check for EV_QUEUED w/o holding the proc lock
         * since by far the common case will be NOT EV_QUEUED, this saves
         * us taking the proc_lock the majority of the time
         */
        if (evq->ee_flags & EV_QUEUED) {
                /*
                 * EVPROCDEQUE will recheck the state after it grabs the proc_lock
                 */
                EVPROCDEQUE(p, evq);
        }
        /*
         * while the event is off the proc queue and
         * we're holding the socket/pipe lock
         * it's safe to update these fields...
         */
        evq->ee_req.er_eventbits = 0;
        evq->ee_eventmask = uap->u_eventmask & EV_MASK;

        /* re-post so any already-pending condition is picked up under the new mask */
#if SOCKETS
        if (fp->f_type == DTYPE_SOCKET) {
                postevent((struct socket *)fp->f_data, 0, flag);
                socket_unlock((struct socket *)fp->f_data, 1);
        } else
#endif /* SOCKETS */
        {
                postpipeevent((struct pipe *)fp->f_data, flag);
                PIPE_UNLOCK((struct pipe *)fp->f_data);
        }
        fp_drop(p, erp->er_handle, fp, 0);
        KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
        return(0);
}
 2707 
/*
 * this routine is called from the close of fd with proc_fdlock held
 *
 * Removes this proc's watch (if any) from the socket/pipe backing fp.
 * The fdlock is dropped while the object's event list is walked and is
 * reacquired before returning, so the caller sees it held on every path
 * except the early EINVAL return for unsupported file types, where it
 * was never dropped.
 *
 * Returns:     0               Watch found and removed
 *              EINVAL          Unsupported file type or no watch registered
 */
int
waitevent_close(struct proc *p, struct fileproc *fp)
{
        struct eventqelt *evq;


        fp->f_flags &= ~FP_WAITEVENT;

        /* lock the backing object and grab the head of its event list */
#if SOCKETS
        if (fp->f_type == DTYPE_SOCKET) {
                socket_lock((struct socket *)fp->f_data, 1);
                evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
        } else
#endif /* SOCKETS */
        if (fp->f_type == DTYPE_PIPE) {
                PIPE_LOCK((struct pipe *)fp->f_data);
                evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
        }
        else {
                /* fdlock still held here, as the caller expects */
                return(EINVAL);
        }
        proc_fdunlock(p);


        // locate event if possible
        for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
                if (evq->ee_proc == p)
                        break;
        }
        if (evq == NULL) {
#if SOCKETS
                if (fp->f_type == DTYPE_SOCKET) 
                        socket_unlock((struct socket *)fp->f_data, 1);
                else 
#endif /* SOCKETS */
                        PIPE_UNLOCK((struct pipe *)fp->f_data);

                proc_fdlock(p);

                return(EINVAL);
        }
        /* pull the event off the proc's pending queue if it's queued there */
        EVPROCDEQUE(p, evq);

        /* unlink from the object's event list, drop its lock, and free */
#if SOCKETS
        if (fp->f_type == DTYPE_SOCKET) {
                TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
                socket_unlock((struct socket *)fp->f_data, 1);
        } else
#endif /* SOCKETS */
        {
                TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
                PIPE_UNLOCK((struct pipe *)fp->f_data);
        }
        FREE(evq, M_TEMP);

        proc_fdlock(p);

        return(0);
}
 2768 
 2769 
 2770 /*
 2771  * gethostuuid
 2772  *
 2773  * Description: Get the host UUID from IOKit and return it to user space.
 2774  *
 2775  * Parameters:  uuid_buf                Pointer to buffer to receive UUID
 2776  *              timeout                 Timespec for timout
 2777  *
 2778  * Returns:     0                       Success
 2779  *              EWOULDBLOCK             Timeout is too short
 2780  *              copyout:EFAULT          Bad user buffer
 2781  *
 2782  * Notes:       A timeout seems redundant, since if it's tolerable to not
 2783  *              have a system UUID in hand, then why ask for one?
 2784  */
 2785 int
 2786 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
 2787 {
 2788         kern_return_t kret;
 2789         int error;
 2790         mach_timespec_t mach_ts;        /* for IOKit call */
 2791         __darwin_uuid_t uuid_kern;      /* for IOKit call */
 2792 
 2793         /* Convert the 32/64 bit timespec into a mach_timespec_t */
 2794         if ( proc_is64bit(p) ) {
 2795                 struct user64_timespec ts;
 2796                 error = copyin(uap->timeoutp, &ts, sizeof(ts));
 2797                 if (error)
 2798                         return (error);
 2799                 mach_ts.tv_sec = ts.tv_sec;
 2800                 mach_ts.tv_nsec = ts.tv_nsec;
 2801         } else {
 2802                 struct user32_timespec ts;
 2803                 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
 2804                 if (error)
 2805                         return (error);
 2806                 mach_ts.tv_sec = ts.tv_sec;
 2807                 mach_ts.tv_nsec = ts.tv_nsec;
 2808         }
 2809 
 2810         /* Call IOKit with the stack buffer to get the UUID */
 2811         kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
 2812 
 2813         /*
 2814          * If we get it, copy out the data to the user buffer; note that a
 2815          * uuid_t is an array of characters, so this is size invariant for
 2816          * 32 vs. 64 bit.
 2817          */
 2818         if (kret == KERN_SUCCESS) {
 2819                 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
 2820         } else {
 2821                 error = EWOULDBLOCK;
 2822         }
 2823 
 2824         return (error);
 2825 }
 2826 
 2827 /*
 2828  * ledger
 2829  *
 2830  * Description: Omnibus system call for ledger operations
 2831  */
 2832 int
 2833 ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
 2834 {
 2835         int rval, pid, len, error;
 2836 #ifdef LEDGER_DEBUG
 2837         struct ledger_limit_args lla;
 2838 #endif
 2839         task_t task;
 2840         proc_t proc;
 2841 
 2842         /* Finish copying in the necessary args before taking the proc lock */
 2843         error = 0;
 2844         len = 0;
 2845         if (args->cmd == LEDGER_ENTRY_INFO)
 2846                 error = copyin(args->arg3, (char *)&len, sizeof (len));
 2847         else if (args->cmd == LEDGER_TEMPLATE_INFO)
 2848                 error = copyin(args->arg2, (char *)&len, sizeof (len));
 2849 #ifdef LEDGER_DEBUG
 2850         else if (args->cmd == LEDGER_LIMIT)
 2851                 error = copyin(args->arg2, (char *)&lla, sizeof (lla));
 2852 #endif
 2853         if (error)
 2854                 return (error);
 2855         if (len < 0)
 2856                 return (EINVAL);
 2857 
 2858         rval = 0;
 2859         if (args->cmd != LEDGER_TEMPLATE_INFO) {
 2860                 pid = args->arg1;
 2861                 proc = proc_find(pid);
 2862                 if (proc == NULL)
 2863                         return (ESRCH);
 2864 
 2865 #if CONFIG_MACF
 2866                 error = mac_proc_check_ledger(p, proc, args->cmd);
 2867                 if (error) {
 2868                         proc_rele(proc);
 2869                         return (error);
 2870                 }
 2871 #endif
 2872 
 2873                 task = proc->task;
 2874         }
 2875                 
 2876         switch (args->cmd) {
 2877 #ifdef LEDGER_DEBUG
 2878                 case LEDGER_LIMIT: {
 2879                         if (!is_suser())
 2880                                 rval = EPERM;
 2881                         rval = ledger_limit(task, &lla);
 2882                         proc_rele(proc);
 2883                         break;
 2884                 }
 2885 #endif
 2886                 case LEDGER_INFO: {
 2887                         struct ledger_info info;
 2888 
 2889                         rval = ledger_info(task, &info);
 2890                         proc_rele(proc);
 2891                         if (rval == 0)
 2892                                 rval = copyout(&info, args->arg2,
 2893                                     sizeof (info));
 2894                         break;
 2895                 }
 2896 
 2897                 case LEDGER_ENTRY_INFO: {
 2898                         void *buf;
 2899                         int sz;
 2900 
 2901                         rval = ledger_entry_info(task, &buf, &len);
 2902                         proc_rele(proc);
 2903                         if ((rval == 0) && (len > 0)) {
 2904                                 sz = len * sizeof (struct ledger_entry_info);
 2905                                 rval = copyout(buf, args->arg2, sz);
 2906                                 kfree(buf, sz);
 2907                         }
 2908                         if (rval == 0)
 2909                                 rval = copyout(&len, args->arg3, sizeof (len));
 2910                         break;
 2911                 }
 2912 
 2913                 case LEDGER_TEMPLATE_INFO: {
 2914                         void *buf;
 2915                         int sz;
 2916 
 2917                         rval = ledger_template_info(&buf, &len);
 2918                         if ((rval == 0) && (len > 0)) {
 2919                                 sz = len * sizeof (struct ledger_template_info);
 2920                                 rval = copyout(buf, args->arg1, sz);
 2921                                 kfree(buf, sz);
 2922                         }
 2923                         if (rval == 0)
 2924                                 rval = copyout(&len, args->arg2, sizeof (len));
 2925                         break;
 2926                 }
 2927 
 2928                 default:
 2929                         rval = EINVAL;
 2930         }
 2931 
 2932         return (rval);
 2933 }

Cache object: 0be530c2a57938738f0d7afaa857171d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.