
FreeBSD/Linux Kernel Cross Reference
sys/bsd/kern/sys_generic.c


    1 /*
    2  * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
    3  *
    4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
    5  * 
    6  * This file contains Original Code and/or Modifications of Original Code
    7  * as defined in and that are subject to the Apple Public Source License
    8  * Version 2.0 (the 'License'). You may not use this file except in
    9  * compliance with the License. The rights granted to you under the License
   10  * may not be used to create, or enable the creation or redistribution of,
   11  * unlawful or unlicensed copies of an Apple operating system, or to
   12  * circumvent, violate, or enable the circumvention or violation of, any
   13  * terms of an Apple operating system software license agreement.
   14  * 
   15  * Please obtain a copy of the License at
   16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
   17  * 
   18  * The Original Code and all software distributed under the License are
   19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   23  * Please see the License for the specific language governing rights and
   24  * limitations under the License.
   25  * 
   26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   27  */
   28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
   29 /*
   30  * Copyright (c) 1982, 1986, 1989, 1993
   31  *      The Regents of the University of California.  All rights reserved.
   32  * (c) UNIX System Laboratories, Inc.
   33  * All or some portions of this file are derived from material licensed
   34  * to the University of California by American Telephone and Telegraph
   35  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   36  * the permission of UNIX System Laboratories, Inc.
   37  *
   38  * Redistribution and use in source and binary forms, with or without
   39  * modification, are permitted provided that the following conditions
   40  * are met:
   41  * 1. Redistributions of source code must retain the above copyright
   42  *    notice, this list of conditions and the following disclaimer.
   43  * 2. Redistributions in binary form must reproduce the above copyright
   44  *    notice, this list of conditions and the following disclaimer in the
   45  *    documentation and/or other materials provided with the distribution.
   46  * 3. All advertising materials mentioning features or use of this software
   47  *    must display the following acknowledgement:
   48  *      This product includes software developed by the University of
   49  *      California, Berkeley and its contributors.
   50  * 4. Neither the name of the University nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   64  * SUCH DAMAGE.
   65  *
   66  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
   67  */
   68 /*
   69  * NOTICE: This file was modified by SPARTA, Inc. in 2006 to introduce
   70  * support for mandatory and extensible security protections.  This notice
   71  * is included in support of clause 2.2 (b) of the Apple Public License,
   72  * Version 2.0.
   73  */
   74 
   75 #include <sys/param.h>
   76 #include <sys/systm.h>
   77 #include <sys/filedesc.h>
   78 #include <sys/ioctl.h>
   79 #include <sys/file_internal.h>
   80 #include <sys/proc_internal.h>
   81 #include <sys/socketvar.h>
   82 #include <sys/uio_internal.h>
   83 #include <sys/kernel.h>
   84 #include <sys/stat.h>
   85 #include <sys/malloc.h>
   86 #include <sys/sysproto.h>
   87 
   88 #include <sys/mount_internal.h>
   89 #include <sys/protosw.h>
   90 #include <sys/ev.h>
   91 #include <sys/user.h>
   92 #include <sys/kdebug.h>
   93 #include <sys/poll.h>
   94 #include <sys/event.h>
   95 #include <sys/eventvar.h>
   96 
   97 #include <mach/mach_types.h>
   98 #include <kern/kern_types.h>
   99 #include <kern/assert.h>
  100 #include <kern/kalloc.h>
  101 #include <kern/thread.h>
  102 #include <kern/clock.h>
  103 
  104 #include <sys/mbuf.h>
  105 #include <sys/socket.h>
  106 #include <sys/socketvar.h>
  107 #include <sys/errno.h>
  108 #include <sys/syscall.h>
  109 #include <sys/pipe.h>
  110 
  111 #include <security/audit/audit.h>
  112 
  113 #include <net/if.h>
  114 #include <net/route.h>
  115 
  116 #include <netinet/in.h>
  117 #include <netinet/in_systm.h>
  118 #include <netinet/ip.h>
  119 #include <netinet/in_pcb.h>
  120 #include <netinet/ip_var.h>
  121 #include <netinet/ip6.h>
  122 #include <netinet/tcp.h>
  123 #include <netinet/tcp_fsm.h>
  124 #include <netinet/tcp_seq.h>
  125 #include <netinet/tcp_timer.h>
  126 #include <netinet/tcp_var.h>
  127 #include <netinet/tcpip.h>
  128 #include <netinet/tcp_debug.h>
  129 /* for wait queue based select */
  130 #include <kern/wait_queue.h>
  131 #include <kern/kalloc.h>
  132 #include <sys/vnode_internal.h>
  133 
  134 /* XXX should be in a header file somewhere */
  135 void evsofree(struct socket *);
  136 void evpipefree(struct pipe *);
  137 void postpipeevent(struct pipe *, int);
  138 void postevent(struct socket *, struct sockbuf *, int);
  139 extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp);
  140 
  141 int rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
  142 int wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval);
  143 extern void     *get_bsduthreadarg(thread_t);
  144 extern int      *get_bsduthreadrval(thread_t);
  145 
  146 __private_extern__ int  dofileread(vfs_context_t ctx, struct fileproc *fp,
  147                                                                    user_addr_t bufp, user_size_t nbyte, 
  148                                                                    off_t offset, int flags, user_ssize_t *retval);
  149 __private_extern__ int  dofilewrite(vfs_context_t ctx, struct fileproc *fp,
  150                                                                         user_addr_t bufp, user_size_t nbyte, 
  151                                                                         off_t offset, int flags, user_ssize_t *retval);
  152 __private_extern__ int  preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode);
  153 __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd);
  154 
  155 
  156 /* Conflict wait queue for when selects collide (opaque type) */
  157 struct wait_queue select_conflict_queue;
  158 
  159 /*
  160  * Init routine called from bsd_init.c
  161  */
  162 void select_wait_queue_init(void);
  163 void
  164 select_wait_queue_init(void)
  165 {
  166         wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO);
  167 }
  168 
  169 
  170 #if NETAT
  171 extern int appletalk_inited;
  172 #endif /* NETAT */
  173 
  174 #define f_flag f_fglob->fg_flag
  175 #define f_type f_fglob->fg_type
  176 #define f_msgcount f_fglob->fg_msgcount
  177 #define f_cred f_fglob->fg_cred
  178 #define f_ops f_fglob->fg_ops
  179 #define f_offset f_fglob->fg_offset
  180 #define f_data f_fglob->fg_data
  181 
  182 /*
  183  * Read system call.
  184  *
  185  * Returns:     0                       Success
  186  *      preparefileread:EBADF
  187  *      preparefileread:ESPIPE
  188  *      preparefileread:ENXIO
  189  *      preparefileread:EBADF
  190  *      dofileread:???
  191  */
  192 int
  193 read(struct proc *p, struct read_args *uap, user_ssize_t *retval)
  194 {
  195         __pthread_testcancel(1);
  196         return(read_nocancel(p, (struct read_nocancel_args *)uap, retval));
  197 }
  198 
  199 int
  200 read_nocancel(struct proc *p, struct read_nocancel_args *uap, user_ssize_t *retval)
  201 {
  202         struct fileproc *fp;
  203         int error;
  204         int fd = uap->fd;
  205         struct vfs_context context;
  206 
  207         if ( (error = preparefileread(p, &fp, fd, 0)) )
  208                 return (error);
  209 
  210         context = *(vfs_context_current());
  211         context.vc_ucred = fp->f_fglob->fg_cred;
  212 
  213         error = dofileread(&context, fp, uap->cbuf, uap->nbyte,
  214                            (off_t)-1, 0, retval);
  215 
  216         donefileread(p, fp, fd);
  217 
  218         return (error);
  219 }
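
/*
 * A minimal userland sketch (illustration, not part of this file):
 * read(2) is a pthread cancellation point because the wrapper above
 * calls __pthread_testcancel(1) before doing the I/O, so a thread
 * blocked in read() can be cancelled cleanly.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *reader(void *arg)
{
        char buf[128];
        int fd = *(int *)arg;

        /* Blocks here; pthread_cancel() terminates the thread inside read(). */
        ssize_t n = read(fd, buf, sizeof(buf));
        printf("read returned %zd\n", n);       /* not reached if cancelled */
        return NULL;
}

int main(void)
{
        int fds[2];
        pthread_t t;

        if (pipe(fds) != 0)
                return 1;
        pthread_create(&t, NULL, reader, &fds[0]);
        sleep(1);               /* let the thread block in read() */
        pthread_cancel(t);      /* read(2) acts as the cancellation point */
        pthread_join(t, NULL);
        return 0;
}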
  220 
  221 /* 
  222  * Pread system call
  223  *
  224  * Returns:     0                       Success
  225  *      preparefileread:EBADF
  226  *      preparefileread:ESPIPE
  227  *      preparefileread:ENXIO
  228  *      preparefileread:EBADF
  229  *      dofileread:???
  230  */
  231 int
  232 pread(struct proc *p, struct pread_args *uap, user_ssize_t *retval)
  233 {
  234         __pthread_testcancel(1);
  235         return(pread_nocancel(p, (struct pread_nocancel_args *)uap, retval));
  236 }
  237 
  238 int
  239 pread_nocancel(struct proc *p, struct pread_nocancel_args *uap, user_ssize_t *retval)
  240 {
  241         struct fileproc *fp = NULL;     /* fp set by preparefileread() */
  242         int fd = uap->fd;
  243         int error;
  244         struct vfs_context context;
  245 
  246         if ( (error = preparefileread(p, &fp, fd, 1)) )
  247                 goto out;
  248 
  249         context = *(vfs_context_current());
  250         context.vc_ucred = fp->f_fglob->fg_cred;
  251 
  252         error = dofileread(&context, fp, uap->buf, uap->nbyte,
  253                         uap->offset, FOF_OFFSET, retval);
  254         
  255         donefileread(p, fp, fd);
  256 
  257         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
  258               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
  259 
  260 out:
  261         return (error);
  262 }
  263 
  264 /*
  265  * Code common for read and pread
  266  */
  267 
  268 void
  269 donefileread(struct proc *p, struct fileproc *fp, int fd)
  270 {
  271         proc_fdlock_spin(p);
  272 
  273         fp->f_flags &= ~FP_INCHRREAD;
  274 
  275         fp_drop(p, fd, fp, 1);
  276         proc_fdunlock(p);
  277 }
  278 
  279 /*
  280  * Returns:     0                       Success
  281  *              EBADF
  282  *              ESPIPE
  283  *              ENXIO
  284  *      fp_lookup:EBADF
  285  *      fo_read:???
  286  */
  287 int
  288 preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_pread)
  289 {
  290         vnode_t vp;
  291         int     error;
  292         struct fileproc *fp;
  293 
  294         AUDIT_ARG(fd, fd);
  295 
  296         proc_fdlock_spin(p);
  297 
  298         error = fp_lookup(p, fd, &fp, 1);
  299 
  300         if (error) {
  301                 proc_fdunlock(p);
  302                 return (error);
  303         }
  304         if ((fp->f_flag & FREAD) == 0) {
  305                 error = EBADF;
  306                 goto out;
  307         }
  308         if (check_for_pread && (fp->f_type != DTYPE_VNODE)) {
  309                 error = ESPIPE;
  310                 goto out;
  311         }
  312         if (fp->f_type == DTYPE_VNODE) {
  313                 vp = (struct vnode *)fp->f_fglob->fg_data;
  314 
  315                 if (check_for_pread && (vnode_isfifo(vp))) {
  316                         error = ESPIPE;
  317                         goto out;
  318                 } 
  319                 if (check_for_pread && (vp->v_flag & VISTTY)) {
  320                         error = ENXIO;
  321                         goto out;
  322                 }
  323                 if (vp->v_type == VCHR)
  324                         fp->f_flags |= FP_INCHRREAD;
  325         }
  326 
  327         *fp_ret = fp;
  328 
  329         proc_fdunlock(p);
  330         return (0);
  331 
  332 out:
  333         fp_drop(p, fd, fp, 1);
  334         proc_fdunlock(p);
  335         return (error);
  336 }
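
/*
 * A minimal userland sketch (illustration, not part of this file):
 * the check_for_pread tests above are why pread(2) fails with ESPIPE
 * on non-seekable descriptors such as pipes.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];
        char c;

        if (pipe(fds) != 0)
                return 1;
        if (pread(fds[0], &c, 1, 0) == -1 && errno == ESPIPE)
                printf("pread on a pipe fails with ESPIPE\n");
        return 0;
}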
  337 
  338 
  339 /*
  340  * Returns:     0                       Success
  341  *              EINVAL
  342  *      fo_read:???
  343  */
  344 __private_extern__ int
  345 dofileread(vfs_context_t ctx, struct fileproc *fp,
  346            user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
  347            user_ssize_t *retval)
  348 {
  349         uio_t auio;
  350         user_ssize_t bytecnt;
  351         long error = 0;
  352         char uio_buf[ UIO_SIZEOF(1) ];
  353 
  354         if (nbyte > INT_MAX)
  355                 return (EINVAL);
  356 
  357         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
  358                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_READ, 
  359                                                                           &uio_buf[0], sizeof(uio_buf));
  360         } else {
  361                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_READ, 
  362                                                                           &uio_buf[0], sizeof(uio_buf));
  363         }
  364         uio_addiov(auio, bufp, nbyte);
  365 
  366         bytecnt = nbyte;
  367 
  368         if ((error = fo_read(fp, auio, flags, ctx))) {
  369                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
  370                         error == EINTR || error == EWOULDBLOCK))
  371                         error = 0;
  372         }
  373         bytecnt -= uio_resid(auio);
  374 
  375         *retval = bytecnt;
  376 
  377         return (error);
  378 }
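
/*
 * A minimal userland sketch (illustration, not part of this file):
 * because dofileread() suppresses ERESTART/EINTR/EWOULDBLOCK once some
 * data has transferred, callers see a short count rather than an error,
 * so portable code loops until it has everything it asked for.
 */
#include <errno.h>
#include <unistd.h>

ssize_t read_fully(int fd, void *buf, size_t len)       /* hypothetical helper */
{
        size_t off = 0;

        while (off < len) {
                ssize_t n = read(fd, (char *)buf + off, len - off);
                if (n == 0)
                        break;                  /* EOF */
                if (n < 0) {
                        if (errno == EINTR)     /* interrupted before any transfer */
                                continue;
                        return -1;
                }
                off += n;                       /* may be a short count */
        }
        return (ssize_t)off;
}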
  379 
  380 /*      
  381  * Scatter read system call.
  382  *
  383  * Returns:     0                       Success
  384  *              EINVAL
  385  *              ENOMEM
  386  *      copyin:EFAULT
  387  *      rd_uio:???
  388  */
  389 int
  390 readv(struct proc *p, struct readv_args *uap, user_ssize_t *retval)
  391 {
  392         __pthread_testcancel(1);
  393         return(readv_nocancel(p, (struct readv_nocancel_args *)uap, retval));
  394 }
  395 
  396 int
  397 readv_nocancel(struct proc *p, struct readv_nocancel_args *uap, user_ssize_t *retval)
  398 {
  399         uio_t auio = NULL;
  400         int error;
  401         struct user_iovec *iovp;
  402 
   403         /* Verify range before calling uio_create() */
  404         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
  405                 return (EINVAL);
  406 
  407         /* allocate a uio large enough to hold the number of iovecs passed */
  408         auio = uio_create(uap->iovcnt, 0,
  409                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
  410                                   UIO_READ);
  411                                   
  412         /* get location of iovecs within the uio.  then copyin the iovecs from
  413          * user space.
  414          */
  415         iovp = uio_iovsaddr(auio);
  416         if (iovp == NULL) {
  417                 error = ENOMEM;
  418                 goto ExitThisRoutine;
  419         }
  420         error = copyin_user_iovec_array(uap->iovp,
  421                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
  422                 uap->iovcnt, iovp);
  423         if (error) {
  424                 goto ExitThisRoutine;
  425         }
  426         
  427         /* finalize uio_t for use and do the IO 
  428          */
  429         uio_calculateresid(auio);
  430         error = rd_uio(p, uap->fd, auio, retval);
  431 
  432 ExitThisRoutine:
  433         if (auio != NULL) {
  434                 uio_free(auio);
  435         }
  436         return (error);
  437 }
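
/*
 * A minimal userland sketch (illustration, not part of this file): a
 * scatter read with readv(2), which lands in readv_nocancel() above;
 * the kernel copies the iovec array in and performs one I/O across
 * both buffers.
 */
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
        char hdr[4], body[12];
        struct iovec iov[2] = {
                { .iov_base = hdr,  .iov_len = sizeof(hdr)  },
                { .iov_base = body, .iov_len = sizeof(body) },
        };
        ssize_t n = readv(STDIN_FILENO, iov, 2);

        if (n >= 0)
                printf("scattered %zd bytes across two buffers\n", n);
        return 0;
}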
  438 
  439 /*
  440  * Write system call
  441  *
  442  * Returns:     0                       Success
  443  *              EBADF
  444  *      fp_lookup:EBADF
  445  *      dofilewrite:???
  446  */
  447 int
  448 write(struct proc *p, struct write_args *uap, user_ssize_t *retval)
  449 {
  450         __pthread_testcancel(1);
  451         return(write_nocancel(p, (struct write_nocancel_args *)uap, retval));
  452 
  453 }
  454 
  455 int
  456 write_nocancel(struct proc *p, struct write_nocancel_args *uap, user_ssize_t *retval)
  457 {
  458         struct fileproc *fp;
  459         int error;      
  460         int fd = uap->fd;
  461 
  462         AUDIT_ARG(fd, fd);
  463 
  464         error = fp_lookup(p,fd,&fp,0);
  465         if (error)
  466                 return(error);
  467         if ((fp->f_flag & FWRITE) == 0) {
  468                 error = EBADF;
  469         } else {
  470                 struct vfs_context context = *(vfs_context_current());
  471                 context.vc_ucred = fp->f_fglob->fg_cred;
  472 
  473                 error = dofilewrite(&context, fp, uap->cbuf, uap->nbyte,
  474                         (off_t)-1, 0, retval);
  475         }
  476         if (error == 0)
  477                 fp_drop_written(p, fd, fp);
  478         else
  479                 fp_drop(p, fd, fp, 0);
  480         return(error);  
  481 }
  482 
  483 /*                          
  484  * pwrite system call
  485  *
  486  * Returns:     0                       Success
  487  *              EBADF
  488  *              ESPIPE
  489  *              ENXIO
  490  *              EINVAL
  491  *      fp_lookup:EBADF
  492  *      dofilewrite:???
  493  */
  494 int
  495 pwrite(struct proc *p, struct pwrite_args *uap, user_ssize_t *retval)
  496 {
  497         __pthread_testcancel(1);
  498         return(pwrite_nocancel(p, (struct pwrite_nocancel_args *)uap, retval));
  499 }
  500 
  501 int
  502 pwrite_nocancel(struct proc *p, struct pwrite_nocancel_args *uap, user_ssize_t *retval)
  503 {
  504         struct fileproc *fp;
  505         int error; 
  506         int fd = uap->fd;
  507         vnode_t vp  = (vnode_t)0;
  508 
  509         AUDIT_ARG(fd, fd);
  510 
  511         error = fp_lookup(p,fd,&fp,0);
  512         if (error)
  513                 return(error);
  514 
  515         if ((fp->f_flag & FWRITE) == 0) {
  516                 error = EBADF;
  517         } else {
  518                 struct vfs_context context = *vfs_context_current();
  519                 context.vc_ucred = fp->f_fglob->fg_cred;
  520 
  521                 if (fp->f_type != DTYPE_VNODE) {
  522                         error = ESPIPE;
  523                         goto errout;
  524                 }
  525                 vp = (vnode_t)fp->f_fglob->fg_data;
  526                 if (vnode_isfifo(vp)) {
  527                         error = ESPIPE;
  528                         goto errout;
  529                 } 
  530                 if ((vp->v_flag & VISTTY)) {
  531                         error = ENXIO;
  532                         goto errout;
  533                 }
  534                 if (uap->offset == (off_t)-1) {
  535                         error = EINVAL;
  536                         goto errout;
  537                 }
  538 
   539                 error = dofilewrite(&context, fp, uap->buf, uap->nbyte,
  540                         uap->offset, FOF_OFFSET, retval);
  541         }
  542 errout:
  543         if (error == 0)
  544                 fp_drop_written(p, fd, fp);
  545         else
  546                 fp_drop(p, fd, fp, 0);
  547 
  548         KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
  549               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
  550         
  551         return(error);
  552 }
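
/*
 * A minimal userland sketch (illustration, not part of this file):
 * pwrite(2) writes at an explicit offset (FOF_OFFSET above) without
 * moving the descriptor's file offset, which makes it safe for
 * concurrent writers sharing one descriptor.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/pwrite_demo", O_RDWR | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
                return 1;
        (void)write(fd, "XXXXXXXX", 8);         /* file offset is now 8 */
        (void)pwrite(fd, "ab", 2, 4);           /* patch bytes 4-5 in place */
        printf("offset still %lld\n", (long long)lseek(fd, 0, SEEK_CUR));
        close(fd);
        return 0;
}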
  553 
  554 /*
  555  * Returns:     0                       Success
  556  *              EINVAL
  557  *      <fo_write>:EPIPE
  558  *      <fo_write>:???                  [indirect through struct fileops]
  559  */
  560 __private_extern__ int                  
  561 dofilewrite(vfs_context_t ctx, struct fileproc *fp,
  562             user_addr_t bufp, user_size_t nbyte, off_t offset, int flags,
  563             user_ssize_t *retval)
  564 {       
  565         uio_t auio;
  566         long error = 0;
  567         user_ssize_t bytecnt;
  568         char uio_buf[ UIO_SIZEOF(1) ];
  569 
  570         if (nbyte > INT_MAX)   
  571                 return (EINVAL);
  572 
  573         if (IS_64BIT_PROCESS(vfs_context_proc(ctx))) {
  574                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE64, UIO_WRITE, 
  575                                                                           &uio_buf[0], sizeof(uio_buf));
  576         } else {
  577                 auio = uio_createwithbuffer(1, offset, UIO_USERSPACE32, UIO_WRITE, 
  578                                                                           &uio_buf[0], sizeof(uio_buf));
  579         }
  580         uio_addiov(auio, bufp, nbyte);
  581 
  582         bytecnt = nbyte; 
  583         if ((error = fo_write(fp, auio, flags, ctx))) {
  584                 if (uio_resid(auio) != bytecnt && (error == ERESTART ||
  585                         error == EINTR || error == EWOULDBLOCK))
  586                         error = 0;
  587                 /* The socket layer handles SIGPIPE */
  588                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
  589                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) {
  590                         /* XXX Raise the signal on the thread? */
  591                         psignal(vfs_context_proc(ctx), SIGPIPE);
  592                 }
  593         }
  594         bytecnt -= uio_resid(auio);
  595         *retval = bytecnt;
  596 
  597         return (error); 
  598 }
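
/*
 * A minimal userland sketch (illustration, not part of this file): the
 * psignal() above is what delivers SIGPIPE on a write to a pipe with no
 * reader.  If the signal is ignored the caller sees EPIPE instead; on
 * Darwin the same effect is available per descriptor through the
 * F_SETNOSIGPIPE fcntl, which sets the FG_NOSIGPIPE flag tested above.
 */
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fds[2];

        if (pipe(fds) != 0)
                return 1;
        close(fds[0]);                  /* no reader remains */
        signal(SIGPIPE, SIG_IGN);       /* suppress the default-fatal signal */
        if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
                printf("write failed with EPIPE instead of raising SIGPIPE\n");
        return 0;
}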
  599         
  600 /*      
  601  * Gather write system call  
  602  */     
  603 int
  604 writev(struct proc *p, struct writev_args *uap, user_ssize_t *retval)
  605 {
  606         __pthread_testcancel(1);
  607         return(writev_nocancel(p, (struct writev_nocancel_args *)uap, retval));
  608 }
  609 
  610 int
  611 writev_nocancel(struct proc *p, struct writev_nocancel_args *uap, user_ssize_t *retval)
  612 {
  613         uio_t auio = NULL;
  614         int error;
  615         struct user_iovec *iovp;
  616 
  617         AUDIT_ARG(fd, uap->fd);
  618 
   619         /* Verify range before calling uio_create() */
  620         if (uap->iovcnt <= 0 || uap->iovcnt > UIO_MAXIOV)
  621                 return (EINVAL);
  622 
  623         /* allocate a uio large enough to hold the number of iovecs passed */
  624         auio = uio_create(uap->iovcnt, 0,
  625                                   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
  626                                   UIO_WRITE);
  627                                   
  628         /* get location of iovecs within the uio.  then copyin the iovecs from
  629          * user space.
  630          */
  631         iovp = uio_iovsaddr(auio);
  632         if (iovp == NULL) {
  633                 error = ENOMEM;
  634                 goto ExitThisRoutine;
  635         }
  636         error = copyin_user_iovec_array(uap->iovp,
  637                 IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32,
  638                 uap->iovcnt, iovp);
  639         if (error) {
  640                 goto ExitThisRoutine;
  641         }
  642         
  643         /* finalize uio_t for use and do the IO 
  644          */
  645         uio_calculateresid(auio);
  646         error = wr_uio(p, uap->fd, auio, retval);
  647 
  648 ExitThisRoutine:
  649         if (auio != NULL) {
  650                 uio_free(auio);
  651         }
  652         return (error);
  653 }
  654 
  655 
  656 int
  657 wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
  658 {
  659         struct fileproc *fp;
  660         int error;
  661         user_ssize_t count;
  662         struct vfs_context context = *vfs_context_current();
  663 
  664         error = fp_lookup(p,fdes,&fp,0);
  665         if (error)
  666                 return(error);
  667 
  668         if ((fp->f_flag & FWRITE) == 0) {
  669                 error = EBADF;
  670                 goto out;
  671         }
  672         count = uio_resid(uio);
  673 
  674         context.vc_ucred = fp->f_cred;
  675         error = fo_write(fp, uio, 0, &context);
  676         if (error) {
  677                 if (uio_resid(uio) != count && (error == ERESTART ||
  678                                                 error == EINTR || error == EWOULDBLOCK))
  679                         error = 0;
  680                 /* The socket layer handles SIGPIPE */
  681                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET &&
  682                     (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0)
  683                         psignal(p, SIGPIPE);
  684         }
  685         *retval = count - uio_resid(uio);
  686 
  687 out:
  688         if (error == 0)
  689                 fp_drop_written(p, fdes, fp);
  690         else
  691                 fp_drop(p, fdes, fp, 0);
  692         return(error);
  693 }
  694 
  695 
  696 int
  697 rd_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval)
  698 {
  699         struct fileproc *fp;
  700         int error;
  701         user_ssize_t count;
  702         struct vfs_context context = *vfs_context_current();
  703 
  704         if ( (error = preparefileread(p, &fp, fdes, 0)) )
  705                 return (error);
  706 
  707         count = uio_resid(uio);
  708 
  709         context.vc_ucred = fp->f_cred;
  710 
  711         error = fo_read(fp, uio, 0, &context);
  712 
  713         if (error) {
  714                 if (uio_resid(uio) != count && (error == ERESTART ||
  715                                                 error == EINTR || error == EWOULDBLOCK))
  716                         error = 0;
  717         }
  718         *retval = count - uio_resid(uio);
  719 
  720         donefileread(p, fp, fdes);
  721 
  722         return (error);
  723 }
  724 
  725 /*
  726  * Ioctl system call
  727  *
  728  * Returns:     0                       Success
  729  *              EBADF
  730  *              ENOTTY
  731  *              ENOMEM
  732  *              ESRCH
  733  *      copyin:EFAULT
   734  *      copyout:EFAULT
  735  *      fp_lookup:EBADF                 Bad file descriptor
  736  *      fo_ioctl:???
  737  */
  738 int
  739 ioctl(struct proc *p, struct ioctl_args *uap, __unused int32_t *retval)
  740 {
  741         struct fileproc *fp;
  742         u_long com;
  743         int error = 0;
  744         u_int size;
  745         caddr_t datap, memp;
  746         boolean_t is64bit;
  747         int tmp;
  748 #define STK_PARAMS      128
  749         char stkbuf[STK_PARAMS];
  750         int fd = uap->fd;
  751         struct vfs_context context = *vfs_context_current();
  752 
  753         AUDIT_ARG(fd, uap->fd);
  754         AUDIT_ARG(addr, uap->data);
  755 
  756         is64bit = proc_is64bit(p);
  757 #if CONFIG_AUDIT
  758         if (is64bit)
  759                 AUDIT_ARG(value64, uap->com);
  760         else
  761                 AUDIT_ARG(cmd, CAST_DOWN_EXPLICIT(int, uap->com));
  762 #endif /* CONFIG_AUDIT */
  763 
  764         proc_fdlock(p);
  765         error = fp_lookup(p,fd,&fp,1);
  766         if (error)  {
  767                 proc_fdunlock(p);
  768                 return(error);
  769         }
  770 
  771         AUDIT_ARG(file, p, fp);
  772 
  773         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
   774                 error = EBADF;
   775                 goto out;
  776         }
  777 
  778         context.vc_ucred = fp->f_fglob->fg_cred;
  779 
  780 #if CONFIG_MACF
  781         error = mac_file_check_ioctl(context.vc_ucred, fp->f_fglob, uap->com);
  782         if (error)
  783                 goto out;
  784 #endif
  785                 
  786 #if NETAT
  787         /*
  788          * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
  789          * while implementing an ATioctl system call
  790          */
  791         {
  792                 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
  793                         u_long  fixed_command;
  794 
  795 #ifdef APPLETALK_DEBUG
  796                         kprintf("ioctl: special AppleTalk \n");
  797 #endif
  798                         datap = &stkbuf[0];
  799                         *(user_addr_t *)datap = uap->data;
  800                         fixed_command = _IOW(0, 0xff99, uap->data);
  801                         error = fo_ioctl(fp, fixed_command, datap, &context);
  802                         goto out;
  803                 }
  804         }
  805 
  806 #endif /* NETAT */
  807 
  808 
  809         switch (com = uap->com) {
  810         case FIONCLEX:
  811                 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
   812                 error = 0;
  813                 goto out;
  814         case FIOCLEX:
  815                 *fdflags(p, uap->fd) |= UF_EXCLOSE;
   816                 error = 0;
  817                 goto out;
  818         }
  819 
  820         /*
  821          * Interpret high order word to find amount of data to be
  822          * copied to/from the user's address space.
  823          */
  824         size = IOCPARM_LEN(com);
  825         if (size > IOCPARM_MAX) {
   826                 error = ENOTTY;
   827                 goto out;
  828         }
  829         memp = NULL;
  830         if (size > sizeof (stkbuf)) {
  831                 proc_fdunlock(p);
  832                 if ((memp = (caddr_t)kalloc(size)) == 0) {
  833                         proc_fdlock(p);
  834                         error = ENOMEM;
  835                         goto out;
  836                 }
  837                 proc_fdlock(p);
  838                 datap = memp;
  839         } else
  840                 datap = &stkbuf[0];
  841         if (com&IOC_IN) {
  842                 if (size) {
  843                         proc_fdunlock(p);
  844                         error = copyin(uap->data, datap, size);
  845                         if (error) {
  846                                 if (memp)
  847                                         kfree(memp, size);
  848                                 proc_fdlock(p);
  849                                 goto out;
  850                         }
  851                         proc_fdlock(p);
  852                 } else {
   853                         /* XXX - IOC_IN and no size?  we should probably return an error here!! */
  854                         if (is64bit) {
  855                                 *(user_addr_t *)datap = uap->data;
  856                         }
  857                         else {
  858                                 *(uint32_t *)datap = (uint32_t)uap->data;
  859                         }
  860                 }
  861         } else if ((com&IOC_OUT) && size)
  862                 /*
  863                  * Zero the buffer so the user always
  864                  * gets back something deterministic.
  865                  */
  866                 bzero(datap, size);
  867         else if (com&IOC_VOID) {
  868                 /* XXX - this is odd since IOC_VOID means no parameters */
  869                 if (is64bit) {
  870                         *(user_addr_t *)datap = uap->data;
  871                 }
  872                 else {
  873                         *(uint32_t *)datap = (uint32_t)uap->data;
  874                 }
  875         }
  876 
  877         switch (com) {
  878 
  879         case FIONBIO:
  880                 if ( (tmp = *(int *)datap) )
  881                         fp->f_flag |= FNONBLOCK;
  882                 else
  883                         fp->f_flag &= ~FNONBLOCK;
  884                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, &context);
  885                 break;
  886 
  887         case FIOASYNC:
  888                 if ( (tmp = *(int *)datap) )
  889                         fp->f_flag |= FASYNC;
  890                 else
  891                         fp->f_flag &= ~FASYNC;
  892                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, &context);
  893                 break;
  894 
  895         case FIOSETOWN:
  896                 tmp = *(int *)datap;
  897                 if (fp->f_type == DTYPE_SOCKET) {
  898                         ((struct socket *)fp->f_data)->so_pgid = tmp;
  899                         error = 0;
  900                         break;
  901                 }
  902                 if (fp->f_type == DTYPE_PIPE) {
  903                         error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
  904                         break;
  905                 }
  906                 if (tmp <= 0) {
  907                         tmp = -tmp;
  908                 } else {
  909                         struct proc *p1 = proc_find(tmp);
  910                         if (p1 == 0) {
  911                                 error = ESRCH;
  912                                 break;
  913                         }
  914                         tmp = p1->p_pgrpid;
  915                         proc_rele(p1);
  916                 }
  917                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context);
  918                 break;
  919 
  920         case FIOGETOWN:
  921                 if (fp->f_type == DTYPE_SOCKET) {
  922                         error = 0;
  923                         *(int *)datap = ((struct socket *)fp->f_data)->so_pgid;
  924                         break;
  925                 }
  926                 error = fo_ioctl(fp, TIOCGPGRP, datap, &context);
  927                 *(int *)datap = -*(int *)datap;
  928                 break;
  929 
  930         default:
  931                 error = fo_ioctl(fp, com, datap, &context);
  932                 /*
  933                  * Copy any data to user, size was
  934                  * already set and checked above.
  935                  */
  936                 if (error == 0 && (com&IOC_OUT) && size)
  937                         error = copyout(datap, uap->data, (u_int)size);
  938                 break;
  939         }
  940         proc_fdunlock(p);
  941         if (memp)
  942                 kfree(memp, size);
  943         proc_fdlock(p);
  944 out:
  945         fp_drop(p, fd, fp, 1);
  946         proc_fdunlock(p);
  947         return(error);
  948 }
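
/*
 * A minimal userland sketch (illustration, not part of this file):
 * FIONBIO is handled in the switch above by toggling FNONBLOCK on the
 * fileproc before forwarding to fo_ioctl(); from userland it has the
 * same effect as setting O_NONBLOCK with fcntl(2).
 */
#include <sys/ioctl.h>

int set_nonblocking(int fd, int on)     /* hypothetical helper */
{
        /* on != 0 enables non-blocking I/O, on == 0 disables it */
        return ioctl(fd, FIONBIO, &on);
}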
  949 
  950 int     selwait, nselcoll;
  951 #define SEL_FIRSTPASS 1
  952 #define SEL_SECONDPASS 2
  953 extern int selcontinue(int error);
  954 extern int selprocess(int error, int sel_pass);
  955 static int selscan(struct proc *p, struct _select * sel,
  956                         int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub);
  957 static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count);
  958 static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount);
  959 static int seldrop(struct proc *p, u_int32_t *ibits, int nfd);
  960 
  961 /*
  962  * Select system call.
  963  *
  964  * Returns:     0                       Success
  965  *              EINVAL                  Invalid argument
  966  *              EAGAIN                  Nonconformant error if allocation fails
  967  *      selprocess:???
  968  */
  969 int
  970 select(struct proc *p, struct select_args *uap, int32_t *retval)
  971 {
  972         __pthread_testcancel(1);
  973         return(select_nocancel(p, (struct select_nocancel_args *)uap, retval));
  974 }
  975 
  976 int
  977 select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retval)
  978 {
  979         int error = 0;
  980         u_int ni, nw, size;
  981         thread_t th_act;
  982         struct uthread  *uth;
  983         struct _select *sel;
  984         int needzerofill = 1;
  985         int count = 0;
  986 
  987         th_act = current_thread();
  988         uth = get_bsdthread_info(th_act);
  989         sel = &uth->uu_select;
  990         retval = (int *)get_bsduthreadrval(th_act);
  991         *retval = 0;
  992 
  993         if (uap->nd < 0) {
  994                 return (EINVAL);
  995         }
  996 
  997         /* select on thread of process that already called proc_exit() */
  998         if (p->p_fd == NULL) {
  999                 return (EBADF);
 1000         }
 1001 
 1002         if (uap->nd > p->p_fd->fd_nfiles)
 1003                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
 1004 
 1005         nw = howmany(uap->nd, NFDBITS);
 1006         ni = nw * sizeof(fd_mask);
 1007 
 1008         /*
 1009          * if the previously allocated space for the bits is smaller than
 1010          * what is requested or no space has yet been allocated for this
 1011          * thread, allocate enough space now.
 1012          *
  1013          * Note: If this allocation fails, select() will return EAGAIN; this
  1014          * is the same thing poll() returns in a no-memory situation, but
 1015          * it is not a POSIX compliant error code for select().
 1016          */
 1017         if (sel->nbytes < (3 * ni)) {
 1018                 int nbytes = 3 * ni;
 1019 
 1020                 /* Free previous allocation, if any */
 1021                 if (sel->ibits != NULL)
 1022                         FREE(sel->ibits, M_TEMP);
 1023                 if (sel->obits != NULL) {
 1024                         FREE(sel->obits, M_TEMP);
 1025                         /* NULL out; subsequent ibits allocation may fail */
 1026                         sel->obits = NULL;
 1027                 }
 1028 
 1029                 MALLOC(sel->ibits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
 1030                 if (sel->ibits == NULL)
 1031                         return (EAGAIN);
 1032                 MALLOC(sel->obits, u_int32_t *, nbytes, M_TEMP, M_WAITOK | M_ZERO);
 1033                 if (sel->obits == NULL) {
 1034                         FREE(sel->ibits, M_TEMP);
 1035                         sel->ibits = NULL;
 1036                         return (EAGAIN);
 1037                 }
 1038                 sel->nbytes = nbytes;
 1039                 needzerofill = 0;
 1040         }
 1041 
 1042         if (needzerofill) {
 1043                 bzero((caddr_t)sel->ibits, sel->nbytes);
 1044                 bzero((caddr_t)sel->obits, sel->nbytes);
 1045         }
 1046 
 1047         /*
 1048          * get the bits from the user address space
 1049          */
 1050 #define getbits(name, x) \
 1051         do { \
 1052                 if (uap->name && (error = copyin(uap->name, \
 1053                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
 1054                         goto continuation; \
 1055         } while (0)
 1056 
 1057         getbits(in, 0);
 1058         getbits(ou, 1);
 1059         getbits(ex, 2);
 1060 #undef  getbits
 1061 
 1062         if (uap->tv) {
 1063                 struct timeval atv;
 1064                 if (IS_64BIT_PROCESS(p)) {
 1065                         struct user64_timeval atv64;
 1066                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
 1067                         /* Loses resolution - assume timeout < 68 years */
 1068                         atv.tv_sec = atv64.tv_sec;
 1069                         atv.tv_usec = atv64.tv_usec;
 1070                 } else {
 1071                         struct user32_timeval atv32;
 1072                         error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
 1073                         atv.tv_sec = atv32.tv_sec;
 1074                         atv.tv_usec = atv32.tv_usec;
 1075                 }
 1076                 if (error)
 1077                         goto continuation;
 1078                 if (itimerfix(&atv)) {
 1079                         error = EINVAL;
 1080                         goto continuation;
 1081                 }
 1082 
 1083                 clock_absolutetime_interval_to_deadline(
 1084                                                                                 tvtoabstime(&atv), &sel->abstime);
 1085         }
 1086         else
 1087                 sel->abstime = 0;
 1088 
 1089         if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) {
 1090                         goto continuation;
 1091         }
 1092 
 1093         sel->count = count;
 1094         size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK);
 1095         if (uth->uu_allocsize) {
 1096                 if (uth->uu_wqset == 0)
 1097                         panic("select: wql memory smashed");
 1098                 /* needed for the select now */
 1099                 if (size > uth->uu_allocsize) {
 1100                         kfree(uth->uu_wqset,  uth->uu_allocsize);
 1101                         uth->uu_allocsize = size;
 1102                         uth->uu_wqset = (wait_queue_set_t)kalloc(size);
 1103                         if (uth->uu_wqset == (wait_queue_set_t)NULL)
 1104                                 panic("failed to allocate memory for waitqueue\n");
 1105                 }
 1106         } else {
 1107                 uth->uu_allocsize = size;
 1108                 uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize);
 1109                 if (uth->uu_wqset == (wait_queue_set_t)NULL)
 1110                         panic("failed to allocate memory for waitqueue\n");
 1111         }
 1112         bzero(uth->uu_wqset, size);
 1113         sel->wql = (char *)uth->uu_wqset + SIZEOF_WAITQUEUE_SET;
 1114         wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
 1115 
 1116 continuation:
 1117 
 1118         if (error) {
 1119                 /*
 1120                  * We have already cleaned up any state we established,
 1121                  * either locally or as a result of selcount().  We don't
 1122                  * need to wait_subqueue_unlink_all(), since we haven't set
 1123                  * anything at this point.
 1124                  */
 1125                 return (error);
 1126         }
 1127 
 1128         return selprocess(0, SEL_FIRSTPASS);
 1129 }
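
/*
 * A minimal userland sketch (illustration, not part of this file): a
 * non-blocking poll via select(2).  Per the comment in selprocess()
 * below, a non-NULL, zero-valued timeval makes the call return after
 * the first scan instead of blocking.
 */
#include <stdio.h>
#include <sys/select.h>
#include <unistd.h>

int main(void)
{
        fd_set rfds;
        struct timeval tv = { 0, 0 };   /* poll: do not block */
        int n;

        FD_ZERO(&rfds);
        FD_SET(STDIN_FILENO, &rfds);
        n = select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv);
        if (n > 0 && FD_ISSET(STDIN_FILENO, &rfds))
                printf("stdin is readable right now\n");
        return 0;
}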
 1130 
 1131 int
 1132 selcontinue(int error)
 1133 {
 1134         return selprocess(error, SEL_SECONDPASS);
 1135 }
 1136 
 1137 
 1138 /*
 1139  * selprocess
 1140  *
 1141  * Parameters:  error                   The error code from our caller
 1142  *              sel_pass                The pass we are on
 1143  */
 1144 int
 1145 selprocess(int error, int sel_pass)
 1146 {
 1147         int ncoll;
 1148         u_int ni, nw;
 1149         thread_t th_act;
 1150         struct uthread  *uth;
 1151         struct proc *p;
 1152         struct select_args *uap;
 1153         int *retval;
 1154         struct _select *sel;
 1155         int unwind = 1;
 1156         int prepost = 0;
 1157         int somewakeup = 0;
 1158         int doretry = 0;
 1159         wait_result_t wait_result;
 1160 
 1161         p = current_proc();
 1162         th_act = current_thread();
 1163         uap = (struct select_args *)get_bsduthreadarg(th_act);
 1164         retval = (int *)get_bsduthreadrval(th_act);
 1165         uth = get_bsdthread_info(th_act);
 1166         sel = &uth->uu_select;
 1167 
 1168         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
 1169                         unwind = 0;
 1170         if (sel->count == 0)
 1171                         unwind = 0;
 1172 retry:
 1173         if (error != 0) {
 1174                 sel_pass = SEL_FIRSTPASS;       /* Reset for seldrop */
 1175                 goto done;
 1176         }
 1177 
 1178         ncoll = nselcoll;
 1179         OSBitOrAtomic(P_SELECT, &p->p_flag);
 1180         /* skip scans if the select is just for timeouts */
 1181         if (sel->count) {
 1182                 /*
 1183                  * Clear out any dangling refs from prior calls; technically
 1184                  * there should not be any.
 1185                  */
 1186                 if (sel_pass == SEL_FIRSTPASS)
 1187                         wait_queue_sub_clearrefs(uth->uu_wqset);
 1188 
 1189                 error = selscan(p, sel, uap->nd, retval, sel_pass, (wait_queue_sub_t)uth->uu_wqset);
 1190                 if (error || *retval) {
 1191                         goto done;
 1192                 }
 1193                 if (prepost) {
  1194                         /* if the select was preposted, we can wake up and discover someone
  1195                         * else already read the data; go to select again if time permits
 1196                         */
 1197                         prepost = 0;
 1198                         doretry = 1;
 1199                 }
 1200                 if (somewakeup) {
 1201                         somewakeup = 0;
 1202                         doretry = 1;
 1203                 }
 1204         }
 1205 
 1206         if (uap->tv) {
 1207                 uint64_t        now;
 1208 
 1209                 clock_get_uptime(&now);
 1210                 if (now >= sel->abstime)
 1211                         goto done;
 1212         }
 1213 
 1214         if (doretry) {
 1215                 /* cleanup obits and try again */
 1216                 doretry = 0;
 1217                 sel_pass = SEL_FIRSTPASS;
 1218                 goto retry;
 1219         }
 1220 
 1221         /*
 1222          * To effect a poll, the timeout argument should be
 1223          * non-nil, pointing to a zero-valued timeval structure.
 1224          */
 1225         if (uap->tv && sel->abstime == 0) {
 1226                 goto done;
 1227         }
 1228 
  1229         /* No spurious wakeups due to collisions, no need to check for them */
  1230         if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
 1231                 sel_pass = SEL_FIRSTPASS;
 1232                 goto retry;
 1233         }
 1234 
 1235         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
 1236 
  1237         /* if the select is just for timeout, skip the check */
  1238         if (sel->count && (sel_pass == SEL_SECONDPASS))
 1239                 panic("selprocess: 2nd pass assertwaiting");
 1240 
 1241         /* Wait Queue Subordinate has waitqueue as first element */
 1242         wait_result = wait_queue_assert_wait((wait_queue_t)uth->uu_wqset,
 1243                                              NULL, THREAD_ABORTSAFE, sel->abstime);
 1244         if (wait_result != THREAD_AWAKENED) {
 1245                 /* there are no preposted events */
 1246                 error = tsleep1(NULL, PSOCK | PCATCH,
 1247                                 "select", 0, selcontinue);
 1248         } else  {
 1249                 prepost = 1;
 1250                 error = 0;
 1251         }
 1252 
 1253         if (error == 0) {
 1254                 sel_pass = SEL_SECONDPASS;
 1255                 if (!prepost)
 1256                         somewakeup = 1;
 1257                 goto retry;
 1258         }
 1259 done:
 1260         if (unwind) {
 1261                 wait_subqueue_unlink_all(uth->uu_wqset);
 1262                 seldrop(p, sel->ibits, uap->nd);
 1263         }
 1264         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
 1265         /* select is not restarted after signals... */
 1266         if (error == ERESTART)
 1267                 error = EINTR;
 1268         if (error == EWOULDBLOCK)
 1269                 error = 0;
 1270         nw = howmany(uap->nd, NFDBITS);
 1271         ni = nw * sizeof(fd_mask);
 1272 
 1273 #define putbits(name, x) \
 1274         do { \
 1275                 if (uap->name && (error2 = \
 1276                         copyout((caddr_t)&sel->obits[(x) * nw], uap->name, ni))) \
 1277                         error = error2; \
 1278         } while (0)
 1279 
 1280         if (error == 0) {
 1281                 int error2;
 1282 
 1283                 putbits(in, 0);
 1284                 putbits(ou, 1);
 1285                 putbits(ex, 2);
 1286 #undef putbits
 1287         }
 1288         return(error);
 1289 }
 1290 
 1291 
 1292 /*
 1293  * selscan
 1294  *
 1295  * Parameters:  p                       Process performing the select
 1296  *              sel                     The per-thread select context structure
 1297  *              nfd                     The number of file descriptors to scan
 1298  *              retval                  The per thread system call return area
 1299  *              sel_pass                Which pass this is; allowed values are
 1300  *                                              SEL_FIRSTPASS and SEL_SECONDPASS
 1301  *              wqsub                   The per thread wait queue set
 1302  *
 1303  * Returns:     0                       Success
 1304  *              EIO                     Invalid p->p_fd field XXX Obsolete?
 1305  *              EBADF                   One of the files in the bit vector is
 1306  *                                              invalid.
 1307  */
 1308 static int
 1309 selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval,
 1310         int sel_pass, wait_queue_sub_t wqsub)
 1311 {
 1312         struct filedesc *fdp = p->p_fd;
 1313         int msk, i, j, fd;
 1314         u_int32_t bits;
 1315         struct fileproc *fp;
 1316         int n = 0;              /* count of bits */
 1317         int nc = 0;             /* bit vector offset (nc'th bit) */
 1318         static int flag[3] = { FREAD, FWRITE, 0 };
 1319         u_int32_t *iptr, *optr;
 1320         u_int nw;
 1321         u_int32_t *ibits, *obits;
 1322         char * wql;
 1323         char * wql_ptr;
 1324         int count;
 1325         struct vfs_context context = *vfs_context_current();
 1326 
 1327         /*
  1328          * Problems seen at reboot, due to Mac OS X signal problems
  1329          * in Beaker1C; verify that p->p_fd is valid
 1330          */
 1331         if (fdp == NULL) {
  1332                 *retval = 0;
 1333                 return(EIO);
 1334         }
 1335         ibits = sel->ibits;
 1336         obits = sel->obits;
 1337         wql = sel->wql;
 1338 
 1339         nw = howmany(nfd, NFDBITS);
 1340 
 1341         count = sel->count;
 1342 
 1343         nc = 0;
 1344         if (count) {
 1345                 proc_fdlock(p);
 1346                 for (msk = 0; msk < 3; msk++) {
 1347                         iptr = (u_int32_t *)&ibits[msk * nw];
 1348                         optr = (u_int32_t *)&obits[msk * nw];
 1349 
 1350                         for (i = 0; i < nfd; i += NFDBITS) {
 1351                                 bits = iptr[i/NFDBITS];
 1352 
 1353                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1354                                         bits &= ~(1 << j);
 1355                                         fp = fdp->fd_ofiles[fd];
 1356 
 1357                                         if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1358                                                 /*
 1359                                                  * If we abort because of a bad
 1360                                                  * fd, let the caller unwind...
 1361                                                  */
 1362                                                 proc_fdunlock(p);
 1363                                                 return(EBADF);
 1364                                         }
 1365                                         if (sel_pass == SEL_SECONDPASS) {
 1366                                                 wql_ptr = (char *)0;
 1367                                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) {
 1368                                                         fp->f_flags &= ~FP_INSELECT;
 1369                                                         fp->f_waddr = (void *)0;
 1370                                                 }
 1371                                         } else {
 1372                                                 wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK);
 1373                                                 if (fp->f_flags & FP_INSELECT) {
 1374                                                         /* someone is already in select on this fp */
 1375                                                         fp->f_flags |= FP_SELCONFLICT;
 1376                                                         wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub);
 1377                                                 } else {
 1378                                                         fp->f_flags |= FP_INSELECT;
 1379                                                         fp->f_waddr = (void *)wqsub;
 1380                                                 }
 1381                                         }
 1382 
 1383                                         context.vc_ucred = fp->f_cred;
 1384 
 1385                                         /* The select; set the bit, if true */
 1386                                         if (fp->f_ops
 1387                                                 && fo_select(fp, flag[msk], wql_ptr, &context)) {
 1388                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
 1389                                                 n++;
 1390                                         }
 1391                                         nc++;
 1392                                 }
 1393                         }
 1394                 }
 1395                 proc_fdunlock(p);
 1396         }
 1397         *retval = n;
 1398         return (0);
 1399 }
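
/*
 * A sketch of the bit-vector layout selscan() walks (hypothetical
 * helpers, not part of this file): three fd_set-style vectors (read,
 * write, except) are stored back to back in ibits/obits, each
 * nw = howmany(nfd, NFDBITS) words long.
 */
#include <sys/types.h>

#define DEMO_NFDBITS    (sizeof(u_int32_t) * 8)         /* bits per word */

/* word count covering nfd descriptors: the howmany(nfd, NFDBITS) above */
static inline u_int
demo_howmany(u_int nfd)
{
        return (nfd + DEMO_NFDBITS - 1) / DEMO_NFDBITS;
}

/* test fd in mask msk (0 = read, 1 = write, 2 = except) */
static inline int
demo_isset(const u_int32_t *ibits, u_int nw, int msk, int fd)
{
        const u_int32_t *iptr = &ibits[msk * nw];

        return (iptr[fd / DEMO_NFDBITS] >> (fd % DEMO_NFDBITS)) & 1;
}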
 1400 
 1401 int poll_callback(struct kqueue *, struct kevent64_s *, void *);
 1402 
 1403 struct poll_continue_args {
 1404         user_addr_t pca_fds;
 1405         u_int pca_nfds;
 1406         u_int pca_rfds;
 1407 };
 1408 
 1409 int
 1410 poll(struct proc *p, struct poll_args *uap, int32_t *retval)
 1411 {
 1412         __pthread_testcancel(1);
 1413         return(poll_nocancel(p, (struct poll_nocancel_args *)uap, retval));
 1414 }
 1415 
 1416 
 1417 int
 1418 poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
 1419 {
 1420         struct poll_continue_args *cont;
 1421         struct pollfd *fds;
 1422         struct kqueue *kq;
 1423         struct timeval atv;
 1424         int ncoll, error = 0;
 1425         u_int nfds = uap->nfds;
 1426         u_int rfds = 0;
 1427         u_int i;
 1428         size_t ni;
 1429 
 1430         /*
 1431          * This is kinda bogus.  We have fd limits, but that is not
 1432          * really related to the size of the pollfd array.  Make sure
 1433          * we let the process use at least FD_SETSIZE entries and at
 1434          * least enough for the current limits.  We want to be reasonably
 1435          * safe, but not overly restrictive.
 1436          */
 1437         if (nfds > OPEN_MAX ||
 1438             (nfds > p->p_rlimit[RLIMIT_NOFILE].rlim_cur && (proc_suser(p) || nfds > FD_SETSIZE)))
 1439                 return (EINVAL);
 1440 
 1441         kq = kqueue_alloc(p);
 1442         if (kq == NULL)
 1443                 return (EAGAIN);
 1444 
 1445         ni = nfds * sizeof(struct pollfd) + sizeof(struct poll_continue_args);
 1446         MALLOC(cont, struct poll_continue_args *, ni, M_TEMP, M_WAITOK);
 1447         if (NULL == cont) {
 1448                 error = EAGAIN;
 1449                 goto out;
 1450         }
 1451         
 1452         fds = (struct pollfd *)&cont[1];
 1453         error = copyin(uap->fds, fds, nfds * sizeof(struct pollfd));
 1454         if (error)
 1455                 goto out;
 1456 
 1457         if (uap->timeout != -1) {
 1458                 struct timeval rtv;
 1459 
 1460                 atv.tv_sec = uap->timeout / 1000;
 1461                 atv.tv_usec = (uap->timeout % 1000) * 1000;
 1462                 if (itimerfix(&atv)) {
 1463                         error = EINVAL;
 1464                         goto out;
 1465                 }
 1466                 getmicrouptime(&rtv);
 1467                 timevaladd(&atv, &rtv);
 1468         } else {
 1469                 atv.tv_sec = 0;
 1470                 atv.tv_usec = 0;
 1471         }
 1472 
 1473         /* JMM - all this P_SELECT stuff is bogus */
 1474         ncoll = nselcoll;
 1475         OSBitOrAtomic(P_SELECT, &p->p_flag);
 1476         for (i = 0; i < nfds; i++) {
 1477                 short events = fds[i].events;
 1478                 struct kevent64_s kev;
 1479                 int kerror = 0;
 1480 
 1481                 /* per spec, ignore fd values below zero */
 1482                 if (fds[i].fd < 0) {
 1483                         fds[i].revents = 0;
 1484                         continue;
 1485                 }
 1486 
 1487                 /* convert the poll event into a kqueue kevent */
 1488                 kev.ident = fds[i].fd;
 1489                 kev.flags = EV_ADD | EV_ONESHOT | EV_POLL;
 1490                 kev.udata = CAST_USER_ADDR_T(&fds[i]);
 1491                 kev.fflags = 0;
 1492                 kev.data = 0;
 1493                 kev.ext[0] = 0;
 1494                 kev.ext[1] = 0;
 1495 
 1496                 /* Handle input events */
 1497                 if (events & ( POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND | POLLHUP )) {
 1498                         kev.filter = EVFILT_READ;
 1499                         if (!(events & ( POLLIN | POLLRDNORM )))
 1500                                 kev.flags |= EV_OOBAND;
 1501                         kerror = kevent_register(kq, &kev, p);
 1502                 }
 1503 
 1504                 /* Handle output events */
 1505                 if (kerror == 0 &&
 1506                     events & ( POLLOUT | POLLWRNORM | POLLWRBAND )) {
 1507                         kev.filter = EVFILT_WRITE;
 1508                         kerror = kevent_register(kq, &kev, p);
 1509                 }
 1510 
 1511                 /* Handle BSD extension vnode events */
 1512                 if (kerror == 0 &&
 1513                     events & ( POLLEXTEND | POLLATTRIB | POLLNLINK | POLLWRITE )) {
 1514                         kev.filter = EVFILT_VNODE;
 1515                         kev.fflags = 0;
 1516                         if (events & POLLEXTEND)
 1517                                 kev.fflags |= NOTE_EXTEND;
 1518                         if (events & POLLATTRIB)
 1519                                 kev.fflags |= NOTE_ATTRIB;
 1520                         if (events & POLLNLINK)
 1521                                 kev.fflags |= NOTE_LINK;
 1522                         if (events & POLLWRITE)
 1523                                 kev.fflags |= NOTE_WRITE;
 1524                         kerror = kevent_register(kq, &kev, p);
 1525                 }
 1526 
 1527                 if (kerror != 0) {
 1528                         fds[i].revents = POLLNVAL;
 1529                         rfds++;
 1530                 } else
 1531                         fds[i].revents = 0;
 1532         }
 1533 
 1534         /* Did we have any trouble registering? */
 1535         if (rfds > 0)
 1536                 goto done;
 1537 
 1538         /* scan for, and possibly wait for, the kevents to trigger */
 1539         cont->pca_fds = uap->fds;
 1540         cont->pca_nfds = nfds;
 1541         cont->pca_rfds = rfds;
 1542         error = kqueue_scan(kq, poll_callback, NULL, cont, &atv, p);
 1543         rfds = cont->pca_rfds;
 1544 
 1545  done:
 1546         OSBitAndAtomic(~((uint32_t)P_SELECT), &p->p_flag);
 1547         /* poll is not restarted after signals... */
 1548         if (error == ERESTART)
 1549                 error = EINTR;
 1550         if (error == EWOULDBLOCK)
 1551                 error = 0;
 1552         if (error == 0) {
 1553                 error = copyout(fds, uap->fds, nfds * sizeof(struct pollfd));
 1554                 *retval = rfds;
 1555         }
 1556 
 1557  out:
 1558         if (NULL != cont)
 1559                 FREE(cont, M_TEMP);
 1560 
 1561         kqueue_dealloc(kq);
 1562         return (error);
 1563 }
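
/*
 * (Editor's example; not part of the original file.)  User-space
 * poll(2) as serviced by poll_nocancel() above.  The timeout is a
 * plain millisecond count; the kernel splits it into seconds and
 * microseconds (timeout / 1000 and (timeout % 1000) * 1000).
 * 'sock_fd' is a hypothetical descriptor.
 */
#if 0
        struct pollfd pfd;

        pfd.fd = sock_fd;
        pfd.events = POLLIN | POLLOUT;
        pfd.revents = 0;

        switch (poll(&pfd, 1, 2500)) {          /* 2500 ms = 2.5 s */
        case -1:                                /* error */
        case 0:                                 /* timed out */
                break;
        default:
                if (pfd.revents & POLLIN)
                        ;                       /* readable */
                if (pfd.revents & POLLOUT)
                        ;                       /* writable */
                break;
        }
#endif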
 1564 
 1565 int
 1566 poll_callback(__unused struct kqueue *kq, struct kevent64_s *kevp, void *data)
 1567 {
 1568         struct poll_continue_args *cont = (struct poll_continue_args *)data;
 1569         struct pollfd *fds = CAST_DOWN(struct pollfd *, kevp->udata);
 1570         short mask;
 1571 
 1572         /* convert the results back into revents */
 1573         if (kevp->flags & EV_EOF)
 1574                 fds->revents |= POLLHUP;
 1575         if (kevp->flags & EV_ERROR)
 1576                 fds->revents |= POLLERR;
 1577 
 1578         switch (kevp->filter) {
 1579         case EVFILT_READ:
 1580                 if (fds->revents & POLLHUP)
 1581                         mask = (POLLIN | POLLRDNORM | POLLPRI | POLLRDBAND );
 1582                 else {
 1583                         mask = 0;
 1584                         if (kevp->data != 0)
 1585                                 mask |= (POLLIN | POLLRDNORM );
 1586                         if (kevp->flags & EV_OOBAND)
 1587                                 mask |= ( POLLPRI | POLLRDBAND );
 1588                 }
 1589                 fds->revents |= (fds->events & mask);
 1590                 break;
 1591 
 1592         case EVFILT_WRITE:
 1593                 if (!(fds->revents & POLLHUP))
 1594                         fds->revents |= (fds->events & ( POLLOUT | POLLWRNORM | POLLWRBAND ));
 1595                 break;
 1596 
 1597         case EVFILT_VNODE:
 1598                 if (kevp->fflags & NOTE_EXTEND)
 1599                         fds->revents |= (fds->events & POLLEXTEND);
 1600                 if (kevp->fflags & NOTE_ATTRIB)
 1601                         fds->revents |= (fds->events & POLLATTRIB);
 1602                 if (kevp->fflags & NOTE_LINK)
 1603                         fds->revents |= (fds->events & POLLNLINK);
 1604                 if (kevp->fflags & NOTE_WRITE)
 1605                         fds->revents |= (fds->events & POLLWRITE);
 1606                 break;
 1607         }
 1608 
 1609         if (fds->revents)
 1610                 cont->pca_rfds++;
 1611 
 1612         return 0;
 1613 }
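
/*
 * (Editor's example; not part of the original file.)  Because the
 * EV_EOF/EV_ERROR translation above sets POLLHUP/POLLERR without
 * consulting the requested events, a caller must be prepared for
 * them even when it only asked for POLLIN:
 */
#if 0
        if (pfd.revents & (POLLERR | POLLNVAL))
                ;       /* error on the descriptor, or a bad fd */
        else if (pfd.revents & POLLHUP)
                ;       /* peer closed; drain any buffered data */
#endif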
 1614         
 1615 int
 1616 seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p)
 1617 {
 1618 
 1619         return (1);
 1620 }
 1621 
 1622 /*
 1623  * selcount
 1624  *
 1625  * Count the number of bits set in the input bit vector, and establish an
 1626  * outstanding fp->f_iocount for each of the descriptors which will be in
 1627  * use in the select operation.
 1628  *
 1629  * Parameters:  p                       The process doing the select
 1630  *              ibits                   The input bit vector
 1631  *              nfd                     The number of fd's in the vector
 1632  *              countp                  Pointer to where to store the bit count
 1633  *
 1634  * Returns:     0                       Success
 1635  *              EIO                     Bad per process open file table
 1636  *              EBADF                   One of the bits in the input bit vector
 1637  *                                              references an invalid fd
 1638  *
 1639  * Implicit:    *countp (modified)      Count of fd's
 1640  *
 1641  * Notes:       This function is the first pass under the proc_fdlock() that
 1642  *              permits us to recognize invalid descriptors in the bit vector;
 1643  *              they may, however, not remain valid through the drop and
 1644  *              later reacquisition of the proc_fdlock().
 1645  */
 1646 static int
 1647 selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp)
 1648 {
 1649         struct filedesc *fdp = p->p_fd;
 1650         int msk, i, j, fd;
 1651         u_int32_t bits;
 1652         struct fileproc *fp;
 1653         int n = 0;
 1654         u_int32_t *iptr;
 1655         u_int nw;
 1656         int error=0; 
 1657         int dropcount;
 1658         int need_wakeup = 0;
 1659 
 1660         /*
 1661          * Problems seen during reboot, due to Mac OS X signal problems
 1662          * in Beaker1C; verify that p->p_fd is valid
 1663          */
 1664         if (fdp == NULL) {
 1665                 *countp = 0;
 1666                 return(EIO);
 1667         }
 1668         nw = howmany(nfd, NFDBITS);
 1669 
 1670         proc_fdlock(p);
 1671         for (msk = 0; msk < 3; msk++) {
 1672                 iptr = (u_int32_t *)&ibits[msk * nw];
 1673                 for (i = 0; i < nfd; i += NFDBITS) {
 1674                         bits = iptr[i/NFDBITS];
 1675                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1676                                 bits &= ~(1 << j);
 1677                                 fp = fdp->fd_ofiles[fd];
 1678                                 if (fp == NULL ||
 1679                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1680                                                 *countp = 0;
 1681                                                 error = EBADF;
 1682                                                 goto bad;
 1683                                 }
 1684                                 fp->f_iocount++;
 1685                                 n++;
 1686                         }
 1687                 }
 1688         }
 1689         proc_fdunlock(p);
 1690 
 1691         *countp = n;
 1692         return (0);
 1693 
 1694 bad:
 1695         dropcount = 0;
 1696         
 1697         if (n == 0)
 1698                 goto out;
 1699         /* Ignore error return; it's already EBADF */
 1700         (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1);
 1701 
 1702 out:
 1703         proc_fdunlock(p);
 1704         if (need_wakeup) {
 1705                 wakeup(&p->p_fpdrainwait);
 1706         }
 1707         return(error);
 1708 }
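
/*
 * (Editor's example; not part of the original file.)  The ffs() loop
 * used by selcount()/selscan()/seldrop_locked() above, reduced to a
 * stand-alone sketch.  ffs() returns the 1-based index of the lowest
 * set bit (0 when none remain), so decrementing gives the bit number,
 * which is then cleared before the next iteration.
 */
#if 0
        u_int32_t bits = 0x29;          /* bits 0, 3 and 5 set */
        int j;

        while ((j = ffs(bits)) != 0) {
                j--;                    /* convert to a 0-based bit number */
                bits &= ~(1U << j);
                printf("bit %d is set\n", j);   /* prints 0, 3, 5 */
        }
#endif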
 1709 
 1710 
 1711 /*
 1712  * seldrop_locked
 1713  *
 1714  * Drop outstanding wait queue references set up during selscan(); drop the
 1715  * outstanding per fileproc f_iocount() picked up during the selcount().
 1716  *
 1717  * Parameters:  p                       Process performing the select
 1718  *              ibits                   Input bit vector of fd's
 1719  *              nfd                     Number of fd's
 1720  *              lim                     Limit to number of vector entries to
 1721  *                                              consider, or -1 for "all"
 1722  *              fromselcount            True if called from selcount()
 1723  *              need_wakeup             Pointer to flag to set to do a wakeup
 1724  *                                      if f_iocount on any descriptor goes to 0
 1725  *
 1726  * Returns:     0                       Success
 1727  *              EBADF                   One or more fds in the bit vector
 1728  *                                              were invalid, but the rest
 1729  *                                              were successfully dropped
 1730  *
 1731  * Notes:       An fd may become bad while the proc_fdlock() is not held,
 1732  *              if a multithreaded application closes the fd out from under
 1733  *              the in progress select.  In this case, we still have to
 1734  *              clean up after the set up on the remaining fds.
 1735  */
 1736 static int
 1737 seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount)
 1738 {
 1739         struct filedesc *fdp = p->p_fd;
 1740         int msk, i, j, fd;
 1741         u_int32_t bits;
 1742         struct fileproc *fp;
 1743         u_int32_t *iptr;
 1744         u_int nw;
 1745         int error = 0;
 1746         int dropcount = 0;
 1747         uthread_t uth = get_bsdthread_info(current_thread());
 1748 
 1749         *need_wakeup = 0;
 1750 
 1751         /*
 1752          * Problems seen during reboot, due to Mac OS X signal problems
 1753          * in Beaker1C; verify that p->p_fd is valid
 1754          */
 1755         if (fdp == NULL) {
 1756                 return(EIO);
 1757         }
 1758 
 1759         nw = howmany(nfd, NFDBITS);
 1760 
 1761         for (msk = 0; msk < 3; msk++) {
 1762                 iptr = (u_int32_t *)&ibits[msk * nw];
 1763                 for (i = 0; i < nfd; i += NFDBITS) {
 1764                         bits = iptr[i/NFDBITS];
 1765                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1766                                 bits &= ~(1 << j);
 1767                                 fp = fdp->fd_ofiles[fd];
 1768                                 /*
 1769                                  * If we've already dropped as many as were
 1770                                  * counted/scanned, then we are done.  
 1771                                  */
 1772                                 if ((fromselcount != 0) && (++dropcount > lim))
 1773                                         goto done;
 1774 
 1775                                 if (fp == NULL) {
 1776                                         /* skip (now) bad fds */
 1777                                         error = EBADF;
 1778                                         continue;
 1779                                 }
 1780                                 /*
 1781                                  * Only clear the flag if we set it.  We'll
 1782                                  * only find that we set it if we had made
 1783                                  * at least one [partial] pass through selscan().
 1784                                  */
 1785                                 if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) {
 1786                                         fp->f_flags &= ~FP_INSELECT;
 1787                                         fp->f_waddr = (void *)0;
 1788                                 }
 1789 
 1790                                 fp->f_iocount--;
 1791                                 if (fp->f_iocount < 0)
 1792                                         panic("f_iocount overdecrement!");
 1793 
 1794                                 if (fp->f_iocount == 0) {
 1795                                         /*
 1796                                          * The last iocount is responsible for clearing
 1797                                          * selconflict flag - even if we didn't set it -
 1798                                          * and is also responsible for waking up anyone
 1799                                          * waiting on iocounts to drain.
 1800                                          */
 1801                                         if (fp->f_flags & FP_SELCONFLICT)
 1802                                                 fp->f_flags &= ~FP_SELCONFLICT;
 1803                                         if (p->p_fpdrainwait) {
 1804                                                 p->p_fpdrainwait = 0;
 1805                                                 *need_wakeup = 1;
 1806                                         }
 1807                                 }
 1808                         }
 1809                 }
 1810         }
 1811 done:
 1812         return (error);
 1813 }
 1814 
 1815 
 1816 static int
 1817 seldrop(struct proc *p, u_int32_t *ibits, int nfd)
 1818 {
 1819         int error;
 1820         int need_wakeup = 0;
 1821 
 1822         proc_fdlock(p);
 1823         error =  seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0);
 1824         proc_fdunlock(p);
 1825         if (need_wakeup) {
 1826                 wakeup(&p->p_fpdrainwait);
 1827         }
 1828         return (error);
 1829 }
 1830 
 1831 /*
 1832  * Record a select request.
 1833  */
 1834 void
 1835 selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql)
 1836 {
 1837         thread_t        cur_act = current_thread();
 1838         struct uthread * ut = get_bsdthread_info(cur_act);
 1839 
 1840         /* need to look at collisions */
 1841 
 1842         /* do not record if this is the second pass of select */
 1843         if (p_wql == (void *)0) {
 1844                 return;
 1845         }
 1846 
 1847         if ((sip->si_flags & SI_INITED) == 0) {
 1848                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
 1849                 sip->si_flags |= SI_INITED;
 1850                 sip->si_flags &= ~SI_CLEAR;
 1851         }
 1852 
 1853         if (sip->si_flags & SI_RECORDED) {
 1854                 sip->si_flags |= SI_COLL;
 1855         } else
 1856                 sip->si_flags &= ~SI_COLL;
 1857 
 1858         sip->si_flags |= SI_RECORDED;
 1859         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqset))
 1860                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqset,
 1861                                         (wait_queue_link_t)p_wql);
 1862 
 1863         return;
 1864 }
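
/*
 * (Editor's example; not part of the original file.)  The classic
 * pairing of selrecord() with selwakeup() as a driver might use it;
 * the softc layout and the 'mydev_*' names are hypothetical.
 */
#if 0
struct mydev_softc {
        struct selinfo  sc_selinfo;
        int             sc_ready;
};

static int
mydev_select(struct mydev_softc *sc, struct proc *p, void *wql)
{
        if (sc->sc_ready)
                return (1);             /* ready right now; no waiting */
        selrecord(p, &sc->sc_selinfo, wql);     /* remember this waiter */
        return (0);
}

static void
mydev_input(struct mydev_softc *sc)
{
        sc->sc_ready = 1;
        selwakeup(&sc->sc_selinfo);     /* wake any recorded waiters */
}
#endif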
 1865 
 1866 void
 1867 selwakeup(struct selinfo *sip)
 1868 {
 1869         
 1870         if ((sip->si_flags & SI_INITED) == 0) {
 1871                 return;
 1872         }
 1873 
 1874         if (sip->si_flags & SI_COLL) {
 1875                 nselcoll++;
 1876                 sip->si_flags &= ~SI_COLL;
 1877 #if 0
 1878                 /* will not support */
 1879                 //wakeup((caddr_t)&selwait);
 1880 #endif
 1881         }
 1882 
 1883         if (sip->si_flags & SI_RECORDED) {
 1884                 wait_queue_wakeup_all(&sip->si_wait_queue, NULL, THREAD_AWAKENED);
 1885                 sip->si_flags &= ~SI_RECORDED;
 1886         }
 1887 
 1888 }
 1889 
 1890 void 
 1891 selthreadclear(struct selinfo *sip)
 1892 {
 1893 
 1894         if ((sip->si_flags & SI_INITED) == 0) {
 1895                 return;
 1896         }
 1897         if (sip->si_flags & SI_RECORDED) {
 1898                         selwakeup(sip);
 1899                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
 1900         }
 1901         sip->si_flags |= SI_CLEAR;
 1902         wait_queue_unlink_all(&sip->si_wait_queue);
 1903 }
 1904 
 1905 
 1906 
 1907 
 1908 #define DBG_POST        0x10
 1909 #define DBG_WATCH       0x11
 1910 #define DBG_WAIT        0x12
 1911 #define DBG_MOD         0x13
 1912 #define DBG_EWAKEUP     0x14
 1913 #define DBG_ENQUEUE     0x15
 1914 #define DBG_DEQUEUE     0x16
 1915 
 1916 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
 1917 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
 1918 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
 1919 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
 1920 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
 1921 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
 1922 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
 1923 
 1924 
 1925 #define EVPROCDEQUE(p, evq)     do {                            \
 1926         proc_lock(p);                                           \
 1927         if (evq->ee_flags & EV_QUEUED) {                        \
 1928                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);      \
 1929                 evq->ee_flags &= ~EV_QUEUED;                    \
 1930         }                                                       \
 1931         proc_unlock(p);                                         \
 1932 } while (0)
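
/*
 * (Editor's note.)  The do { ... } while (0) wrapper, with no
 * trailing semicolon, makes the macro expand to a single statement,
 * so it composes safely with an unbraced if/else at the call site.
 * 'cond' and 'other_work' below are hypothetical:
 */
#if 0
        if (cond)
                EVPROCDEQUE(p, evq);
        else
                other_work();
#endif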
 1933 
 1934 
 1935 /*
 1936  * called upon socket close. dequeue and free all events for
 1937  * the socket...  socket must be locked by the caller.
 1938  */
 1939 void
 1940 evsofree(struct socket *sp)
 1941 {
 1942         struct eventqelt *evq, *next;
 1943         proc_t  p;
 1944 
 1945         if (sp == NULL)
 1946                 return;
 1947 
 1948         for (evq = sp->so_evlist.tqh_first; evq != NULL; evq = next) {
 1949                 next = evq->ee_slist.tqe_next;
 1950                 p = evq->ee_proc;
 1951 
 1952                 if (evq->ee_flags & EV_QUEUED) {
 1953                         EVPROCDEQUE(p, evq);
 1954                 }
 1955                 TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist); // remove from socket q
 1956                 FREE(evq, M_TEMP);
 1957         }
 1958 }
 1959 
 1960 
 1961 /*
 1962  * called upon pipe close. dequeue and free all events for
 1963  * the pipe... pipe must be locked by the caller
 1964  */
 1965 void
 1966 evpipefree(struct pipe *cpipe)
 1967 {
 1968         struct eventqelt *evq, *next;
 1969         proc_t  p;
 1970 
 1971         for (evq = cpipe->pipe_evlist.tqh_first; evq != NULL; evq = next) {
 1972                 next = evq->ee_slist.tqe_next;
 1973                 p = evq->ee_proc;
 1974 
 1975                 EVPROCDEQUE(p, evq);
 1976 
 1977                 TAILQ_REMOVE(&cpipe->pipe_evlist, evq, ee_slist); // remove from pipe q
 1978                 FREE(evq, M_TEMP);
 1979         }
 1980 }
 1981 
 1982 
 1983 /*
 1984  * enqueue this event if it's not already queued. wakeup
 1985  * the proc if we do queue this event to it...
 1986  * the proc lock is taken and dropped internally; the
 1987  * wakeup is issued after the lock has been released
 1988  */
 1989 static void
 1990 evprocenque(struct eventqelt *evq)
 1991 {
 1992         proc_t  p;
 1993 
 1994         assert(evq);
 1995         p = evq->ee_proc;
 1996 
 1997         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, (uint32_t)evq, evq->ee_flags, evq->ee_eventmask,0,0);
 1998 
 1999         proc_lock(p);
 2000 
 2001         if (evq->ee_flags & EV_QUEUED) {
 2002                 proc_unlock(p);
 2003 
 2004                 KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 2005                 return;
 2006         }
 2007         evq->ee_flags |= EV_QUEUED;
 2008 
 2009         TAILQ_INSERT_TAIL(&p->p_evlist, evq, ee_plist);
 2010 
 2011         proc_unlock(p);
 2012 
 2013         wakeup(&p->p_evlist);
 2014 
 2015         KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 2016 }
 2017 
 2018 
 2019 /*
 2020  * pipe lock must be taken by the caller
 2021  */
 2022 void
 2023 postpipeevent(struct pipe *pipep, int event)
 2024 {
 2025         int     mask;
 2026         struct eventqelt *evq;
 2027 
 2028         if (pipep == NULL)
 2029                 return;
 2030         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,1,0);
 2031 
 2032         for (evq = pipep->pipe_evlist.tqh_first;
 2033              evq != NULL; evq = evq->ee_slist.tqe_next) {
 2034 
 2035                 if (evq->ee_eventmask == 0)
 2036                         continue;
 2037                 mask = 0;
 2038 
 2039                 switch (event & (EV_RWBYTES | EV_RCLOSED | EV_WCLOSED)) {
 2040 
 2041                 case EV_RWBYTES:
 2042                   if ((evq->ee_eventmask & EV_RE) && pipep->pipe_buffer.cnt) {
 2043                           mask |= EV_RE;
 2044                           evq->ee_req.er_rcnt = pipep->pipe_buffer.cnt;
 2045                   }
 2046                   if ((evq->ee_eventmask & EV_WR) && 
 2047                       (pipep->pipe_buffer.size - pipep->pipe_buffer.cnt) >= PIPE_BUF) {
 2048 
 2049                           if (pipep->pipe_state & PIPE_EOF) {
 2050                                   mask |= EV_WR|EV_RESET;
 2051                                   break;
 2052                           }
 2053                           mask |= EV_WR;
 2054                           evq->ee_req.er_wcnt = pipep->pipe_buffer.size - pipep->pipe_buffer.cnt;
 2055                   }
 2056                   break;
 2057 
 2058                 case EV_WCLOSED:
 2059                 case EV_RCLOSED:
 2060                   if ((evq->ee_eventmask & EV_RE)) {
 2061                           mask |= EV_RE|EV_RCLOSED;
 2062                   }
 2063                   if ((evq->ee_eventmask & EV_WR)) {
 2064                           mask |= EV_WR|EV_WCLOSED;
 2065                   }
 2066                   break;
 2067 
 2068                 default:
 2069                   return;
 2070                 }
 2071                 if (mask) {
 2072                         /*
 2073                          * disarm... postevents are nops until this event is 'read' via
 2074                          * waitevent and then re-armed via modwatch
 2075                          */
 2076                         evq->ee_eventmask = 0;
 2077 
 2078                         /*
 2079                          * since events are disarmed until after the waitevent
 2080                          * the ee_req.er_xxxx fields can't change once we've
 2081                          * inserted this event into the proc queue...
 2082                          * therefore, the waitevent will see a 'consistent'
 2083                          * snapshot of the event, even though it won't hold
 2084                          * the pipe lock, and we're updating the event outside
 2085                          * of the proc lock, which it will hold
 2086                          */
 2087                         evq->ee_req.er_eventbits |= mask;
 2088 
 2089                         KERNEL_DEBUG(DBG_MISC_POST, (uint32_t)evq, evq->ee_req.er_eventbits, mask, 1,0);
 2090 
 2091                         evprocenque(evq);
 2092                 }
 2093         }
 2094         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,1,0);
 2095 }
 2096 
 2097 #if SOCKETS
 2098 /*
 2099  * given either a sockbuf or a socket run down the
 2100  * event list and queue ready events found...
 2101  * the socket must be locked by the caller
 2102  */
 2103 void
 2104 postevent(struct socket *sp, struct sockbuf *sb, int event)
 2105 {
 2106         int     mask;
 2107         struct  eventqelt *evq;
 2108         struct  tcpcb *tp;
 2109 
 2110         if (sb)
 2111                 sp = sb->sb_so;
 2112         if (sp == NULL)
 2113                 return;
 2114 
 2115         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, (int)sp, event, 0, 0, 0);
 2116 
 2117         for (evq = sp->so_evlist.tqh_first;
 2118              evq != NULL; evq = evq->ee_slist.tqe_next) {
 2119 
 2120                 if (evq->ee_eventmask == 0)
 2121                         continue;
 2122                 mask = 0;
 2123 
 2124                 /* ready for reading:
 2125                    - byte cnt >= receive low water mark
 2126                    - read-half of conn closed
 2127                    - conn pending for listening sock
 2128                    - socket error pending
 2129 
 2130                    ready for writing
 2131                    - byte cnt avail >= send low water mark
 2132                    - write half of conn closed
 2133                    - socket error pending
 2134                    - non-blocking conn completed successfully
 2135 
 2136                    exception pending
 2137                    - out of band data
 2138                    - sock at out of band mark
 2139                 */
 2140 
 2141                 switch (event & EV_DMASK) {
 2142 
 2143                 case EV_OOB:
 2144                   if ((evq->ee_eventmask & EV_EX)) {
 2145                           if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
 2146                                   mask |= EV_EX|EV_OOB;
 2147                   }
 2148                   break;
 2149 
 2150                 case EV_RWBYTES|EV_OOB:
 2151                   if ((evq->ee_eventmask & EV_EX)) {
 2152                           if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK)))
 2153                                   mask |= EV_EX|EV_OOB;
 2154                   }
 2155                   /*
 2156                    * fall into the next case
 2157                    */
 2158                 case EV_RWBYTES:
 2159                   if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
 2160                           if (sp->so_error) {
 2161                                   if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
 2162                                           if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
 2163                                               (tp->t_state == TCPS_CLOSED)) {
 2164                                                   mask |= EV_RE|EV_RESET;
 2165                                                   break;
 2166                                           }
 2167                                   }
 2168                           }
 2169                           mask |= EV_RE;
 2170                           evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
 2171 
 2172                           if (sp->so_state & SS_CANTRCVMORE) {
 2173                                   mask |= EV_FIN;
 2174                                   break;
 2175                           }
 2176                   }
 2177                   if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
 2178                           if (sp->so_error) {
 2179                                   if ((sp->so_type == SOCK_STREAM) && ((sp->so_error == ECONNREFUSED) || (sp->so_error == ECONNRESET))) {
 2180                                           if ((sp->so_pcb == 0) || (((struct inpcb *)sp->so_pcb)->inp_state == INPCB_STATE_DEAD) || !(tp = sototcpcb(sp)) ||
 2181                                               (tp->t_state == TCPS_CLOSED)) {
 2182                                                   mask |= EV_WR|EV_RESET;
 2183                                                   break;
 2184                                           }
 2185                                   }
 2186                           }
 2187                           mask |= EV_WR;
 2188                           evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
 2189                   }
 2190                   break;
 2191 
 2192                 case EV_RCONN:
 2193                   if ((evq->ee_eventmask & EV_RE)) {
 2194                           mask |= EV_RE|EV_RCONN;
 2195                           evq->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
 2196                   }
 2197                   break;
 2198 
 2199                 case EV_WCONN:
 2200                   if ((evq->ee_eventmask & EV_WR)) {
 2201                           mask |= EV_WR|EV_WCONN;
 2202                   }
 2203                   break;
 2204 
 2205                 case EV_RCLOSED:
 2206                   if ((evq->ee_eventmask & EV_RE)) {
 2207                           mask |= EV_RE|EV_RCLOSED;
 2208                   }
 2209                   break;
 2210 
 2211                 case EV_WCLOSED:
 2212                   if ((evq->ee_eventmask & EV_WR)) {
 2213                           mask |= EV_WR|EV_WCLOSED;
 2214                   }
 2215                   break;
 2216 
 2217                 case EV_FIN:
 2218                   if (evq->ee_eventmask & EV_RE) {
 2219                           mask |= EV_RE|EV_FIN;
 2220                   }
 2221                   break;
 2222 
 2223                 case EV_RESET:
 2224                 case EV_TIMEOUT:
 2225                   if (evq->ee_eventmask & EV_RE) {
 2226                           mask |= EV_RE | event;
 2227                   } 
 2228                   if (evq->ee_eventmask & EV_WR) {
 2229                           mask |= EV_WR | event;
 2230                   }
 2231                   break;
 2232 
 2233                 default:
 2234                   KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, -1, 0, 0, 0);
 2235                   return;
 2236                 } /* switch */
 2237 
 2238                 KERNEL_DEBUG(DBG_MISC_POST, (int)evq, evq->ee_eventmask, evq->ee_req.er_eventbits, mask, 0);
 2239 
 2240                 if (mask) {
 2241                         /*
 2242                          * disarm... postevents are nops until this event is 'read' via
 2243                          * waitevent and then re-armed via modwatch
 2244                          */
 2245                         evq->ee_eventmask = 0;
 2246 
 2247                         /*
 2248                          * since events are disarmed until after the waitevent
 2249                          * the ee_req.er_xxxx fields can't change once we've
 2250                          * inserted this event into the proc queue...
 2251                          * since waitevent can't see this event until we 
 2252                          * enqueue it, waitevent will see a 'consistent'
 2253                          * snapshot of the event, even though it won't hold
 2254                          * the socket lock, and we're updating the event outside
 2255                          * of the proc lock, which it will hold
 2256                          */
 2257                         evq->ee_req.er_eventbits |= mask;
 2258 
 2259                         evprocenque(evq);
 2260                 }
 2261         }
 2262         KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, (int)sp, 0, 0, 0, 0);
 2263 }
 2264 #endif /* SOCKETS */
 2265 
 2266 
 2267 /*
 2268  * watchevent system call. user passes us an event to watch
 2269  * for. we malloc an event object, initialize it, and queue
 2270  * it to the open socket. when the event occurs, postevent()
 2271  * will enque it back to our proc where we can retrieve it
 2272  * will enqueue it back to our proc where we can retrieve it
 2273  *
 2274  * should this prevent duplicate events on same socket?
 2275  *
 2276  * Returns:
 2277  *              ENOMEM                  No memory for operation
 2278  *      copyin:EFAULT
 2279  */
 2280 int
 2281 watchevent(proc_t p, struct watchevent_args *uap, __unused int *retval)
 2282 {
 2283         struct eventqelt *evq = (struct eventqelt *)0;
 2284         struct eventqelt *np = NULL;
 2285         struct eventreq64 *erp;
 2286         struct fileproc *fp = NULL;
 2287         int error;
 2288 
 2289         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
 2290 
 2291         // get a qelt and fill with users req
 2292         MALLOC(evq, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
 2293 
 2294         if (evq == NULL)
 2295                 return (ENOMEM);
 2296         erp = &evq->ee_req;
 2297 
 2298         // get users request pkt
 2299 
 2300         if (IS_64BIT_PROCESS(p)) {
 2301                 error = copyin(uap->u_req, (caddr_t)erp, sizeof(struct eventreq64));
 2302         } else {
 2303                 struct eventreq32 er32;
 2304 
 2305                 error = copyin(uap->u_req, (caddr_t)&er32, sizeof(struct eventreq32));
 2306                 if (error == 0) {
 2307                        /*
 2308                         * the user only passes in the
 2309                         * er_type, er_handle and er_data...
 2310                         * the other fields are initialized
 2311                         * below, so don't bother to copy
 2312                         */
 2313                         erp->er_type = er32.er_type;
 2314                         erp->er_handle = er32.er_handle;
 2315                         erp->er_data = (user_addr_t)er32.er_data;
 2316                 }
 2317         }
 2318         if (error) {
 2319                 FREE(evq, M_TEMP);
 2320                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 2321 
 2322                 return(error);          
 2323         }
 2324         KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
 2325 
 2326         // validate, freeing qelt if errors
 2327         error = 0;
 2328         proc_fdlock(p);
 2329 
 2330         if (erp->er_type != EV_FD) {
 2331                 error = EINVAL;
 2332         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
 2333                 error = EBADF;
 2334 #if SOCKETS
 2335         } else if (fp->f_type == DTYPE_SOCKET) {
 2336                 socket_lock((struct socket *)fp->f_data, 1);
 2337                 np = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
 2338 #endif /* SOCKETS */
 2339         } else if (fp->f_type == DTYPE_PIPE) {
 2340                 PIPE_LOCK((struct pipe *)fp->f_data);
 2341                 np = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
 2342         } else {
 2343                 fp_drop(p, erp->er_handle, fp, 1);
 2344                 error = EINVAL;
 2345         }
 2346         proc_fdunlock(p);
 2347 
 2348         if (error) {
 2349                 FREE(evq, M_TEMP);
 2350 
 2351                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 2352                 return(error);
 2353         }
 2354                 
 2355         /*
 2356          * only allow one watch per file per proc
 2357          */
 2358         for ( ; np != NULL; np = np->ee_slist.tqe_next) {
 2359                 if (np->ee_proc == p) {
 2360 #if SOCKETS
 2361                         if (fp->f_type == DTYPE_SOCKET)
 2362                                 socket_unlock((struct socket *)fp->f_data, 1);
 2363                         else 
 2364 #endif /* SOCKETS */
 2365                                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2366                         fp_drop(p, erp->er_handle, fp, 0);
 2367                         FREE(evq, M_TEMP);
 2368                         
 2369                         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
 2370                         return(EINVAL);
 2371                 }
 2372         }
 2373         erp->er_ecnt = erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
 2374         evq->ee_proc = p;
 2375         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
 2376         evq->ee_flags = 0;
 2377 
 2378 #if SOCKETS
 2379         if (fp->f_type == DTYPE_SOCKET) {
 2380                 TAILQ_INSERT_TAIL(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
 2381                 postevent((struct socket *)fp->f_data, 0, EV_RWBYTES); // catch existing events
 2382 
 2383                 socket_unlock((struct socket *)fp->f_data, 1);
 2384         } else
 2385 #endif /* SOCKETS */
 2386         {
 2387                 TAILQ_INSERT_TAIL(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
 2388                 postpipeevent((struct pipe *)fp->f_data, EV_RWBYTES);
 2389 
 2390                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2391         }
 2392         fp_drop_event(p, erp->er_handle, fp);
 2393 
 2394         KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
 2395         return(0);
 2396 }
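
/*
 * (Editor's example; not part of the original file.)  The legacy
 * watchevent()/waitevent()/modwatch() cycle implemented in this file,
 * seen from user space.  The exact user-space declarations live in
 * <sys/ev.h>; treat the calls below as an illustration only, with
 * 'sock_fd' a hypothetical descriptor.
 */
#if 0
        struct eventreq req;
        struct timeval tv = { 5, 0 };

        bzero(&req, sizeof(req));
        req.er_type = EV_FD;            /* only EV_FD is accepted */
        req.er_handle = sock_fd;        /* a socket or pipe fd */

        watchevent(&req, EV_RE | EV_WR);        /* arm the watch */

        waitevent(&req, &tv);           /* wait up to 5s for a post */
        /* req.er_eventbits now holds the EV_* bits that fired */

        modwatch(&req, EV_RE);          /* re-arm for read events only */
        modwatch(&req, EV_RM);          /* or tear the watch down */
#endif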
 2397 
 2398 
 2399 
 2400 /*
 2401  * waitevent system call.
 2402  * grabs the next waiting event for this proc and returns
 2403  * it. if no events are pending, the user can sleep with or
 2404  * without a timeout, or poll without blocking:
 2405  *    ((tv != NULL && interval == 0) || tv == -1)
 2406  */
 2407 int
 2408 waitevent(proc_t p, struct waitevent_args *uap, int *retval)
 2409 {
 2410         int error = 0;
 2411         struct eventqelt *evq;
 2412         struct eventreq64 *erp;
 2413         uint64_t abstime, interval;
 2414         boolean_t fast_poll = FALSE;
 2415         union {
 2416                 struct eventreq64 er64;
 2417                 struct eventreq32 er32;
 2418         } uer;
 2419 
 2420         interval = 0;
 2421 
 2422         if (uap->tv) {
 2423                 struct timeval atv;
 2424                 /*
 2425                  * check for fast poll method
 2426                  */
 2427                 if (IS_64BIT_PROCESS(p)) {
 2428                         if (uap->tv == (user_addr_t)-1)
 2429                                 fast_poll = TRUE;
 2430                 } else if (uap->tv == (user_addr_t)((uint32_t)-1))
 2431                         fast_poll = TRUE;
 2432 
 2433                 if (fast_poll == TRUE) {
 2434                         if (p->p_evlist.tqh_first == NULL) {
 2435                                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_NONE, -1,0,0,0,0);
 2436                                 /*
 2437                                  * poll failed
 2438                                  */
 2439                                 *retval = 1;
 2440                                 return (0);
 2441                         }
 2442                         proc_lock(p);
 2443                         goto retry;
 2444                 }
 2445                 if (IS_64BIT_PROCESS(p)) {
 2446                         struct user64_timeval atv64;
 2447                         error = copyin(uap->tv, (caddr_t)&atv64, sizeof(atv64));
 2448                         /* Loses resolution - assume timeout < 68 years */
 2449                         atv.tv_sec = atv64.tv_sec;
 2450                         atv.tv_usec = atv64.tv_usec;
 2451                 } else {
 2452                         struct user32_timeval atv32;
 2453                         error = copyin(uap->tv, (caddr_t)&atv32, sizeof(atv32));
 2454                         atv.tv_sec = atv32.tv_sec;
 2455                         atv.tv_usec = atv32.tv_usec;
 2456                 }
 2457 
 2458                 if (error)
 2459                         return(error);
 2460                 if (itimerfix(&atv)) {
 2461                         error = EINVAL;
 2462                         return(error);
 2463                 }
 2464                 interval = tvtoabstime(&atv);
 2465         }
 2466         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
 2467 
 2468         proc_lock(p);
 2469 retry:
 2470         if ((evq = p->p_evlist.tqh_first) != NULL) {
 2471                 /*
 2472                  * found one... make a local copy while it's still on the queue
 2473                  * to prevent it from changing while in the midst of copying
 2474                  * don't want to hold the proc lock across a copyout because
 2475                  * it might block on a page fault at the target in user space
 2476                  */
 2477                 erp = &evq->ee_req;
 2478 
 2479                 if (IS_64BIT_PROCESS(p))
 2480                         bcopy((caddr_t)erp, (caddr_t)&uer.er64, sizeof (struct eventreq64));
 2481                 else {
 2482                         uer.er32.er_type  = erp->er_type;
 2483                         uer.er32.er_handle  = erp->er_handle;
 2484                         uer.er32.er_data  = (uint32_t)erp->er_data;
 2485                         uer.er32.er_ecnt  = erp->er_ecnt;
 2486                         uer.er32.er_rcnt  = erp->er_rcnt;
 2487                         uer.er32.er_wcnt  = erp->er_wcnt;
 2488                         uer.er32.er_eventbits = erp->er_eventbits;
 2489                 }
 2490                 TAILQ_REMOVE(&p->p_evlist, evq, ee_plist);
 2491 
 2492                 evq->ee_flags &= ~EV_QUEUED;
 2493 
 2494                 proc_unlock(p);
 2495 
 2496                 if (IS_64BIT_PROCESS(p))
 2497                         error = copyout((caddr_t)&uer.er64, uap->u_req, sizeof(struct eventreq64));
 2498                 else
 2499                         error = copyout((caddr_t)&uer.er32, uap->u_req, sizeof(struct eventreq32));
 2500 
 2501                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
 2502                              evq->ee_req.er_handle,evq->ee_req.er_eventbits,(uint32_t)evq,0);
 2503                 return (error);
 2504         }
 2505         else {
 2506                 if (uap->tv && interval == 0) {
 2507                         proc_unlock(p);
 2508                         *retval = 1;  // poll failed
 2509 
 2510                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
 2511                         return (error);
 2512                 }
 2513                 if (interval != 0)
 2514                         clock_absolutetime_interval_to_deadline(interval, &abstime);
 2515                 else
 2516                         abstime = 0;
 2517 
 2518                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,(uint32_t)&p->p_evlist,0,0,0);
 2519 
 2520                 error = msleep1(&p->p_evlist, &p->p_mlock, (PSOCK | PCATCH), "waitevent", abstime);
 2521 
 2522                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,(uint32_t)&p->p_evlist,0,0,0);
 2523 
 2524                 if (error == 0)
 2525                         goto retry;
 2526                 if (error == ERESTART)
 2527                         error = EINTR;
 2528                 if (error == EWOULDBLOCK) {
 2529                         *retval = 1;
 2530                         error = 0;
 2531                 }
 2532         }
 2533         proc_unlock(p);
 2534 
 2535         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
 2536         return (error);
 2537 }
 2538 
 2539 
 2540 /*
 2541  * modwatch system call. user passes in event to modify.
 2542  * if we find it we reset the event bits and queue/dequeue
 2543  * the event as needed.
 2544  */
 2545 int
 2546 modwatch(proc_t p, struct modwatch_args *uap, __unused int *retval)
 2547 {
 2548         struct eventreq64 er;
 2549         struct eventreq64 *erp = &er;
 2550         struct eventqelt *evq = NULL;   /* protected by error return */
 2551         int error;
 2552         struct fileproc *fp;
 2553         int flag;
 2554 
 2555         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
 2556 
 2557         /*
 2558          * get user's request pkt
 2559          * just need the er_type and er_handle which sit above the
 2560          * problematic er_data (32/64 issue)... so only copy in
 2561          * those 2 fields
 2562          */
 2563         if ((error = copyin(uap->u_req, (caddr_t)erp, sizeof(er.er_type) + sizeof(er.er_handle)))) {
 2564                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
 2565                 return(error);
 2566         }
 2567         proc_fdlock(p);
 2568 
 2569         if (erp->er_type != EV_FD) {
 2570                 error = EINVAL;
 2571         } else if ((error = fp_lookup(p, erp->er_handle, &fp, 1)) != 0) {
 2572                 error = EBADF;
 2573 #if SOCKETS
 2574         } else if (fp->f_type == DTYPE_SOCKET) {
 2575                 socket_lock((struct socket *)fp->f_data, 1);
 2576                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
 2577 #endif /* SOCKETS */
 2578         } else if (fp->f_type == DTYPE_PIPE) {
 2579                 PIPE_LOCK((struct pipe *)fp->f_data);
 2580                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
 2581         } else {
 2582                 fp_drop(p, erp->er_handle, fp, 1);
 2583                 error = EINVAL;
 2584         }
 2585 
 2586         if (error) {
 2587                 proc_fdunlock(p);
 2588                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, error,0,0,0,0);
 2589                 return(error);
 2590         }
 2591 
 2592         if ((uap->u_eventmask == EV_RM) && (fp->f_flags & FP_WAITEVENT)) {
 2593                 fp->f_flags &= ~FP_WAITEVENT;
 2594         }
 2595         proc_fdunlock(p);
 2596 
 2597         // locate event if possible
 2598         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
 2599                 if (evq->ee_proc == p)
 2600                         break;
 2601         }
 2602         if (evq == NULL) {
 2603 #if SOCKETS
 2604                 if (fp->f_type == DTYPE_SOCKET) 
 2605                         socket_unlock((struct socket *)fp->f_data, 1);
 2606                 else
 2607 #endif /* SOCKETS */
 2608                         PIPE_UNLOCK((struct pipe *)fp->f_data);
 2609                 fp_drop(p, erp->er_handle, fp, 0);
 2610                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
 2611                 return(EINVAL);
 2612         }
 2613         KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,(uint32_t)evq,0,0);
 2614 
 2615         if (uap->u_eventmask == EV_RM) {
 2616                 EVPROCDEQUE(p, evq);
 2617 
 2618 #if SOCKETS
 2619                 if (fp->f_type == DTYPE_SOCKET) {
 2620                         TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
 2621                         socket_unlock((struct socket *)fp->f_data, 1);
 2622                 } else
 2623 #endif /* SOCKETS */
 2624                 {
 2625                         TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
 2626                         PIPE_UNLOCK((struct pipe *)fp->f_data);
 2627                 }
 2628                 fp_drop(p, erp->er_handle, fp, 0);
 2629                 FREE(evq, M_TEMP);
 2630                 KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
 2631                 return(0);
 2632         }
 2633         switch (uap->u_eventmask & EV_MASK) {
 2634  
 2635         case 0:
 2636                 flag = 0;
 2637                 break;
 2638 
 2639         case EV_RE:
 2640         case EV_WR:
 2641         case EV_RE|EV_WR:
 2642                 flag = EV_RWBYTES;
 2643                 break;
 2644 
 2645         case EV_EX:
 2646                 flag = EV_OOB;
 2647                 break;
 2648 
 2649         case EV_EX|EV_RE:
 2650         case EV_EX|EV_WR:
 2651         case EV_EX|EV_RE|EV_WR:
 2652                 flag = EV_OOB|EV_RWBYTES;
 2653                 break;
 2654 
 2655         default:
 2656 #if SOCKETS
 2657                 if (fp->f_type == DTYPE_SOCKET) 
 2658                         socket_unlock((struct socket *)fp->f_data, 1);
 2659                 else 
 2660 #endif /* SOCKETS */
 2661                         PIPE_UNLOCK((struct pipe *)fp->f_data);
 2662                 fp_drop(p, erp->er_handle, fp, 0);
 2663                 KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
 2664                 return(EINVAL);
 2665         }
 2666         /*
 2667          * since we're holding the socket/pipe lock, the event
 2668          * cannot go from the unqueued state to the queued state
 2669          * however, it can go from the queued state to the unqueued state
 2670          * since that direction is protected by the proc_lock...
 2671          * so do a quick check for EV_QUEUED w/o holding the proc lock
 2672          * since by far the common case will be NOT EV_QUEUED, this saves
 2673          * us taking the proc_lock the majority of the time
 2674          */
 2675         if (evq->ee_flags & EV_QUEUED) {
 2676                 /*
 2677                  * EVPROCDEQUE will recheck the state after it grabs the proc_lock
 2678                  */
 2679                 EVPROCDEQUE(p, evq);
 2680         }
 2681         /*
 2682          * while the event is off the proc queue and
 2683          * we're holding the socket/pipe lock
 2684          * it's safe to update these fields...
 2685          */
 2686         evq->ee_req.er_eventbits = 0;
 2687         evq->ee_eventmask = uap->u_eventmask & EV_MASK;
 2688 
 2689 #if SOCKETS
 2690         if (fp->f_type == DTYPE_SOCKET) {
 2691                 postevent((struct socket *)fp->f_data, 0, flag);
 2692                 socket_unlock((struct socket *)fp->f_data, 1);
 2693         } else
 2694 #endif /* SOCKETS */
 2695         {
 2696                 postpipeevent((struct pipe *)fp->f_data, flag);
 2697                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2698         }
 2699         fp_drop(p, erp->er_handle, fp, 0);
 2700         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,(uint32_t)fp->f_data,flag,0);
 2701         return(0);
 2702 }
 2703 
 2704 /* this routine is called from the close of fd with proc_fdlock held */
 2705 int
 2706 waitevent_close(struct proc *p, struct fileproc *fp)
 2707 {
 2708         struct eventqelt *evq;
 2709 
 2710 
 2711         fp->f_flags &= ~FP_WAITEVENT;
 2712 
 2713 #if SOCKETS
 2714         if (fp->f_type == DTYPE_SOCKET) {
 2715                 socket_lock((struct socket *)fp->f_data, 1);
 2716                 evq = ((struct socket *)fp->f_data)->so_evlist.tqh_first;
 2717         } else
 2718 #endif /* SOCKETS */
 2719         if (fp->f_type == DTYPE_PIPE) {
 2720                 PIPE_LOCK((struct pipe *)fp->f_data);
 2721                 evq = ((struct pipe *)fp->f_data)->pipe_evlist.tqh_first;
 2722         }
 2723         else {
 2724                 return(EINVAL);
 2725         }
 2726         proc_fdunlock(p);
 2727 
 2728 
 2729         // locate event if possible
 2730         for ( ; evq != NULL; evq = evq->ee_slist.tqe_next) {
 2731                 if (evq->ee_proc == p)
 2732                         break;
 2733         }
 2734         if (evq == NULL) {
 2735 #if SOCKETS
 2736                 if (fp->f_type == DTYPE_SOCKET) 
 2737                         socket_unlock((struct socket *)fp->f_data, 1);
 2738                 else 
 2739 #endif /* SOCKETS */
 2740                         PIPE_UNLOCK((struct pipe *)fp->f_data);
 2741 
 2742                 proc_fdlock(p);
 2743 
 2744                 return(EINVAL);
 2745         }
 2746         EVPROCDEQUE(p, evq);
 2747 
 2748 #if SOCKETS
 2749         if (fp->f_type == DTYPE_SOCKET) {
 2750                 TAILQ_REMOVE(&((struct socket *)fp->f_data)->so_evlist, evq, ee_slist);
 2751                 socket_unlock((struct socket *)fp->f_data, 1);
 2752         } else
 2753 #endif /* SOCKETS */
 2754         {
 2755                 TAILQ_REMOVE(&((struct pipe *)fp->f_data)->pipe_evlist, evq, ee_slist);
 2756                 PIPE_UNLOCK((struct pipe *)fp->f_data);
 2757         }
 2758         FREE(evq, M_TEMP);
 2759 
 2760         proc_fdlock(p);
 2761 
 2762         return(0);
 2763 }
 2764 
 2765 
 2766 /*
 2767  * gethostuuid
 2768  *
 2769  * Description: Get the host UUID from IOKit and return it to user space.
 2770  *
 2771  * Parameters:  uuid_buf                Pointer to buffer to receive UUID
 2772  *              timeout                 Timespec for timeout
 2773  *
 2774  * Returns:     0                       Success
 2775  *              EWOULDBLOCK             Timeout is too short
 2776  *              copyout:EFAULT          Bad user buffer
 2777  *
 2778  * Notes:       A timeout seems redundant, since if it's tolerable to not
 2779  *              have a system UUID in hand, then why ask for one?
 2780  */
 2781 int
 2782 gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retval)
 2783 {
 2784         kern_return_t kret;
 2785         int error;
 2786         mach_timespec_t mach_ts;        /* for IOKit call */
 2787         __darwin_uuid_t uuid_kern;      /* for IOKit call */
 2788 
 2789         /* Convert the 32/64 bit timespec into a mach_timespec_t */
 2790         if ( proc_is64bit(p) ) {
 2791                 struct user64_timespec ts;
 2792                 error = copyin(uap->timeoutp, &ts, sizeof(ts));
 2793                 if (error)
 2794                         return (error);
 2795                 mach_ts.tv_sec = ts.tv_sec;
 2796                 mach_ts.tv_nsec = ts.tv_nsec;
 2797         } else {
 2798                 struct user32_timespec ts;
 2799                 error = copyin(uap->timeoutp, &ts, sizeof(ts) );
 2800                 if (error)
 2801                         return (error);
 2802                 mach_ts.tv_sec = ts.tv_sec;
 2803                 mach_ts.tv_nsec = ts.tv_nsec;
 2804         }
 2805 
 2806         /* Call IOKit with the stack buffer to get the UUID */
 2807         kret = IOBSDGetPlatformUUID(uuid_kern, mach_ts);
 2808 
 2809         /*
 2810          * If we get it, copy out the data to the user buffer; note that a
 2811          * uuid_t is an array of characters, so this is size invariant for
 2812          * 32 vs. 64 bit.
 2813          */
 2814         if (kret == KERN_SUCCESS) {
 2815                 error = copyout(uuid_kern, uap->uuid_buf, sizeof(uuid_kern));
 2816         } else {
 2817                 error = EWOULDBLOCK;
 2818         }
 2819 
 2820         return (error);
 2821 }
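
/*
 * (Editor's example; not part of the original file.)  Calling the
 * user-space gethostuuid(2) wrapper for the syscall above, assuming
 * the macOS libc declaration
 * int gethostuuid(uuid_t, const struct timespec *).
 */
#if 0
        uuid_t uu;
        uuid_string_t str;
        struct timespec wait = { 5, 0 };        /* give IOKit 5 seconds */

        if (gethostuuid(uu, &wait) == 0) {
                uuid_unparse(uu, str);
                printf("host UUID: %s\n", str);
        }
#endif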
