The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/bsd/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_LICENSE_HEADER_START@
    5  * 
    6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
    7  * 
    8  * This file contains Original Code and/or Modifications of Original Code
    9  * as defined in and that are subject to the Apple Public Source License
   10  * Version 2.0 (the 'License'). You may not use this file except in
   11  * compliance with the License. Please obtain a copy of the License at
   12  * http://www.opensource.apple.com/apsl/ and read it before using this
   13  * file.
   14  * 
   15  * The Original Code and all software distributed under the License are
   16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   20  * Please see the License for the specific language governing rights and
   21  * limitations under the License.
   22  * 
   23  * @APPLE_LICENSE_HEADER_END@
   24  */
   25 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
   26 /*
   27  * Copyright (c) 1982, 1986, 1989, 1993
   28  *      The Regents of the University of California.  All rights reserved.
   29  * (c) UNIX System Laboratories, Inc.
   30  * All or some portions of this file are derived from material licensed
   31  * to the University of California by American Telephone and Telegraph
   32  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   33  * the permission of UNIX System Laboratories, Inc.
   34  *
   35  * Redistribution and use in source and binary forms, with or without
   36  * modification, are permitted provided that the following conditions
   37  * are met:
   38  * 1. Redistributions of source code must retain the above copyright
   39  *    notice, this list of conditions and the following disclaimer.
   40  * 2. Redistributions in binary form must reproduce the above copyright
   41  *    notice, this list of conditions and the following disclaimer in the
   42  *    documentation and/or other materials provided with the distribution.
   43  * 3. All advertising materials mentioning features or use of this software
   44  *    must display the following acknowledgement:
   45  *      This product includes software developed by the University of
   46  *      California, Berkeley and its contributors.
   47  * 4. Neither the name of the University nor the names of its contributors
   48  *    may be used to endorse or promote products derived from this software
   49  *    without specific prior written permission.
   50  *
   51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   61  * SUCH DAMAGE.
   62  *
   63  *      @(#)sys_generic.c       8.9 (Berkeley) 2/14/95
   64  */
   65 
   66 #include <sys/param.h>
   67 #include <sys/systm.h>
   68 #include <sys/filedesc.h>
   69 #include <sys/ioctl.h>
   70 #include <sys/file.h>
   71 #include <sys/proc.h>
   72 #include <sys/socketvar.h>
   73 #include <sys/uio.h>
   74 #include <sys/kernel.h>
   75 #include <sys/stat.h>
   76 #include <sys/malloc.h>
   77 
   78 #include <sys/mount.h>
   79 #include <sys/protosw.h>
   80 #include <sys/ev.h>
   81 #include <sys/user.h>
   82 #include <sys/kdebug.h>
   83 #include <kern/assert.h>
   84 #include <kern/thread_act.h>
   85 
   86 #include <sys/mbuf.h>
   87 #include <sys/socket.h>
   88 #include <sys/socketvar.h>
   89 #include <sys/errno.h>
   90 #include <sys/syscall.h>
   91 
   92 #include <net/if.h>
   93 #include <net/route.h>
   94 
   95 #include <netinet/in.h>
   96 #include <netinet/in_systm.h>
   97 #include <netinet/ip.h>
   98 #include <netinet/in_pcb.h>
   99 #include <netinet/ip_var.h>
  100 #include <netinet/ip6.h>
  101 #include <netinet/tcp.h>
  102 #include <netinet/tcp_fsm.h>
  103 #include <netinet/tcp_seq.h>
  104 #include <netinet/tcp_timer.h>
  105 #include <netinet/tcp_var.h>
  106 #include <netinet/tcpip.h>
  107 #include <netinet/tcp_debug.h>
  108 /* for wait queue based select */
  109 #include <kern/wait_queue.h>
  110 #if KTRACE 
  111 #include <sys/ktrace.h>
  112 #endif
  113 #include <sys/vnode.h>
  114 
  115 
  116 __private_extern__ struct file*
  117 holdfp(fdp, fd, flag) 
  118         struct filedesc* fdp;
  119         int fd, flag;
  120 {
  121         struct file* fp;
  122 
  123         if (((u_int)fd) >= fdp->fd_nfiles ||
  124                 (fp = fdp->fd_ofiles[fd]) == NULL ||
  125                 (fp->f_flag & flag) == 0) {
  126                         return (NULL);
  127         }
  128         if (fref(fp) == -1)
  129                 return (NULL);
  130         return (fp);   
  131 }
  132 
  133 /*
  134  * Read system call.
  135  */
  136 #ifndef _SYS_SYSPROTO_H_
  137 struct read_args {
  138         int fd;
  139         char *cbuf;
  140         u_int nbyte;
  141 };
  142 #endif
  143 int
  144 read(p, uap, retval)
  145         struct proc *p;
  146         register struct read_args *uap;
  147         register_t *retval;
  148 {
  149         register struct file *fp;
  150         int error;
  151 
  152         if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
  153                 return (EBADF);
  154         error = dofileread(p, fp, uap->fd, uap->cbuf, uap->nbyte,
  155                         (off_t)-1, 0, retval);
  156         frele(fp);
  157         return(error);
  158 }
  159 
  160 /* 
  161  * Pread system call
  162  */
  163 #ifndef _SYS_SYSPROTO_H_
  164 struct pread_args {
  165         int     fd;
  166         void    *buf;
  167         size_t  nbyte;
  168 #ifdef DOUBLE_ALIGN_PARAMS
  169         int     pad;
  170 #endif
  171         off_t   offset;
  172 };
  173 #endif
  174 int
  175 pread(p, uap, retval)
  176         struct proc *p;
  177         register struct pread_args *uap;
  178         int *retval;
  179 {
  180         register struct file *fp;
  181         int error;
  182 
  183         if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL)
  184                 return (EBADF);
  185         if (fp->f_type != DTYPE_VNODE) {
  186                 error = ESPIPE;
  187         } else {
  188                 error = dofileread(p, fp, uap->fd, uap->buf, uap->nbyte,
  189                                 uap->offset, FOF_OFFSET, retval);
  190         }
  191         frele(fp);
  192         
  193         if (!error)
  194             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pread) | DBG_FUNC_NONE),
  195               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
  196         
  197         return(error);
  198 }
  199 
  200 /*
  201  * Code common for read and pread
  202  */
  203 __private_extern__ int
  204 dofileread(p, fp, fd, buf, nbyte, offset, flags, retval)
  205         struct proc *p;
  206         struct file *fp;
  207         int fd, flags;
  208         void *buf;
  209         size_t nbyte;
  210         off_t offset;
  211         int *retval;
  212 {
  213         struct uio auio;
  214         struct iovec aiov;
  215         long cnt, error = 0;
  216 #if KTRACE
  217         struct iovec ktriov;
  218         struct uio ktruio;
  219         int didktr = 0;
  220 #endif
  221 
  222         aiov.iov_base = (caddr_t)buf;
  223         aiov.iov_len = nbyte;
  224         auio.uio_iov = &aiov;
  225         auio.uio_iovcnt = 1;
  226         auio.uio_offset = offset;
  227         if (nbyte > INT_MAX)
  228                 return (EINVAL);
  229         auio.uio_resid = nbyte;
  230         auio.uio_rw = UIO_READ;
  231         auio.uio_segflg = UIO_USERSPACE;
  232         auio.uio_procp = p;
  233 #if KTRACE
  234         /*
  235         * if tracing, save a copy of iovec
  236         */
  237         if (KTRPOINT(p, KTR_GENIO)) {
  238                 ktriov = aiov;
  239                 ktruio = auio;
  240                 didktr = 1;
  241         }
  242 #endif
  243         cnt = nbyte;
  244 
  245         if ((error = fo_read(fp, &auio, fp->f_cred, flags, p))) {
  246                 if (auio.uio_resid != cnt && (error == ERESTART ||
  247                         error == EINTR || error == EWOULDBLOCK))
  248                         error = 0;
  249         }
  250         cnt -= auio.uio_resid;
  251 #if KTRACE
  252         if (didktr && error == 0) {
  253                 ktruio.uio_iov = &ktriov;
  254                 ktruio.uio_resid = cnt;
  255                 ktrgenio(p->p_tracep, fd, UIO_READ, &ktruio, error,
  256                     KERNEL_FUNNEL);
  257         }
  258 #endif  
  259         *retval = cnt;
  260         return (error);
  261 }
  262 
  263 /*      
  264  * Scatter read system call.
  265  */
  266 #ifndef _SYS_SYSPROTO_H_
  267 struct readv_args {
  268         int fd;
  269         struct iovec *iovp;
  270         u_int iovcnt;
  271 };
  272 #endif
  273 int
  274 readv(p, uap, retval)
  275         struct proc *p;
  276         register struct readv_args *uap;
  277         int *retval;
  278 {
  279         struct uio auio;
  280         register struct iovec *iov;
  281         int error;
  282         struct iovec aiov[UIO_SMALLIOV];
  283 
  284         if (uap->iovcnt > UIO_SMALLIOV) {
  285                 if (uap->iovcnt > UIO_MAXIOV)
  286                         return (EINVAL);        
  287                 if ((iov = (struct iovec *)
  288                             kalloc(sizeof(struct iovec) * (uap->iovcnt))) == 0)
  289                         return (ENOMEM);
  290         } else
  291                 iov = aiov;
  292         auio.uio_iov = iov;
  293         auio.uio_iovcnt = uap->iovcnt;
  294         auio.uio_rw = UIO_READ;
  295         error = copyin((caddr_t)uap->iovp, (caddr_t)iov,
  296             uap->iovcnt * sizeof (struct iovec));
  297         if (!error)
  298                 error = rwuio(p, uap->fd, &auio, UIO_READ, retval);
  299         if (uap->iovcnt > UIO_SMALLIOV)
  300                 kfree(iov, sizeof(struct iovec)*uap->iovcnt);
  301         return (error);
  302 }
  303 
  304 /*
  305  * Write system call
  306  */
  307 #ifndef _SYS_SYSPROTO_H_
  308 struct write_args {
  309         int fd;
  310         char *cbuf;
  311         u_int nbyte;
  312 };
  313 #endif
  314 int
  315 write(p, uap, retval)
  316         struct proc *p;
  317         register struct write_args *uap;
  318         int *retval;
  319 {
  320         register struct file *fp;
  321         int error;      
  322 
  323         if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
  324                 return (EBADF);
  325         error = dofilewrite(p, fp, uap->fd, uap->cbuf, uap->nbyte,
  326                         (off_t)-1, 0, retval);
  327         frele(fp);
  328         return(error);  
  329 }
  330 
  331 /*                          
  332  * Pwrite system call
  333  */
  334 #ifndef _SYS_SYSPROTO_H_
  335 struct pwrite_args {
  336         int     fd;
  337         const void *buf;
  338         size_t  nbyte;
  339 #ifdef DOUBLE_ALIGN_PARAMS
  340         int     pad;
  341 #endif
  342         off_t   offset;
  343 };      
  344 #endif
  345 int
  346 pwrite(p, uap, retval)
  347         struct proc *p;
  348         register struct pwrite_args *uap;
  349         int *retval;    
  350 {
  351         register struct file *fp;
  352         int error; 
  353 
  354         if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL)
  355                 return (EBADF);
  356         if (fp->f_type != DTYPE_VNODE) {
  357                 error = ESPIPE;
  358         } else {
  359             error = dofilewrite(p, fp, uap->fd, uap->buf, uap->nbyte,
  360                 uap->offset, FOF_OFFSET, retval);
  361         }
  362         frele(fp);
  363 
  364         if (!error)
  365             KERNEL_DEBUG_CONSTANT((BSDDBG_CODE(DBG_BSD_SC_EXTENDED_INFO, SYS_pwrite) | DBG_FUNC_NONE),
  366               uap->fd, uap->nbyte, (unsigned int)((uap->offset >> 32)), (unsigned int)(uap->offset), 0);
  367         
  368         return(error);
  369 }
  370 
  371 __private_extern__ int                  
  372 dofilewrite(p, fp, fd, buf, nbyte, offset, flags, retval)
  373         struct proc *p;
  374         struct file *fp; 
  375         int fd, flags;
  376         const void *buf;
  377         size_t nbyte;   
  378         off_t offset; 
  379         int *retval;
  380 {       
  381         struct uio auio;
  382         struct iovec aiov;
  383         long cnt, error = 0;
  384 #if KTRACE
  385         struct iovec ktriov;
  386         struct uio ktruio;
  387         int didktr = 0; 
  388 #endif
  389         
  390         aiov.iov_base = (void *)(uintptr_t)buf;
  391         aiov.iov_len = nbyte;
  392         auio.uio_iov = &aiov;
  393         auio.uio_iovcnt = 1;   
  394         auio.uio_offset = offset;
  395         if (nbyte > INT_MAX)   
  396                 return (EINVAL);
  397         auio.uio_resid = nbyte;
  398         auio.uio_rw = UIO_WRITE;
  399         auio.uio_segflg = UIO_USERSPACE;
  400         auio.uio_procp = p;
  401 #if KTRACE
  402         /*
  403         * if tracing, save a copy of iovec and uio
  404         */
  405         if (KTRPOINT(p, KTR_GENIO)) {
  406                 ktriov = aiov;
  407                 ktruio = auio;
  408                 didktr = 1;
  409         }
  410 #endif  
  411         cnt = nbyte; 
  412         if (fp->f_type == DTYPE_VNODE)
  413                 bwillwrite();
  414         if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
  415                 if (auio.uio_resid != cnt && (error == ERESTART ||
  416                         error == EINTR || error == EWOULDBLOCK))
  417                         error = 0;
  418                 /* The socket layer handles SIGPIPE */
  419                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
  420                         psignal(p, SIGPIPE);
  421         }
  422         cnt -= auio.uio_resid;
  423 #if KTRACE 
  424         if (didktr && error == 0) {
  425                 ktruio.uio_iov = &ktriov;
  426                 ktruio.uio_resid = cnt;
  427                 ktrgenio(p->p_tracep, fd, UIO_WRITE, &ktruio, error,
  428                     KERNEL_FUNNEL);
  429         }
  430 #endif  
  431         *retval = cnt;
  432         return (error); 
  433 }
  434         
  435 /*      
  436  * Gather write system call  
  437  */     
  438 #ifndef _SYS_SYSPROTO_H_
  439 struct writev_args {
  440         int fd;
  441         struct iovec *iovp;
  442         u_int iovcnt;
  443 };
  444 #endif
  445 int
  446 writev(p, uap, retval)
  447         struct proc *p;
  448         register struct writev_args *uap;
  449         int *retval;
  450 {
  451         struct uio auio;
  452         register struct iovec *iov;
  453         int error;
  454         struct iovec aiov[UIO_SMALLIOV];
  455 
  456         if (uap->iovcnt > UIO_SMALLIOV) {
  457                 if (uap->iovcnt > UIO_MAXIOV)
  458                         return (EINVAL);        
  459                 if ((iov = (struct iovec *)
  460                             kalloc(sizeof(struct iovec) * (uap->iovcnt))) == 0)
  461                         return (ENOMEM);
  462         } else
  463                 iov = aiov;
  464         auio.uio_iov = iov;
  465         auio.uio_iovcnt = uap->iovcnt;
  466         auio.uio_rw = UIO_WRITE;
  467         error = copyin((caddr_t)uap->iovp, (caddr_t)iov,
  468             uap->iovcnt * sizeof (struct iovec));
  469         if (!error)
  470                 error = rwuio(p, uap->fd, &auio, UIO_WRITE, retval);
  471         if (uap->iovcnt > UIO_SMALLIOV)
  472                 kfree(iov, sizeof(struct iovec)*uap->iovcnt);
  473         return (error);
  474 }
  475 
  476 int
  477 rwuio(p, fdes, uio, rw, retval)
  478         struct proc *p;
  479         int fdes;
  480         register struct uio *uio;
  481         enum uio_rw rw;
  482         int *retval;
  483 {
  484         struct file *fp;
  485         register struct iovec *iov;
  486         int i, count, flag, error;
  487 #if KTRACE
  488         struct iovec *ktriov;
  489         struct uio ktruio;
  490         int didktr = 0;
  491         u_int iovlen;
  492 #endif
  493 
  494         if (error = fdgetf(p, fdes, &fp))
  495                 return (error);
  496 
  497         if ((fp->f_flag&(rw==UIO_READ ? FREAD : FWRITE)) == 0) {
  498                 return(EBADF);
  499         }
  500         uio->uio_resid = 0;
  501         uio->uio_segflg = UIO_USERSPACE;
  502         uio->uio_procp = p;
  503         iov = uio->uio_iov;
  504         for (i = 0; i < uio->uio_iovcnt; i++) {
  505                 if (iov->iov_len < 0) {
  506                         return(EINVAL);
  507                 }
  508                 uio->uio_resid += iov->iov_len;
  509                 if (uio->uio_resid < 0) {
  510                         return(EINVAL);
  511                 }
  512                 iov++;
  513         }
  514         count = uio->uio_resid;
  515 #if KTRACE
  516         /*
  517          * if tracing, save a copy of iovec
  518          */
  519         if (KTRPOINT(p, KTR_GENIO)) {
  520                 iovlen = uio->uio_iovcnt * sizeof (struct iovec);
  521                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  522                 bcopy((caddr_t)uio->uio_iov, (caddr_t)ktriov, iovlen);
  523                 ktruio = *uio;
  524                 didktr = 1;
  525         } 
  526 #endif  
  527 
  528         if (rw == UIO_READ) {
  529                 if (error = fo_read(fp, uio, fp->f_cred, 0, p))
  530                         if (uio->uio_resid != count && (error == ERESTART ||
  531                                 error == EINTR || error == EWOULDBLOCK))
  532                                 error = 0;
  533         } else {
  534                 if (fp->f_type == DTYPE_VNODE)
  535                         bwillwrite();
  536                 if (error = fo_write(fp, uio, fp->f_cred, 0, p)) {
  537                         if (uio->uio_resid != count && (error == ERESTART ||
  538                                 error == EINTR || error == EWOULDBLOCK))
  539                                 error = 0;
  540                         /* The socket layer handles SIGPIPE */
  541                         if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
  542                                 psignal(p, SIGPIPE);
  543                 }
  544         }
  545 
  546         *retval = count - uio->uio_resid;
  547 
  548 #if KTRACE
  549         if (didktr) {
  550                 if (error == 0) {
  551                         ktruio.uio_iov = ktriov; 
  552                         ktruio.uio_resid = *retval;
  553                         ktrgenio(p->p_tracep, fdes, rw, &ktruio, error,
  554                             KERNEL_FUNNEL);
  555                 }
  556                 FREE(ktriov, M_TEMP);
  557         }
  558 #endif
  559 
  560         return(error);
  561 }
  562 
  563 /*
  564  * Ioctl system call
  565  */
  566 #ifndef _SYS_SYSPROTO_H_
  567 struct ioctl_args {
  568         int fd;
  569         u_long com;
  570         caddr_t data;
  571 };
  572 #endif
  573 int
  574 ioctl(p, uap, retval)
  575         struct proc *p;
  576         register struct ioctl_args *uap;
  577         register_t *retval;
  578 {
  579         struct file *fp;
  580         register u_long com;
  581         register int error;
  582         register u_int size;
  583         caddr_t data, memp;
  584         int tmp;
  585 #define STK_PARAMS      128
  586         char stkbuf[STK_PARAMS];
  587 
  588         if (error = fdgetf(p, uap->fd, &fp))
  589                 return (error);
  590 
  591         if ((fp->f_flag & (FREAD | FWRITE)) == 0)
  592                 return (EBADF);
  593                 
  594 #if NETAT
  595         /*
  596          * ### LD 6/11/97 Hack Alert: this is to get AppleTalk to work
  597          * while implementing an ATioctl system call
  598          */
  599         {
  600                 extern int appletalk_inited;
  601 
  602                 if (appletalk_inited && ((uap->com & 0x0000FFFF) == 0xff99)) {
  603 #ifdef APPLETALK_DEBUG
  604                         kprintf("ioctl: special AppleTalk \n");
  605 #endif
  606                         error = fo_ioctl(fp, uap->com, uap->data, p);
  607                         return(error);
  608                 }
  609         }
  610 
  611 #endif /* NETAT */
  612 
  613 
  614         switch (com = uap->com) {
  615         case FIONCLEX:
  616                 *fdflags(p, uap->fd) &= ~UF_EXCLOSE;
  617                 return (0);
  618         case FIOCLEX:
  619                 *fdflags(p, uap->fd) |= UF_EXCLOSE;
  620                 return (0);
  621         }
  622 
  623         /*
  624          * Interpret high order word to find amount of data to be
  625          * copied to/from the user's address space.
  626          */
  627         size = IOCPARM_LEN(com);
  628         if (size > IOCPARM_MAX)
  629                 return (ENOTTY);
  630         memp = NULL;
  631         if (size > sizeof (stkbuf)) {
  632                 if ((memp = (caddr_t)kalloc(size)) == 0)
  633                         return(ENOMEM);
  634                 data = memp;
  635         } else
  636                 data = stkbuf;
  637         if (com&IOC_IN) {
  638                 if (size) {
  639                         error = copyin(uap->data, data, (u_int)size);
  640                         if (error) {
  641                                 if (memp)
  642                                         kfree(memp, size);
  643                                 return (error);
  644                         }
  645                 } else
  646                         *(caddr_t *)data = uap->data;
  647         } else if ((com&IOC_OUT) && size)
  648                 /*
  649                  * Zero the buffer so the user always
  650                  * gets back something deterministic.
  651                  */
  652                 bzero(data, size);
  653         else if (com&IOC_VOID)
  654                 *(caddr_t *)data = uap->data;
  655 
  656         switch (com) {
  657 
  658         case FIONBIO:
  659                 if (tmp = *(int *)data)
  660                         fp->f_flag |= FNONBLOCK;
  661                 else
  662                         fp->f_flag &= ~FNONBLOCK;
  663                 error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, p);
  664                 break;
  665 
  666         case FIOASYNC:
  667                 if (tmp = *(int *)data)
  668                         fp->f_flag |= FASYNC;
  669                 else
  670                         fp->f_flag &= ~FASYNC;
  671                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, p);
  672                 break;
  673 
  674         case FIOSETOWN:
  675                 tmp = *(int *)data;
  676                 if (fp->f_type == DTYPE_SOCKET) {
  677                         ((struct socket *)fp->f_data)->so_pgid = tmp;
  678                         error = 0;
  679                         break;
  680                 }
  681                 if (tmp <= 0) {
  682                         tmp = -tmp;
  683                 } else {
  684                         struct proc *p1 = pfind(tmp);
  685                         if (p1 == 0) {
  686                                 error = ESRCH;
  687                                 break;
  688                         }
  689                         tmp = p1->p_pgrp->pg_id;
  690                 }
  691                 error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
  692                 break;
  693 
  694         case FIOGETOWN:
  695                 if (fp->f_type == DTYPE_SOCKET) {
  696                         error = 0;
  697                         *(int *)data = ((struct socket *)fp->f_data)->so_pgid;
  698                         break;
  699                 }
  700                 error = fo_ioctl(fp, TIOCGPGRP, data, p);
  701                 *(int *)data = -*(int *)data;
  702                 break;
  703 
  704         default:
  705                 error = fo_ioctl(fp, com, data, p);
  706                 /*
  707                  * Copy any data to user, size was
  708                  * already set and checked above.
  709                  */
  710                 if (error == 0 && (com&IOC_OUT) && size)
  711                         error = copyout(data, uap->data, (u_int)size);
  712                 break;
  713         }
  714         if (memp)
  715                 kfree(memp, size);
  716         return (error);
  717 }
  718 
  719 int     selwait, nselcoll;
  720 #define SEL_FIRSTPASS 1
  721 #define SEL_SECONDPASS 2
  722 extern int selcontinue(int error);
  723 extern int selprocess(int error, int sel_pass);
  724 static int selscan(struct proc *p, struct _select * sel,
  725                         int nfd, register_t *retval, int sel_pass);
  726 static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits,
  727                         int nfd, int * count, int * nfcount);
  728 extern uint64_t tvtoabstime(struct timeval      *tvp);
  729 
  730 /*
  731  * Select system call.
  732  */
  733 #ifndef _SYS_SYSPROTO_H_
  734 struct select_args {
  735         int nd;
  736         u_int32_t *in;
  737         u_int32_t *ou;
  738         u_int32_t *ex;
  739         struct timeval *tv;
  740 };
  741 #endif
  742 int
  743 select(p, uap, retval)
  744         register struct proc *p;
  745         register struct select_args *uap;
  746         register_t *retval;
  747 {
  748         int error = 0;
  749         u_int ni, nw, size;
  750         thread_act_t th_act;
  751         struct uthread  *uth;
  752         struct _select *sel;
  753         int needzerofill = 1;
  754         int kfcount =0;
  755         int nfcount = 0;
  756         int count = 0;
  757 
  758         th_act = current_act();
  759         uth = get_bsdthread_info(th_act);
  760         sel = &uth->uu_state.ss_select;
  761         retval = (int *)get_bsduthreadrval(th_act);
  762         *retval = 0;
  763 
  764         if (uap->nd < 0) {
  765                 return (EINVAL);
  766         }
  767 
  768         if (uap->nd > p->p_fd->fd_nfiles)
  769                 uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
  770 
  771         nw = howmany(uap->nd, NFDBITS);
  772         ni = nw * sizeof(fd_mask);
  773 
  774         /*
  775          * if this is the first select by the thread 
  776          * allocate the space for bits.
  777          */
  778         if (sel->nbytes == 0) {
  779                 sel->nbytes = 3 * ni;
  780                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK);
  781                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK);
  782                 bzero((caddr_t)sel->ibits, sel->nbytes);
  783                 bzero((caddr_t)sel->obits, sel->nbytes);
  784                 needzerofill = 0;
  785         }
  786 
  787         /*
  788          * if the previously allocated space for the bits
  789          * is smaller than what is requested. Reallocate.
  790          */
  791         if (sel->nbytes < (3 * ni)) {
  792                 sel->nbytes = (3 * ni);
  793                 FREE(sel->ibits, M_TEMP);
  794                 FREE(sel->obits, M_TEMP);
  795                 MALLOC(sel->ibits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK);
  796                 MALLOC(sel->obits, u_int32_t *, sel->nbytes, M_TEMP, M_WAITOK);
  797                 bzero((caddr_t)sel->ibits, sel->nbytes);
  798                 bzero((caddr_t)sel->obits, sel->nbytes);
  799                 needzerofill = 0;
  800         }   
  801 
  802         if (needzerofill) {
  803                 bzero((caddr_t)sel->ibits, sel->nbytes);
  804                 bzero((caddr_t)sel->obits, sel->nbytes);
  805         }
  806 
  807         /*
  808          * get the bits from the user address space
  809          */
  810 #define getbits(name, x) \
  811         do { \
  812                 if (uap->name && (error = copyin((caddr_t)uap->name, \
  813                         (caddr_t)&sel->ibits[(x) * nw], ni))) \
  814                         goto continuation; \
  815         } while (0)
  816 
  817         getbits(in, 0);
  818         getbits(ou, 1);
  819         getbits(ex, 2);
  820 #undef  getbits
  821 
  822         if (uap->tv) {
  823                 struct timeval atv;
  824 
  825                 error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv));
  826                 if (error)
  827                         goto continuation;
  828                 if (itimerfix(&atv)) {
  829                         error = EINVAL;
  830                         goto continuation;
  831                 }
  832 
  833                 clock_absolutetime_interval_to_deadline(
  834                                                                                 tvtoabstime(&atv), &sel->abstime);
  835         }
  836         else
  837                 sel->abstime = 0;
  838 
  839         sel->nfcount = 0;
  840         if (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count, &nfcount)) {
  841                         goto continuation;
  842         }
  843 
  844         sel->nfcount = nfcount;
  845         sel->count = count;
  846         size = SIZEOF_WAITQUEUE_SUB + (count * SIZEOF_WAITQUEUE_LINK);
  847         if (sel->allocsize) {
  848                 if (uth->uu_wqsub == 0)
  849                         panic("select: wql memory smashed");
  850                 /* needed for the select now */
  851                 if (size > sel->allocsize) {
  852                         kfree(uth->uu_wqsub,  sel->allocsize);
  853                         sel->allocsize = size;
  854                         uth->uu_wqsub = (wait_queue_sub_t)kalloc(sel->allocsize);
  855                         if (uth->uu_wqsub == (wait_queue_sub_t)NULL)
  856                                 panic("failed to allocate memory for waitqueue\n");
  857                         sel->wql = (char *)uth->uu_wqsub + SIZEOF_WAITQUEUE_SUB;
  858                 }
  859         } else {
  860                 sel->count = count;
  861                 sel->allocsize = size;
  862                 uth->uu_wqsub = (wait_queue_sub_t)kalloc(sel->allocsize);
  863                 if (uth->uu_wqsub == (wait_queue_sub_t)NULL)
  864                         panic("failed to allocate memory for waitqueue\n");
  865                 sel->wql = (char *)uth->uu_wqsub + SIZEOF_WAITQUEUE_SUB;
  866         }
  867         bzero(uth->uu_wqsub, size);
  868         wait_queue_sub_init(uth->uu_wqsub, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST));
  869 
  870 continuation:
  871         return selprocess(error, SEL_FIRSTPASS);
  872 }
  873 
  874 int
  875 selcontinue(int error)
  876 {
  877         return selprocess(error, SEL_SECONDPASS);
  878 }
  879 
  880 int
  881 selprocess(error, sel_pass)
  882 {
  883         int ncoll;
  884         u_int ni, nw;
  885         thread_act_t th_act;
  886         struct uthread  *uth;
  887         struct proc *p;
  888         struct select_args *uap;
  889         int *retval;
  890         struct _select *sel;
  891         int unwind = 1;
  892         int prepost = 0;
  893         int somewakeup = 0;
  894         int doretry = 0;
  895         wait_result_t wait_result;
  896 
  897         p = current_proc();
  898         th_act = current_act();
  899         uap = (struct select_args *)get_bsduthreadarg(th_act);
  900         retval = (int *)get_bsduthreadrval(th_act);
  901         uth = get_bsdthread_info(th_act);
  902         sel = &uth->uu_state.ss_select;
  903 
  904         /* if it is first pass wait queue is not setup yet */
  905         if ((error != 0) && (sel_pass == SEL_FIRSTPASS))
  906                         unwind = 0;
  907         if (sel->count == 0)
  908                         unwind = 0;
  909 retry:
  910         if (error != 0) {
  911           goto done;
  912         }
  913 
  914         ncoll = nselcoll;
  915         p->p_flag |= P_SELECT;
  916         /* skip scans if the select is just for timeouts */
  917         if (sel->count) {
  918                 if (sel_pass == SEL_FIRSTPASS)
  919                         wait_queue_sub_clearrefs(uth->uu_wqsub);
  920 
  921                 error = selscan(p, sel, uap->nd, retval, sel_pass);
  922                 if (error || *retval) {
  923                         goto done;
  924                 }
  925                 if (prepost) {
  926                         /* if the select of log, then we canwakeup and discover some one
  927                         * else already read the data; go toselct again if time permits
  928                         */
  929                         prepost = 0;
  930                         doretry = 1;
  931                 }
  932                 if (somewakeup) {
  933                         somewakeup = 0;
  934                         doretry = 1;
  935                 }
  936         }
  937 
  938         if (uap->tv) {
  939                 uint64_t        now;
  940 
  941                 clock_get_uptime(&now);
  942                 if (now >= sel->abstime)
  943                         goto done;
  944         }
  945 
  946         if (doretry) {
  947                 /* cleanup obits and try again */
  948                 doretry = 0;
  949                 sel_pass = SEL_FIRSTPASS;
  950                 goto retry;
  951         }
  952 
  953         /*
  954          * To effect a poll, the timeout argument should be
  955          * non-nil, pointing to a zero-valued timeval structure.
  956          */
  957         if (uap->tv && sel->abstime == 0) {
  958                 goto done;
  959         }
  960 
  961         /* No spurious wakeups due to colls,no need to check for them */
  962          if ((sel_pass == SEL_SECONDPASS) || ((p->p_flag & P_SELECT) == 0)) {
  963                 sel_pass = SEL_FIRSTPASS;
  964                 goto retry;
  965         }
  966 
  967         p->p_flag &= ~P_SELECT;
  968 
  969         /* if the select is just for timeout skip check */
  970         if (sel->count &&(sel_pass == SEL_SECONDPASS))
  971                 panic("selprocess: 2nd pass assertwaiting");
  972 
  973         /* Wait Queue Subordinate has waitqueue as first element */
  974         wait_result = wait_queue_assert_wait((wait_queue_t)uth->uu_wqsub,
  975                                                                                  &selwait, THREAD_ABORTSAFE);
  976         if (wait_result != THREAD_AWAKENED) {
  977                 /* there are no preposted events */
  978         error = tsleep1(NULL, PSOCK | PCATCH,
  979                                                                         "select", sel->abstime, selcontinue);
  980         } else  {
  981                 prepost = 1;
  982                 error = 0;
  983         }
  984 
  985         sel_pass = SEL_SECONDPASS;
  986         if (error == 0) {
  987                 if (!prepost)
  988                         somewakeup =1;
  989                 goto retry;
  990         }
  991 done:
  992         if (unwind)
  993                 wait_subqueue_unlink_all(uth->uu_wqsub);
  994         p->p_flag &= ~P_SELECT;
  995         /* select is not restarted after signals... */
  996         if (error == ERESTART)
  997                 error = EINTR;
  998         if (error == EWOULDBLOCK)
  999                 error = 0;
 1000         nw = howmany(uap->nd, NFDBITS);
 1001         ni = nw * sizeof(fd_mask);
 1002 
 1003 #define putbits(name, x) \
 1004         do { \
 1005                 if (uap->name && (error2 = copyout((caddr_t)&sel->obits[(x) * nw], \
 1006                         (caddr_t)uap->name, ni))) \
 1007                         error = error2; \
 1008         } while (0)
 1009 
 1010         if (error == 0) {
 1011                 int error2;
 1012 
 1013                 putbits(in, 0);
 1014                 putbits(ou, 1);
 1015                 putbits(ex, 2);
 1016 #undef putbits
 1017         }
 1018         return(error);
 1019 }
 1020 
 1021 static int
 1022 selscan(p, sel, nfd, retval, sel_pass)
 1023         struct proc *p;
 1024         struct _select *sel;
 1025         int nfd;
 1026         register_t *retval;
 1027         int sel_pass;
 1028 {
 1029         register struct filedesc *fdp = p->p_fd;
 1030         register int msk, i, j, fd;
 1031         register u_int32_t bits;
 1032         struct file *fp;
 1033         int n = 0;
 1034         int nc = 0;
 1035         static int flag[3] = { FREAD, FWRITE, 0 };
 1036         u_int32_t *iptr, *optr;
 1037         u_int nw;
 1038         u_int32_t *ibits, *obits;
 1039         char * wql;
 1040         int nfunnel = 0;
 1041         int count, nfcount;
 1042         char * wql_ptr;
 1043         struct vnode *vp;
 1044 
 1045         /*
 1046          * Problems when reboot; due to MacOSX signal probs
 1047          * in Beaker1C ; verify that the p->p_fd is valid
 1048          */
 1049         if (fdp == NULL) {
 1050                 *retval=0;
 1051                 return(EIO);
 1052         }
 1053 
 1054         ibits = sel->ibits;
 1055         obits = sel->obits;
 1056         wql = sel->wql;
 1057 
 1058         count = sel->count;
 1059         nfcount = sel->nfcount;
 1060 
 1061         if (nfcount > count)
 1062                 panic("selcount count<nfcount");
 1063 
 1064         nw = howmany(nfd, NFDBITS);
 1065 
 1066         nc = 0;
 1067         if ( nfcount < count) {
 1068                 /* some or all in kernel funnel */
 1069                 for (msk = 0; msk < 3; msk++) {
 1070                         iptr = (u_int32_t *)&ibits[msk * nw];
 1071                         optr = (u_int32_t *)&obits[msk * nw];
 1072                         for (i = 0; i < nfd; i += NFDBITS) {
 1073                                 bits = iptr[i/NFDBITS];
 1074                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1075                                         bits &= ~(1 << j);
 1076                                         fp = fdp->fd_ofiles[fd];
 1077                                         if (fp == NULL ||
 1078                                                 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1079                                                 return(EBADF);
 1080                                         }
 1081                                         if (sel_pass == SEL_SECONDPASS)
 1082                                                 wql_ptr = (char *)0;
 1083                                         else
 1084                                                 wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK);
 1085                                         /*
 1086                                          * Merlot: need to remove the bogus f_data check
 1087                                          * from the following "if" statement.  It's there
 1088                                          * because of various problems stemming from 
 1089                                          * races due to the split-funnels and lack of real
 1090                                          * referencing on sockets...
 1091                                          */
 1092                                         if (fp->f_ops && (fp->f_type != DTYPE_SOCKET)
 1093                                                 && (fp->f_data != (caddr_t)-1) 
 1094                                                 && !(fp->f_type == DTYPE_VNODE 
 1095                                                         && (vp = (struct vnode *)fp->f_data) 
 1096                                                         && vp->v_type == VFIFO)
 1097                                                 && fo_select(fp, flag[msk], wql_ptr, p)) {
 1098                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
 1099                                                 n++;
 1100                                         }
 1101                                         nc++;
 1102                                 }
 1103                         }
 1104                 }
 1105         }
 1106 
 1107         if (nfcount) {
 1108                 /* socket file descriptors for scan */
 1109                 thread_funnel_switch(KERNEL_FUNNEL, NETWORK_FUNNEL);
 1110 
 1111                 nc = 0;
 1112                 for (msk = 0; msk < 3; msk++) {
 1113                         iptr = (u_int32_t *)&ibits[msk * nw];
 1114                         optr = (u_int32_t *)&obits[msk * nw];
 1115                         for (i = 0; i < nfd; i += NFDBITS) {
 1116                                 bits = iptr[i/NFDBITS];
 1117                                 while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1118                                         bits &= ~(1 << j);
 1119                                         fp = fdp->fd_ofiles[fd];
 1120                                         if (fp == NULL ||
 1121                                                 (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1122                                                 thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL);
 1123                                                 return(EBADF);
 1124                                         }
 1125                                         if (sel_pass == SEL_SECONDPASS)
 1126                                                 wql_ptr = (char *)0;
 1127                                         else
 1128                                                 wql_ptr = (wql+ nc * SIZEOF_WAITQUEUE_LINK);
 1129                                         if (fp->f_ops 
 1130                                                 && (fp->f_type == DTYPE_SOCKET
 1131                                                         || (fp->f_type == DTYPE_VNODE 
 1132                                                         && (vp = (struct vnode *)fp->f_data)  
 1133                                                         && vp != (struct vnode *)-1 
 1134                                                         && vp->v_type == VFIFO))
 1135                                                 && fo_select(fp, flag[msk], wql_ptr, p)) {
 1136                                                 optr[fd/NFDBITS] |= (1 << (fd % NFDBITS));
 1137                                                 n++;
 1138                                         }
 1139                                         nc++;
 1140                                 }
 1141                         }
 1142                 }
 1143                 thread_funnel_switch(NETWORK_FUNNEL, KERNEL_FUNNEL);
 1144         }
 1145 
 1146         *retval = n;
 1147         return (0);
 1148 }
 1149 
 1150 /*ARGSUSED*/
 1151 int
 1152 seltrue(dev, flag, p)
 1153         dev_t dev;
 1154         int flag;
 1155         struct proc *p;
 1156 {
 1157 
 1158         return (1);
 1159 }
 1160 
 1161 static int
 1162 selcount(p, ibits, obits, nfd, count, nfcount)
 1163         struct proc *p;
 1164         u_int32_t *ibits, *obits;
 1165         int nfd;
 1166         int *count;
 1167         int *nfcount;
 1168 {
 1169         register struct filedesc *fdp = p->p_fd;
 1170         register int msk, i, j, fd;
 1171         register u_int32_t bits;
 1172         struct file *fp;
 1173         int n = 0;
 1174         int nc = 0;
 1175         int nfc = 0;
 1176         static int flag[3] = { FREAD, FWRITE, 0 };
 1177         u_int32_t *iptr, *fptr, *fbits;
 1178         u_int nw;
 1179         struct vnode *vp;
 1180 
 1181         /*
 1182          * Problems when reboot; due to MacOSX signal probs
 1183          * in Beaker1C ; verify that the p->p_fd is valid
 1184          */
 1185         if (fdp == NULL) {
 1186                 *count=0;
 1187                 *nfcount=0;
 1188                 return(EIO);
 1189         }
 1190 
 1191         nw = howmany(nfd, NFDBITS);
 1192 
 1193 
 1194         for (msk = 0; msk < 3; msk++) {
 1195                 iptr = (u_int32_t *)&ibits[msk * nw];
 1196                 for (i = 0; i < nfd; i += NFDBITS) {
 1197                         bits = iptr[i/NFDBITS];
 1198                         while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
 1199                                 bits &= ~(1 << j);
 1200                                 fp = fdp->fd_ofiles[fd];
 1201                                 if (fp == NULL ||
 1202                                         (fdp->fd_ofileflags[fd] & UF_RESERVED)) {
 1203                                                 *count=0;
 1204                                                 *nfcount=0;
 1205                                                 return(EBADF);
 1206                                 }
 1207                                 if (fp->f_type == DTYPE_SOCKET || 
 1208                                         (fp->f_type == DTYPE_VNODE 
 1209                                                 && (vp = (struct vnode *)fp->f_data)  
 1210                                                 && vp->v_type == VFIFO))
 1211                                         nfc++;
 1212                                 n++;
 1213                         }
 1214                 }
 1215         }
 1216         *count = n;
 1217         *nfcount = nfc;
 1218         return (0);
 1219 }
 1220 
 1221 /*
 1222  * Record a select request.
 1223  */
 1224 void
 1225 selrecord(selector, sip, p_wql)
 1226         struct proc *selector;
 1227         struct selinfo *sip;
 1228         void * p_wql;
 1229 {
 1230         thread_act_t    cur_act = current_act();
 1231         struct uthread * ut = get_bsdthread_info(cur_act);
 1232 
 1233         /* need to look at collisions */
 1234 
 1235         if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) {
 1236                 return;
 1237         }
 1238 
 1239         /*do not record if this is second pass of select */
 1240         if((p_wql == (void *)0)) {
 1241                 return;
 1242         }
 1243 
 1244         if ((sip->si_flags & SI_INITED) == 0) {
 1245                 wait_queue_init(&sip->si_wait_queue, SYNC_POLICY_FIFO);
 1246                 sip->si_flags |= SI_INITED;
 1247                 sip->si_flags &= ~SI_CLEAR;
 1248         }
 1249 
 1250         if (sip->si_flags & SI_RECORDED) {
 1251                 sip->si_flags |= SI_COLL;
 1252         } else
 1253                 sip->si_flags &= ~SI_COLL;
 1254 
 1255         sip->si_flags |= SI_RECORDED;
 1256         if (!wait_queue_member(&sip->si_wait_queue, ut->uu_wqsub))
 1257                 wait_queue_link_noalloc(&sip->si_wait_queue, ut->uu_wqsub, (wait_queue_link_t)p_wql);
 1258 
 1259         return;
 1260 }
 1261 
 1262 void
 1263 selwakeup(sip)
 1264         register struct selinfo *sip;
 1265 {
 1266         
 1267         if ((sip->si_flags & SI_INITED) == 0) {
 1268                 return;
 1269         }
 1270 
 1271         if (sip->si_flags & SI_COLL) {
 1272                 nselcoll++;
 1273                 sip->si_flags &= ~SI_COLL;
 1274 #if 0
 1275                 /* will not  support */
 1276                 //wakeup((caddr_t)&selwait);
 1277 #endif
 1278         }
 1279 
 1280         if (sip->si_flags & SI_RECORDED) {
 1281                 wait_queue_wakeup_all(&sip->si_wait_queue, &selwait, THREAD_AWAKENED);
 1282                 sip->si_flags &= ~SI_RECORDED;
 1283         }
 1284 
 1285 }
 1286 
 1287 void 
 1288 selthreadclear(sip)
 1289         register struct selinfo *sip;
 1290 {
 1291 
 1292         if ((sip->si_flags & SI_INITED) == 0) {
 1293                 return;
 1294         }
 1295         if (sip->si_flags & SI_RECORDED) {
 1296                         selwakeup(sip);
 1297                         sip->si_flags &= ~(SI_RECORDED | SI_COLL);
 1298         }
 1299         sip->si_flags |= SI_CLEAR;
 1300         wait_queue_unlinkall_nofree(&sip->si_wait_queue);
 1301 }
 1302 
 1303 
 1304 extern struct eventqelt *evprocdeque(struct proc *p, struct eventqelt *eqp);
 1305 
 1306 /*
 1307  * called upon socket close. deque and free all events for
 1308  * the socket
 1309  */
 1310 void
 1311 evsofree(struct socket *sp)
 1312 {
 1313   struct eventqelt *eqp, *next;
 1314 
 1315   if (sp == NULL) return;
 1316 
 1317   for (eqp = sp->so_evlist.tqh_first; eqp != NULL; eqp = next) {
 1318     next = eqp->ee_slist.tqe_next;
 1319     evprocdeque(eqp->ee_proc, eqp); // remove from proc q if there
 1320     TAILQ_REMOVE(&sp->so_evlist, eqp, ee_slist); // remove from socket q
 1321     FREE(eqp, M_TEMP);
 1322   }
 1323 }
 1324 
 1325 
 1326 #define DBG_EVENT 0x10
 1327 
 1328 #define DBG_POST 0x10
 1329 #define DBG_WATCH 0x11
 1330 #define DBG_WAIT 0x12
 1331 #define DBG_MOD 0x13
 1332 #define DBG_EWAKEUP 0x14
 1333 #define DBG_ENQUEUE 0x15
 1334 #define DBG_DEQUEUE 0x16
 1335 
 1336 #define DBG_MISC_POST MISCDBG_CODE(DBG_EVENT,DBG_POST)
 1337 #define DBG_MISC_WATCH MISCDBG_CODE(DBG_EVENT,DBG_WATCH)
 1338 #define DBG_MISC_WAIT MISCDBG_CODE(DBG_EVENT,DBG_WAIT)
 1339 #define DBG_MISC_MOD MISCDBG_CODE(DBG_EVENT,DBG_MOD)
 1340 #define DBG_MISC_EWAKEUP MISCDBG_CODE(DBG_EVENT,DBG_EWAKEUP)
 1341 #define DBG_MISC_ENQUEUE MISCDBG_CODE(DBG_EVENT,DBG_ENQUEUE)
 1342 #define DBG_MISC_DEQUEUE MISCDBG_CODE(DBG_EVENT,DBG_DEQUEUE)
 1343 
 1344 
 1345 /*
 1346  * enque this event if it's not already queued. wakeup
 1347    the proc if we do queue this event to it.
 1348  */
 1349 void
 1350 evprocenque(struct eventqelt *eqp)
 1351 {
 1352   struct proc *p;
 1353 
 1354   assert(eqp);
 1355   KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_START, eqp, eqp->ee_flags, eqp->ee_eventmask,0,0);
 1356   if (eqp->ee_flags & EV_QUEUED) {
 1357     KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 1358     return;
 1359   }
 1360   eqp->ee_flags |= EV_QUEUED;
 1361   eqp->ee_eventmask = 0;  // disarm
 1362   p = eqp->ee_proc;
 1363   TAILQ_INSERT_TAIL(&p->p_evlist, eqp, ee_plist);
 1364   KERNEL_DEBUG(DBG_MISC_EWAKEUP,0,0,0,eqp,0);
 1365   wakeup(&p->p_evlist);
 1366   KERNEL_DEBUG(DBG_MISC_ENQUEUE|DBG_FUNC_END, 0,0,0,0,0);
 1367 }
 1368 
 1369 /*
 1370  * given either a sockbuf or a socket run down the
 1371  * event list and queue ready events found
 1372  */
 1373 void
 1374 postevent(struct socket *sp, struct sockbuf *sb, int event)
 1375 {
 1376   int mask;
 1377   struct eventqelt *evq;
 1378   register struct tcpcb *tp;
 1379 
 1380   if (sb) sp = sb->sb_so;
 1381   if (!sp || sp->so_evlist.tqh_first == NULL) return;
 1382 
 1383   KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_START, event,0,0,0,0);
 1384 
 1385   for (evq = sp->so_evlist.tqh_first;
 1386        evq != NULL; evq = evq->ee_slist.tqe_next) {
 1387 
 1388     mask = 0;
 1389 
 1390     /* ready for reading:
 1391        - byte cnt >= receive low water mark
 1392        - read-half of conn closed
 1393        - conn pending for listening sock
 1394        - socket error pending
 1395 
 1396        ready for writing
 1397        - byte cnt avail >= send low water mark
 1398        - write half of conn closed
 1399        - socket error pending
 1400        - non-blocking conn completed successfully
 1401 
 1402        exception pending
 1403        - out of band data
 1404        - sock at out of band mark
 1405 
 1406     */
 1407     switch (event & EV_DMASK) {
 1408 
 1409     case EV_RWBYTES:
 1410     case EV_OOB:
 1411     case EV_RWBYTES|EV_OOB:
 1412       if (event & EV_OOB) {
 1413       if ((evq->ee_eventmask & EV_EX)) {
 1414         if (sp->so_oobmark || ((sp->so_state & SS_RCVATMARK))) {
 1415           mask |= EV_EX|EV_OOB;
 1416         }
 1417       }
 1418       }
 1419       if (event & EV_RWBYTES) {
 1420       if ((evq->ee_eventmask & EV_RE) && soreadable(sp)) {
 1421         if ((sp->so_type == SOCK_STREAM) && (sp->so_error == ECONNREFUSED) ||
 1422             (sp->so_error == ECONNRESET)) {
 1423           if ((sp->so_pcb == 0) ||
 1424               !(tp = sototcpcb(sp)) ||
 1425               (tp->t_state == TCPS_CLOSED)) {
 1426             mask |= EV_RE|EV_RESET;
 1427             break;
 1428           }
 1429         }
 1430         if (sp->so_state & SS_CANTRCVMORE) {
 1431           mask |= EV_RE|EV_FIN;
 1432           evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
 1433           break;
 1434         }
 1435         mask |= EV_RE;
 1436         evq->ee_req.er_rcnt = sp->so_rcv.sb_cc;
 1437       }
 1438 
 1439       if ((evq->ee_eventmask & EV_WR) && sowriteable(sp)) {
 1440         if ((sp->so_type == SOCK_STREAM) &&(sp->so_error == ECONNREFUSED) ||
 1441             (sp->so_error == ECONNRESET)) {
 1442           if ((sp->so_pcb == 0) ||
 1443               !(tp = sototcpcb(sp)) ||
 1444               (tp->t_state == TCPS_CLOSED)) {
 1445           mask |= EV_WR|EV_RESET;
 1446           break;
 1447           }
 1448         }
 1449         mask |= EV_WR;
 1450         evq->ee_req.er_wcnt = sbspace(&sp->so_snd);
 1451       }
 1452       }
 1453     break;
 1454 
 1455     case EV_RCONN:
 1456       if ((evq->ee_eventmask & EV_RE)) {
 1457         evq->ee_req.er_rcnt = sp->so_qlen + 1;  // incl this one
 1458         mask |= EV_RE|EV_RCONN;
 1459       }
 1460       break;
 1461 
 1462     case EV_WCONN:
 1463       if ((evq->ee_eventmask & EV_WR)) {
 1464         mask |= EV_WR|EV_WCONN;
 1465       }
 1466       break;
 1467 
 1468     case EV_RCLOSED:
 1469       if ((evq->ee_eventmask & EV_RE)) {
 1470         mask |= EV_RE|EV_RCLOSED;
 1471       }
 1472       break;
 1473 
 1474     case EV_WCLOSED:
 1475       if ((evq->ee_eventmask & EV_WR)) {
 1476         mask |= EV_WR|EV_WCLOSED;
 1477       }
 1478       break;
 1479 
 1480     case EV_FIN:
 1481       if (evq->ee_eventmask & EV_RE) {
 1482         mask |= EV_RE|EV_FIN;
 1483       }
 1484       break;
 1485 
 1486     case EV_RESET:
 1487     case EV_TIMEOUT:
 1488       if (evq->ee_eventmask & EV_RE) {
 1489         mask |= EV_RE | event;
 1490       } 
 1491       if (evq->ee_eventmask & EV_WR) {
 1492         mask |= EV_WR | event;
 1493       }
 1494       break;
 1495 
 1496     default:
 1497       return;
 1498     } /* switch */
 1499 
 1500     if (mask) {
 1501       evq->ee_req.er_eventbits |= mask;
 1502       KERNEL_DEBUG(DBG_MISC_POST, evq, evq->ee_req.er_eventbits, mask,0,0);
 1503       evprocenque(evq);
 1504     }
 1505   }
 1506   KERNEL_DEBUG(DBG_MISC_POST|DBG_FUNC_END, 0,0,0,0,0);
 1507 }
 1508 
 1509 /*
 1510  * remove and return the first event (eqp=NULL) or a specific
 1511  * event, or return NULL if no events found
 1512  */
 1513 struct eventqelt *
 1514 evprocdeque(struct proc *p, struct eventqelt *eqp)
 1515 {
 1516   
 1517   KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_START,p,eqp,0,0,0);
 1518 
 1519   if (eqp && ((eqp->ee_flags & EV_QUEUED) == NULL)) {
 1520     KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,0,0,0,0,0);
 1521     return(NULL);
 1522   }
 1523   if (p->p_evlist.tqh_first == NULL) {
 1524     KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,0,0,0,0,0);
 1525     return(NULL);
 1526   }
 1527   if (eqp == NULL) {  // remove first
 1528     eqp = p->p_evlist.tqh_first;
 1529   }
 1530   TAILQ_REMOVE(&p->p_evlist, eqp, ee_plist);
 1531   eqp->ee_flags &= ~EV_QUEUED;
 1532   KERNEL_DEBUG(DBG_MISC_DEQUEUE|DBG_FUNC_END,eqp,0,0,0,0);
 1533   return(eqp);
 1534 }
 1535 
 1536 struct evwatch_args {
 1537   struct eventreq  *u_req;
 1538   int               u_eventmask;
 1539 };
 1540 
 1541 
 1542 /*
 1543  * watchevent system call. user passes us an event to watch
 1544  * for. we malloc an event object, initialize it, and queue
 1545  * it to the open socket. when the event occurs, postevent()
 1546  * will enque it back to our proc where we can retrieve it
 1547  * via waitevent().
 1548  *
 1549  * should this prevent duplicate events on same socket?
 1550  */
 1551 int
 1552 watchevent(p, uap, retval)
 1553      struct proc *p;
 1554      struct evwatch_args *uap;
 1555      register_t *retval;
 1556 {
 1557   struct eventqelt *eqp = (struct eventqelt *)0;
 1558   struct eventqelt *np;
 1559   struct eventreq *erp;
 1560   struct file *fp;
 1561   struct socket *sp;
 1562   int error;
 1563 
 1564   KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_START, 0,0,0,0,0);
 1565 
 1566   // get a qelt and fill with users req
 1567   MALLOC(eqp, struct eventqelt *, sizeof(struct eventqelt), M_TEMP, M_WAITOK);
 1568   if (!eqp) panic("can't MALLOC eqp");
 1569   erp = &eqp->ee_req;
 1570   // get users request pkt
 1571   if (error = copyin((caddr_t)uap->u_req, (caddr_t)erp,
 1572                      sizeof(struct eventreq))) {
 1573     FREE(eqp, M_TEMP);
 1574     KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 1575     return(error);
 1576   }
 1577   KERNEL_DEBUG(DBG_MISC_WATCH, erp->er_handle,uap->u_eventmask,eqp,0,0);
 1578   // validate, freeing qelt if errors
 1579   error = 0;
 1580   if (erp->er_type != EV_FD) {
 1581     error = EINVAL;
 1582   } else  if (erp->er_handle < 0) {
 1583     error = EBADF;
 1584   } else  if (erp->er_handle > p->p_fd->fd_nfiles) {
 1585     error = EBADF;
 1586   } else if ((fp = *fdfile(p, erp->er_handle)) == NULL) {
 1587     error = EBADF;
 1588   } else if (fp->f_type != DTYPE_SOCKET) {
 1589     error = EINVAL;
 1590   }
 1591   if (error) {
 1592     FREE(eqp,M_TEMP);
 1593     KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, error,0,0,0,0);
 1594     return(error);
 1595   }
 1596 
 1597   erp->er_rcnt = erp->er_wcnt = erp->er_eventbits = 0;
 1598   eqp->ee_proc = p;
 1599   eqp->ee_eventmask = uap->u_eventmask & EV_MASK;
 1600   eqp->ee_flags = 0;
 1601 
 1602   sp = (struct socket *)fp->f_data;
 1603   assert(sp != NULL);
 1604 
 1605   // only allow one watch per file per proc
 1606   for (np = sp->so_evlist.tqh_first; np != NULL; np = np->ee_slist.tqe_next) {
 1607     if (np->ee_proc == p) {
 1608       FREE(eqp,M_TEMP);
 1609       KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, EINVAL,0,0,0,0);
 1610       return(EINVAL);
 1611     }
 1612   }
 1613 
 1614   TAILQ_INSERT_TAIL(&sp->so_evlist, eqp, ee_slist);
 1615   postevent(sp, 0, EV_RWBYTES); // catch existing events
 1616   KERNEL_DEBUG(DBG_MISC_WATCH|DBG_FUNC_END, 0,0,0,0,0);
 1617   return(0);
 1618 }
 1619 
 1620 struct evwait_args {
 1621   struct eventreq *u_req;
 1622   struct timeval *tv;
 1623 };
 1624 
 1625 /*
 1626  * waitevent system call.
 1627  * grabs the next waiting event for this proc and returns
 1628  * it. if no events, user can request to sleep with timeout
 1629  * or poll mode (tv=NULL);
 1630  */
 1631 int
 1632 waitevent(p, uap, retval)
 1633         struct proc *p;
 1634         struct evwait_args *uap;
 1635         register_t *retval;
 1636 {
 1637         int error = 0;
 1638         struct eventqelt *eqp;
 1639         uint64_t abstime, interval;
 1640 
 1641         if (uap->tv) {
 1642                 struct timeval atv;
 1643 
 1644                 error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv));
 1645                 if (error)
 1646                         return(error);
 1647                 if (itimerfix(&atv)) {
 1648                         error = EINVAL;
 1649                         return(error);
 1650                 }
 1651 
 1652                 interval = tvtoabstime(&atv);
 1653         }
 1654         else
 1655                 abstime = interval = 0;
 1656 
 1657         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_START, 0,0,0,0,0);
 1658 
 1659 retry:
 1660         if ((eqp = evprocdeque(p,NULL)) != NULL) {
 1661                 error = copyout((caddr_t)&eqp->ee_req,
 1662                                                                 (caddr_t)uap->u_req, sizeof(struct eventreq));
 1663                 KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,
 1664                                                 eqp->ee_req.er_handle,eqp->ee_req.er_eventbits,eqp,0);
 1665 
 1666                 return (error);
 1667         }
 1668         else {
 1669                 if (uap->tv && interval == 0) {
 1670                         *retval = 1;  // poll failed
 1671                         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, error,0,0,0,0);
 1672 
 1673                         return (error);
 1674                 }
 1675 
 1676                 if (interval != 0)
 1677                         clock_absolutetime_interval_to_deadline(interval, &abstime);
 1678 
 1679                 KERNEL_DEBUG(DBG_MISC_WAIT, 1,&p->p_evlist,0,0,0);
 1680                 error = tsleep1(&p->p_evlist, PSOCK | PCATCH,
 1681                                                                         "waitevent", abstime, (int (*)(int))0);
 1682                 KERNEL_DEBUG(DBG_MISC_WAIT, 2,&p->p_evlist,0,0,0);
 1683                 if (error == 0)
 1684                         goto retry;
 1685                 if (error == ERESTART)
 1686                         error = EINTR;
 1687                 if (error == EWOULDBLOCK) {
 1688                         *retval = 1;
 1689                         error = 0;
 1690                 }
 1691         }
 1692 
 1693         KERNEL_DEBUG(DBG_MISC_WAIT|DBG_FUNC_END, 0,0,0,0,0);
 1694 
 1695         return (error);
 1696 }
 1697 
 1698 struct modwatch_args {
 1699   struct eventreq *u_req;
 1700   int               u_eventmask;
 1701 };
 1702 
 1703 /*
 1704  * modwatch system call. user passes in event to modify.
 1705  * if we find it we reset the event bits and que/deque event
 1706  * it needed.
 1707  */
 1708 int
 1709 modwatch(p, uap, retval)
 1710      struct proc *p;
 1711      struct modwatch_args *uap;
 1712      register_t *retval;
 1713 {
 1714   struct eventreq er;
 1715   struct eventreq *erp = &er;
 1716   struct eventqelt *evq;
 1717   int error;
 1718   struct file *fp;
 1719   struct socket *sp;
 1720   int flag;
 1721 
 1722   KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_START, 0,0,0,0,0);
 1723 
 1724   // get users request pkt
 1725   if (error = copyin((caddr_t)uap->u_req, (caddr_t)erp,
 1726                      sizeof(struct eventreq))) return(error);
 1727 
 1728   if (erp->er_type != EV_FD) return(EINVAL);
 1729   if (erp->er_handle < 0) return(EBADF);
 1730   if (erp->er_handle > p->p_fd->fd_nfiles) return(EBADF);
 1731   if ((fp = *fdfile(p, erp->er_handle)) == NULL)
 1732     return(EBADF);
 1733   if (fp->f_type != DTYPE_SOCKET) return(EINVAL); // for now must be sock
 1734   sp = (struct socket *)fp->f_data;
 1735 
 1736   /* soo_close sets f_data to 0 before switching funnel */
 1737   if (sp == (struct socket *)0) 
 1738     return(EBADF);
 1739 
 1740   // locate event if possible
 1741   for (evq = sp->so_evlist.tqh_first;
 1742        evq != NULL; evq = evq->ee_slist.tqe_next) {
 1743     if (evq->ee_proc == p) break;
 1744   }
 1745 
 1746   if (evq == NULL) {
 1747         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, EINVAL,0,0,0,0);
 1748     return(EINVAL);
 1749   }
 1750   KERNEL_DEBUG(DBG_MISC_MOD, erp->er_handle,uap->u_eventmask,evq,0,0);
 1751 
 1752     if (uap->u_eventmask == EV_RM) {
 1753     evprocdeque(p, evq);
 1754     TAILQ_REMOVE(&sp->so_evlist, evq, ee_slist);
 1755     FREE(evq, M_TEMP);
 1756         KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, 0,0,0,0,0);
 1757     return(0);
 1758     }
 1759 
 1760   switch (uap->u_eventmask & EV_MASK) {
 1761  
 1762   case 0:
 1763     flag = 0;
 1764     break;
 1765 
 1766   case EV_RE:
 1767   case EV_WR:
 1768   case EV_RE|EV_WR:
 1769     flag = EV_RWBYTES;
 1770     break;
 1771 
 1772   case EV_EX:
 1773     flag = EV_OOB;
 1774     break;
 1775 
 1776   case EV_EX|EV_RE:
 1777   case EV_EX|EV_WR:
 1778   case EV_EX|EV_RE|EV_WR:
 1779     flag = EV_OOB|EV_RWBYTES;
 1780     break;
 1781 
 1782   default:
 1783     return(EINVAL);
 1784   }
 1785 
 1786    evq->ee_eventmask = uap->u_eventmask & EV_MASK;
 1787    evprocdeque(p, evq);
 1788    evq->ee_req.er_eventbits = 0;
 1789    postevent(sp, 0, flag);
 1790    KERNEL_DEBUG(DBG_MISC_MOD|DBG_FUNC_END, evq->ee_req.er_handle,evq->ee_eventmask,sp,flag,0);
 1791    return(0);
 1792 }

Cache object: 0bea4681e3b644751f00561240565861


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.