The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD: releng/5.3/sys/kern/sys_generic.c 136588 2004-10-16 08:43:07Z cvs2svn $");
   39 
   40 #include "opt_ktrace.h"
   41 
   42 #include <sys/param.h>
   43 #include <sys/systm.h>
   44 #include <sys/sysproto.h>
   45 #include <sys/filedesc.h>
   46 #include <sys/filio.h>
   47 #include <sys/fcntl.h>
   48 #include <sys/file.h>
   49 #include <sys/proc.h>
   50 #include <sys/signalvar.h>
   51 #include <sys/socketvar.h>
   52 #include <sys/uio.h>
   53 #include <sys/kernel.h>
   54 #include <sys/limits.h>
   55 #include <sys/malloc.h>
   56 #include <sys/poll.h>
   57 #include <sys/resourcevar.h>
   58 #include <sys/selinfo.h>
   59 #include <sys/sleepqueue.h>
   60 #include <sys/syscallsubr.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/sysent.h>
   63 #include <sys/vnode.h>
   64 #include <sys/bio.h>
   65 #include <sys/buf.h>
   66 #include <sys/condvar.h>
   67 #ifdef KTRACE
   68 #include <sys/ktrace.h>
   69 #endif
   70 #include <vm/vm.h>
   71 #include <vm/vm_page.h>
   72 
   73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   76 
   77 static int      pollscan(struct thread *, struct pollfd *, u_int);
   78 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   79 static int      dofileread(struct thread *, struct file *, int, void *,
   80                     size_t, off_t, int);
   81 static int      dofilewrite(struct thread *, struct file *, int,
   82                     const void *, size_t, off_t, int);
   83 static void     doselwakeup(struct selinfo *, int);
   84 
   85 /*
   86  * Read system call.
   87  */
   88 #ifndef _SYS_SYSPROTO_H_
   89 struct read_args {
   90         int     fd;
   91         void    *buf;
   92         size_t  nbyte;
   93 };
   94 #endif
   95 /*
   96  * MPSAFE
   97  */
   98 int
   99 read(td, uap)
  100         struct thread *td;
  101         struct read_args *uap;
  102 {
  103         struct file *fp;
  104         int error;
  105 
  106         if ((error = fget_read(td, uap->fd, &fp)) == 0) {
  107                 error = dofileread(td, fp, uap->fd, uap->buf,
  108                             uap->nbyte, (off_t)-1, 0);
  109                 fdrop(fp, td);
  110         }
  111         return(error);
  112 }
  113 
  114 /*
  115  * Pread system call
  116  */
  117 #ifndef _SYS_SYSPROTO_H_
  118 struct pread_args {
  119         int     fd;
  120         void    *buf;
  121         size_t  nbyte;
  122         int     pad;
  123         off_t   offset;
  124 };
  125 #endif
  126 /*
  127  * MPSAFE
  128  */
  129 int
  130 pread(td, uap)
  131         struct thread *td;
  132         struct pread_args *uap;
  133 {
  134         struct file *fp;
  135         int error;
  136 
  137         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  138                 return (error);
  139         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  140                 error = ESPIPE;
  141         else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
  142                 error = EINVAL;
  143         else {
  144                 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 
  145                             uap->offset, FOF_OFFSET);
  146         }
  147         fdrop(fp, td);
  148         return(error);
  149 }
  150 
  151 /*
  152  * Code common for read and pread
  153  */
  154 static int
  155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
  156         struct thread *td;
  157         struct file *fp;
  158         int fd, flags;
  159         void *buf;
  160         size_t nbyte;
  161         off_t offset;
  162 {
  163         struct uio auio;
  164         struct iovec aiov;
  165         long cnt, error = 0;
  166 #ifdef KTRACE
  167         struct uio *ktruio = NULL;
  168 #endif
  169 
  170         aiov.iov_base = buf;
  171         aiov.iov_len = nbyte;
  172         auio.uio_iov = &aiov;
  173         auio.uio_iovcnt = 1;
  174         auio.uio_offset = offset;
  175         if (nbyte > INT_MAX)
  176                 return (EINVAL);
  177         auio.uio_resid = nbyte;
  178         auio.uio_rw = UIO_READ;
  179         auio.uio_segflg = UIO_USERSPACE;
  180         auio.uio_td = td;
  181 #ifdef KTRACE
  182         if (KTRPOINT(td, KTR_GENIO))
  183                 ktruio = cloneuio(&auio);
  184 #endif
  185         cnt = nbyte;
  186 
  187         if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
  188                 if (auio.uio_resid != cnt && (error == ERESTART ||
  189                     error == EINTR || error == EWOULDBLOCK))
  190                         error = 0;
  191         }
  192         cnt -= auio.uio_resid;
  193 #ifdef KTRACE
  194         if (ktruio != NULL) {
  195                 ktruio->uio_resid = cnt;
  196                 ktrgenio(fd, UIO_READ, ktruio, error);
  197         }
  198 #endif
  199         td->td_retval[0] = cnt;
  200         return (error);
  201 }
  202 
  203 /*
  204  * Scatter read system call.
  205  */
  206 #ifndef _SYS_SYSPROTO_H_
  207 struct readv_args {
  208         int     fd;
  209         struct  iovec *iovp;
  210         u_int   iovcnt;
  211 };
  212 #endif
  213 /*
  214  * MPSAFE
  215  */
  216 int
  217 readv(struct thread *td, struct readv_args *uap)
  218 {
  219         struct file *fp;
  220         struct uio *auio = NULL;
  221         long cnt;
  222         int error;
  223 #ifdef KTRACE
  224         struct uio *ktruio = NULL;
  225 #endif
  226 
  227         error = fget_read(td, uap->fd, &fp);
  228         if (error)
  229                 return (error);
  230         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  231         if (error) {
  232                 fdrop(fp, td);
  233                 return (error);
  234         }
  235         auio->uio_rw = UIO_READ;
  236         auio->uio_td = td;
  237 #ifdef KTRACE
  238         if (KTRPOINT(td, KTR_GENIO)) 
  239                 ktruio = cloneuio(auio);
  240 #endif
  241         cnt = auio->uio_resid;
  242         if ((error = fo_read(fp, auio, td->td_ucred, 0, td))) {
  243                 if (auio->uio_resid != cnt && (error == ERESTART ||
  244                     error == EINTR || error == EWOULDBLOCK))
  245                         error = 0;
  246         }
  247         cnt -= auio->uio_resid;
  248 #ifdef KTRACE
  249         if (ktruio != NULL) {
  250                 ktruio->uio_resid = cnt;
  251                 ktrgenio(uap->fd, UIO_READ, ktruio, error);
  252         }
  253 #endif
  254         td->td_retval[0] = cnt;
  255         free(auio, M_IOV);
  256         fdrop(fp, td);
  257         return (error);
  258 }
  259 
  260 /*
  261  * Write system call
  262  */
  263 #ifndef _SYS_SYSPROTO_H_
  264 struct write_args {
  265         int     fd;
  266         const void *buf;
  267         size_t  nbyte;
  268 };
  269 #endif
  270 /*
  271  * MPSAFE
  272  */
  273 int
  274 write(td, uap)
  275         struct thread *td;
  276         struct write_args *uap;
  277 {
  278         struct file *fp;
  279         int error;
  280 
  281         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  282                 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
  283                             (off_t)-1, 0);
  284                 fdrop(fp, td);
  285         } else {
  286                 error = EBADF;  /* XXX this can't be right */
  287         }
  288         return(error);
  289 }
  290 
  291 /*
  292  * Pwrite system call
  293  */
  294 #ifndef _SYS_SYSPROTO_H_
  295 struct pwrite_args {
  296         int     fd;
  297         const void *buf;
  298         size_t  nbyte;
  299         int     pad;
  300         off_t   offset;
  301 };
  302 #endif
  303 /*
  304  * MPSAFE
  305  */
  306 int
  307 pwrite(td, uap)
  308         struct thread *td;
  309         struct pwrite_args *uap;
  310 {
  311         struct file *fp;
  312         int error;
  313 
  314         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  315                 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  316                         error = ESPIPE;
  317                 else if (uap->offset < 0 && fp->f_vnode->v_type != VCHR)
  318                         error = EINVAL;
  319                 else {
  320                         error = dofilewrite(td, fp, uap->fd, uap->buf,
  321                                     uap->nbyte, uap->offset, FOF_OFFSET);
  322                 }
  323                 fdrop(fp, td);
  324         } else {
  325                 error = EBADF;  /* this can't be right */
  326         }
  327         return(error);
  328 }
  329 
  330 static int
  331 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
  332         struct thread *td;
  333         struct file *fp;
  334         int fd, flags;
  335         const void *buf;
  336         size_t nbyte;
  337         off_t offset;
  338 {
  339         struct uio auio;
  340         struct iovec aiov;
  341         long cnt, error = 0;
  342 #ifdef KTRACE
  343         struct uio *ktruio = NULL;
  344 #endif
  345 
  346         aiov.iov_base = (void *)(uintptr_t)buf;
  347         aiov.iov_len = nbyte;
  348         auio.uio_iov = &aiov;
  349         auio.uio_iovcnt = 1;
  350         auio.uio_offset = offset;
  351         if (nbyte > INT_MAX)
  352                 return (EINVAL);
  353         auio.uio_resid = nbyte;
  354         auio.uio_rw = UIO_WRITE;
  355         auio.uio_segflg = UIO_USERSPACE;
  356         auio.uio_td = td;
  357 #ifdef KTRACE
  358         if (KTRPOINT(td, KTR_GENIO))
  359                 ktruio = cloneuio(&auio);
  360 #endif
  361         cnt = nbyte;
  362         if (fp->f_type == DTYPE_VNODE)
  363                 bwillwrite();
  364         if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
  365                 if (auio.uio_resid != cnt && (error == ERESTART ||
  366                     error == EINTR || error == EWOULDBLOCK))
  367                         error = 0;
  368                 /* Socket layer is responsible for issuing SIGPIPE. */
  369                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
  370                         PROC_LOCK(td->td_proc);
  371                         psignal(td->td_proc, SIGPIPE);
  372                         PROC_UNLOCK(td->td_proc);
  373                 }
  374         }
  375         cnt -= auio.uio_resid;
  376 #ifdef KTRACE
  377         if (ktruio != NULL) {
  378                 ktruio->uio_resid = cnt;
  379                 ktrgenio(fd, UIO_WRITE, ktruio, error);
  380         }
  381 #endif
  382         td->td_retval[0] = cnt;
  383         return (error);
  384 }
  385 
  386 /*
  387  * Gather write system call
  388  */
  389 #ifndef _SYS_SYSPROTO_H_
  390 struct writev_args {
  391         int     fd;
  392         struct  iovec *iovp;
  393         u_int   iovcnt;
  394 };
  395 #endif
  396 /*
  397  * MPSAFE
  398  */
  399 int
  400 writev(struct thread *td, struct writev_args *uap)
  401 {
  402         struct file *fp;
  403         struct uio *auio = NULL;
  404         long cnt;
  405         int error;
  406 #ifdef KTRACE
  407         struct uio *ktruio = NULL;
  408 #endif
  409 
  410         error = fget_write(td, uap->fd, &fp);
  411         if (error)
  412                 return (EBADF);
  413         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  414         if (error) {
  415                 fdrop(fp, td);
  416                 return (error);
  417         }
  418         auio->uio_rw = UIO_WRITE;
  419         auio->uio_td = td;
  420 #ifdef KTRACE
  421         if (KTRPOINT(td, KTR_GENIO))
  422                 ktruio = cloneuio(auio);
  423 #endif
  424         cnt = auio->uio_resid;
  425         if (fp->f_type == DTYPE_VNODE)
  426                 bwillwrite();
  427         if ((error = fo_write(fp, auio, td->td_ucred, 0, td))) {
  428                 if (auio->uio_resid != cnt && (error == ERESTART ||
  429                     error == EINTR || error == EWOULDBLOCK))
  430                         error = 0;
  431                 if (error == EPIPE) {
  432                         PROC_LOCK(td->td_proc);
  433                         psignal(td->td_proc, SIGPIPE);
  434                         PROC_UNLOCK(td->td_proc);
  435                 }
  436         }
  437         cnt -= auio->uio_resid;
  438 #ifdef KTRACE
  439         if (ktruio != NULL) {
  440                 ktruio->uio_resid = cnt;
  441                 ktrgenio(uap->fd, UIO_WRITE, ktruio, error);
  442         }
  443 #endif
  444         td->td_retval[0] = cnt;
  445         fdrop(fp, td);
  446         free(auio, M_IOV);
  447         return (error);
  448 }
  449 
  450 /*
  451  * Ioctl system call
  452  */
  453 #ifndef _SYS_SYSPROTO_H_
  454 struct ioctl_args {
  455         int     fd;
  456         u_long  com;
  457         caddr_t data;
  458 };
  459 #endif
  460 /*
  461  * MPSAFE
  462  */
  463 /* ARGSUSED */
  464 int
  465 ioctl(td, uap)
  466         struct thread *td;
  467         register struct ioctl_args *uap;
  468 {
  469         struct file *fp;
  470         register struct filedesc *fdp;
  471         register u_long com;
  472         int error = 0;
  473         register u_int size;
  474         caddr_t data, memp;
  475         int tmp;
  476 #define STK_PARAMS      128
  477         union {
  478             char stkbuf[STK_PARAMS];
  479             long align;
  480         } ubuf;
  481 
  482         if ((error = fget(td, uap->fd, &fp)) != 0)
  483                 return (error);
  484         mtx_lock(&Giant);
  485         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  486                 fdrop(fp, td);
  487                 mtx_unlock(&Giant);
  488                 return (EBADF);
  489         }
  490         fdp = td->td_proc->p_fd;
  491         switch (com = uap->com) {
  492         case FIONCLEX:
  493                 FILEDESC_LOCK(fdp);
  494                 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
  495                 FILEDESC_UNLOCK(fdp);
  496                 fdrop(fp, td);
  497                 mtx_unlock(&Giant);
  498                 return (0);
  499         case FIOCLEX:
  500                 FILEDESC_LOCK(fdp);
  501                 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
  502                 FILEDESC_UNLOCK(fdp);
  503                 fdrop(fp, td);
  504                 mtx_unlock(&Giant);
  505                 return (0);
  506         }
  507 
  508         /*
  509          * Interpret high order word to find amount of data to be
  510          * copied to/from the user's address space.
  511          */
  512         size = IOCPARM_LEN(com);
  513         if (size > IOCPARM_MAX) {
  514                 fdrop(fp, td);
  515                 mtx_unlock(&Giant);
  516                 return (ENOTTY);
  517         }
  518 
  519         memp = NULL;
  520         if (size > sizeof (ubuf.stkbuf)) {
  521                 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  522                 data = memp;
  523         } else {
  524                 data = ubuf.stkbuf;
  525         }
  526         if (com&IOC_IN) {
  527                 if (size) {
  528                         error = copyin(uap->data, data, (u_int)size);
  529                         if (error) {
  530                                 if (memp)
  531                                         free(memp, M_IOCTLOPS);
  532                                 fdrop(fp, td);
  533                                 goto done;
  534                         }
  535                 } else {
  536                         *(caddr_t *)data = uap->data;
  537                 }
  538         } else if ((com&IOC_OUT) && size) {
  539                 /*
  540                  * Zero the buffer so the user always
  541                  * gets back something deterministic.
  542                  */
  543                 bzero(data, size);
  544         } else if (com&IOC_VOID) {
  545                 *(caddr_t *)data = uap->data;
  546         }
  547 
  548         switch (com) {
  549 
  550         case FIONBIO:
  551                 FILE_LOCK(fp);
  552                 if ((tmp = *(int *)data))
  553                         fp->f_flag |= FNONBLOCK;
  554                 else
  555                         fp->f_flag &= ~FNONBLOCK;
  556                 FILE_UNLOCK(fp);
  557                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  558                 break;
  559 
  560         case FIOASYNC:
  561                 FILE_LOCK(fp);
  562                 if ((tmp = *(int *)data))
  563                         fp->f_flag |= FASYNC;
  564                 else
  565                         fp->f_flag &= ~FASYNC;
  566                 FILE_UNLOCK(fp);
  567                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  568                 break;
  569 
  570         default:
  571                 error = fo_ioctl(fp, com, data, td->td_ucred, td);
  572                 /*
  573                  * Copy any data to user, size was
  574                  * already set and checked above.
  575                  */
  576                 if (error == 0 && (com&IOC_OUT) && size)
  577                         error = copyout(data, uap->data, (u_int)size);
  578                 break;
  579         }
  580         if (memp)
  581                 free(memp, M_IOCTLOPS);
  582         fdrop(fp, td);
  583 done:
  584         mtx_unlock(&Giant);
  585         return (error);
  586 }
  587 
  588 /*
  589  * sellock and selwait are initialized in selectinit() via SYSINIT.
  590  */
  591 struct mtx      sellock;
  592 struct cv       selwait;
  593 u_int           nselcoll;       /* Select collisions since boot */
  594 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  595 
  596 /*
  597  * Select system call.
  598  */
  599 #ifndef _SYS_SYSPROTO_H_
  600 struct select_args {
  601         int     nd;
  602         fd_set  *in, *ou, *ex;
  603         struct  timeval *tv;
  604 };
  605 #endif
  606 /*
  607  * MPSAFE
  608  */
  609 int
  610 select(td, uap)
  611         register struct thread *td;
  612         register struct select_args *uap;
  613 {
  614         struct timeval tv, *tvp;
  615         int error;
  616 
  617         if (uap->tv != NULL) {
  618                 error = copyin(uap->tv, &tv, sizeof(tv));
  619                 if (error)
  620                         return (error);
  621                 tvp = &tv;
  622         } else
  623                 tvp = NULL;
  624 
  625         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  626 }
  627 
  628 int
  629 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  630     fd_set *fd_ex, struct timeval *tvp)
  631 {
  632         struct filedesc *fdp;
  633         /*
  634          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  635          * infds with the new FD_SETSIZE of 1024, and more than enough for
  636          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  637          * of 256.
  638          */
  639         fd_mask s_selbits[howmany(2048, NFDBITS)];
  640         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  641         struct timeval atv, rtv, ttv;
  642         int error, timo;
  643         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  644 
  645         if (nd < 0)
  646                 return (EINVAL);
  647         fdp = td->td_proc->p_fd;
  648         /*
  649          * XXX: kern_select() currently requires that we acquire Giant
  650          * even if none of the file descriptors we poll requires Giant.
  651          */
  652         mtx_lock(&Giant);
  653         FILEDESC_LOCK(fdp);
  654 
  655         if (nd > td->td_proc->p_fd->fd_nfiles)
  656                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  657         FILEDESC_UNLOCK(fdp);
  658 
  659         /*
  660          * Allocate just enough bits for the non-null fd_sets.  Use the
  661          * preallocated auto buffer if possible.
  662          */
  663         nfdbits = roundup(nd, NFDBITS);
  664         ncpbytes = nfdbits / NBBY;
  665         nbufbytes = 0;
  666         if (fd_in != NULL)
  667                 nbufbytes += 2 * ncpbytes;
  668         if (fd_ou != NULL)
  669                 nbufbytes += 2 * ncpbytes;
  670         if (fd_ex != NULL)
  671                 nbufbytes += 2 * ncpbytes;
  672         if (nbufbytes <= sizeof s_selbits)
  673                 selbits = &s_selbits[0];
  674         else
  675                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  676 
  677         /*
  678          * Assign pointers into the bit buffers and fetch the input bits.
  679          * Put the output buffers together so that they can be bzeroed
  680          * together.
  681          */
  682         sbp = selbits;
  683 #define getbits(name, x) \
  684         do {                                                            \
  685                 if (name == NULL)                                       \
  686                         ibits[x] = NULL;                                \
  687                 else {                                                  \
  688                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  689                         obits[x] = sbp;                                 \
  690                         sbp += ncpbytes / sizeof *sbp;                  \
  691                         error = copyin(name, ibits[x], ncpbytes);       \
  692                         if (error != 0)                                 \
  693                                 goto done_nosellock;                    \
  694                 }                                                       \
  695         } while (0)
  696         getbits(fd_in, 0);
  697         getbits(fd_ou, 1);
  698         getbits(fd_ex, 2);
  699 #undef  getbits
  700         if (nbufbytes != 0)
  701                 bzero(selbits, nbufbytes / 2);
  702 
  703         if (tvp != NULL) {
  704                 atv = *tvp;
  705                 if (itimerfix(&atv)) {
  706                         error = EINVAL;
  707                         goto done_nosellock;
  708                 }
  709                 getmicrouptime(&rtv);
  710                 timevaladd(&atv, &rtv);
  711         } else {
  712                 atv.tv_sec = 0;
  713                 atv.tv_usec = 0;
  714         }
  715         timo = 0;
  716         TAILQ_INIT(&td->td_selq);
  717         mtx_lock(&sellock);
  718 retry:
  719         ncoll = nselcoll;
  720         mtx_lock_spin(&sched_lock);
  721         td->td_flags |= TDF_SELECT;
  722         mtx_unlock_spin(&sched_lock);
  723         mtx_unlock(&sellock);
  724 
  725         error = selscan(td, ibits, obits, nd);
  726         mtx_lock(&sellock);
  727         if (error || td->td_retval[0])
  728                 goto done;
  729         if (atv.tv_sec || atv.tv_usec) {
  730                 getmicrouptime(&rtv);
  731                 if (timevalcmp(&rtv, &atv, >=))
  732                         goto done;
  733                 ttv = atv;
  734                 timevalsub(&ttv, &rtv);
  735                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  736                     24 * 60 * 60 * hz : tvtohz(&ttv);
  737         }
  738 
  739         /*
  740          * An event of interest may occur while we do not hold
  741          * sellock, so check TDF_SELECT and the number of
  742          * collisions and rescan the file descriptors if
  743          * necessary.
  744          */
  745         mtx_lock_spin(&sched_lock);
  746         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  747                 mtx_unlock_spin(&sched_lock);
  748                 goto retry;
  749         }
  750         mtx_unlock_spin(&sched_lock);
  751 
  752         if (timo > 0)
  753                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  754         else
  755                 error = cv_wait_sig(&selwait, &sellock);
  756         
  757         if (error == 0)
  758                 goto retry;
  759 
  760 done:
  761         clear_selinfo_list(td);
  762         mtx_lock_spin(&sched_lock);
  763         td->td_flags &= ~TDF_SELECT;
  764         mtx_unlock_spin(&sched_lock);
  765         mtx_unlock(&sellock);
  766 
  767 done_nosellock:
  768         /* select is not restarted after signals... */
  769         if (error == ERESTART)
  770                 error = EINTR;
  771         if (error == EWOULDBLOCK)
  772                 error = 0;
  773 #define putbits(name, x) \
  774         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  775                 error = error2;
  776         if (error == 0) {
  777                 int error2;
  778 
  779                 putbits(fd_in, 0);
  780                 putbits(fd_ou, 1);
  781                 putbits(fd_ex, 2);
  782 #undef putbits
  783         }
  784         if (selbits != &s_selbits[0])
  785                 free(selbits, M_SELECT);
  786 
  787         mtx_unlock(&Giant);
  788         return (error);
  789 }
  790 
  791 static int
  792 selscan(td, ibits, obits, nfd)
  793         struct thread *td;
  794         fd_mask **ibits, **obits;
  795         int nfd;
  796 {
  797         int msk, i, fd;
  798         fd_mask bits;
  799         struct file *fp;
  800         int n = 0;
  801         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  802         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  803         struct filedesc *fdp = td->td_proc->p_fd;
  804 
  805         FILEDESC_LOCK(fdp);
  806         for (msk = 0; msk < 3; msk++) {
  807                 if (ibits[msk] == NULL)
  808                         continue;
  809                 for (i = 0; i < nfd; i += NFDBITS) {
  810                         bits = ibits[msk][i/NFDBITS];
  811                         /* ffs(int mask) not portable, fd_mask is long */
  812                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  813                                 if (!(bits & 1))
  814                                         continue;
  815                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  816                                         FILEDESC_UNLOCK(fdp);
  817                                         return (EBADF);
  818                                 }
  819                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  820                                     td)) {
  821                                         obits[msk][(fd)/NFDBITS] |=
  822                                             ((fd_mask)1 << ((fd) % NFDBITS));
  823                                         n++;
  824                                 }
  825                         }
  826                 }
  827         }
  828         FILEDESC_UNLOCK(fdp);
  829         td->td_retval[0] = n;
  830         return (0);
  831 }
  832 
  833 /*
  834  * Poll system call.
  835  */
  836 #ifndef _SYS_SYSPROTO_H_
  837 struct poll_args {
  838         struct pollfd *fds;
  839         u_int   nfds;
  840         int     timeout;
  841 };
  842 #endif
  843 /*
  844  * MPSAFE
  845  */
  846 int
  847 poll(td, uap)
  848         struct thread *td;
  849         struct poll_args *uap;
  850 {
  851         struct pollfd *bits;
  852         struct pollfd smallbits[32];
  853         struct timeval atv, rtv, ttv;
  854         int error = 0, timo;
  855         u_int ncoll, nfds;
  856         size_t ni;
  857 
  858         nfds = uap->nfds;
  859 
  860         /*
  861          * XXX: poll() currently requires that we acquire Giant even if
  862          * none of the file descriptors we poll requires Giant.
  863          */
  864         mtx_lock(&Giant);
  865         /*
  866          * This is kinda bogus.  We have fd limits, but that is not
  867          * really related to the size of the pollfd array.  Make sure
  868          * we let the process use at least FD_SETSIZE entries and at
  869          * least enough for the current limits.  We want to be reasonably
  870          * safe, but not overly restrictive.
  871          */
  872         PROC_LOCK(td->td_proc);
  873         if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
  874             (nfds > FD_SETSIZE)) {
  875                 PROC_UNLOCK(td->td_proc);
  876                 error = EINVAL;
  877                 goto done2;
  878         }
  879         PROC_UNLOCK(td->td_proc);
  880         ni = nfds * sizeof(struct pollfd);
  881         if (ni > sizeof(smallbits))
  882                 bits = malloc(ni, M_TEMP, M_WAITOK);
  883         else
  884                 bits = smallbits;
  885         error = copyin(uap->fds, bits, ni);
  886         if (error)
  887                 goto done_nosellock;
  888         if (uap->timeout != INFTIM) {
  889                 atv.tv_sec = uap->timeout / 1000;
  890                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  891                 if (itimerfix(&atv)) {
  892                         error = EINVAL;
  893                         goto done_nosellock;
  894                 }
  895                 getmicrouptime(&rtv);
  896                 timevaladd(&atv, &rtv);
  897         } else {
  898                 atv.tv_sec = 0;
  899                 atv.tv_usec = 0;
  900         }
  901         timo = 0;
  902         TAILQ_INIT(&td->td_selq);
  903         mtx_lock(&sellock);
  904 retry:
  905         ncoll = nselcoll;
  906         mtx_lock_spin(&sched_lock);
  907         td->td_flags |= TDF_SELECT;
  908         mtx_unlock_spin(&sched_lock);
  909         mtx_unlock(&sellock);
  910 
  911         error = pollscan(td, bits, nfds);
  912         mtx_lock(&sellock);
  913         if (error || td->td_retval[0])
  914                 goto done;
  915         if (atv.tv_sec || atv.tv_usec) {
  916                 getmicrouptime(&rtv);
  917                 if (timevalcmp(&rtv, &atv, >=))
  918                         goto done;
  919                 ttv = atv;
  920                 timevalsub(&ttv, &rtv);
  921                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  922                     24 * 60 * 60 * hz : tvtohz(&ttv);
  923         }
  924         /*
  925          * An event of interest may occur while we do not hold
  926          * sellock, so check TDF_SELECT and the number of collisions
  927          * and rescan the file descriptors if necessary.
  928          */
  929         mtx_lock_spin(&sched_lock);
  930         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  931                 mtx_unlock_spin(&sched_lock);
  932                 goto retry;
  933         }
  934         mtx_unlock_spin(&sched_lock);
  935 
  936         if (timo > 0)
  937                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  938         else
  939                 error = cv_wait_sig(&selwait, &sellock);
  940 
  941         if (error == 0)
  942                 goto retry;
  943 
  944 done:
  945         clear_selinfo_list(td);
  946         mtx_lock_spin(&sched_lock);
  947         td->td_flags &= ~TDF_SELECT;
  948         mtx_unlock_spin(&sched_lock);
  949         mtx_unlock(&sellock);
  950 
  951 done_nosellock:
  952         /* poll is not restarted after signals... */
  953         if (error == ERESTART)
  954                 error = EINTR;
  955         if (error == EWOULDBLOCK)
  956                 error = 0;
  957         if (error == 0) {
  958                 error = copyout(bits, uap->fds, ni);
  959                 if (error)
  960                         goto out;
  961         }
  962 out:
  963         if (ni > sizeof(smallbits))
  964                 free(bits, M_TEMP);
  965 done2:
  966         mtx_unlock(&Giant);
  967         return (error);
  968 }
  969 
  970 static int
  971 pollscan(td, fds, nfd)
  972         struct thread *td;
  973         struct pollfd *fds;
  974         u_int nfd;
  975 {
  976         register struct filedesc *fdp = td->td_proc->p_fd;
  977         int i;
  978         struct file *fp;
  979         int n = 0;
  980 
  981         FILEDESC_LOCK(fdp);
  982         for (i = 0; i < nfd; i++, fds++) {
  983                 if (fds->fd >= fdp->fd_nfiles) {
  984                         fds->revents = POLLNVAL;
  985                         n++;
  986                 } else if (fds->fd < 0) {
  987                         fds->revents = 0;
  988                 } else {
  989                         fp = fdp->fd_ofiles[fds->fd];
  990                         if (fp == NULL) {
  991                                 fds->revents = POLLNVAL;
  992                                 n++;
  993                         } else {
  994                                 /*
  995                                  * Note: backend also returns POLLHUP and
  996                                  * POLLERR if appropriate.
  997                                  */
  998                                 fds->revents = fo_poll(fp, fds->events,
  999                                     td->td_ucred, td);
 1000                                 if (fds->revents != 0)
 1001                                         n++;
 1002                         }
 1003                 }
 1004         }
 1005         FILEDESC_UNLOCK(fdp);
 1006         td->td_retval[0] = n;
 1007         return (0);
 1008 }
 1009 
 1010 /*
 1011  * OpenBSD poll system call.
 1012  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1013  */
 1014 #ifndef _SYS_SYSPROTO_H_
 1015 struct openbsd_poll_args {
 1016         struct pollfd *fds;
 1017         u_int   nfds;
 1018         int     timeout;
 1019 };
 1020 #endif
 1021 /*
 1022  * MPSAFE
 1023  */
 1024 int
 1025 openbsd_poll(td, uap)
 1026         register struct thread *td;
 1027         register struct openbsd_poll_args *uap;
 1028 {
 1029         return (poll(td, (struct poll_args *)uap));
 1030 }
 1031 
 1032 /*
 1033  * Remove the references to the thread from all of the objects
 1034  * we were polling.
 1035  *
 1036  * This code assumes that the underlying owner of the selinfo
 1037  * structure will hold sellock before it changes it, and that
 1038  * it will unlink itself from our list if it goes away.
 1039  */
 1040 void
 1041 clear_selinfo_list(td)
 1042         struct thread *td;
 1043 {
 1044         struct selinfo *si;
 1045 
 1046         mtx_assert(&sellock, MA_OWNED);
 1047         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1048                 si->si_thread = NULL;
 1049         TAILQ_INIT(&td->td_selq);
 1050 }
 1051 
 1052 /*
 1053  * Record a select request.
 1054  */
 1055 void
 1056 selrecord(selector, sip)
 1057         struct thread *selector;
 1058         struct selinfo *sip;
 1059 {
 1060 
 1061         mtx_lock(&sellock);
 1062         /*
 1063          * If the selinfo's thread pointer is NULL then take ownership of it.
 1064          *
 1065          * If the thread pointer is not NULL and it points to another
 1066          * thread, then we have a collision.
 1067          *
 1068          * If the thread pointer is not NULL and points back to us then leave
 1069          * it alone as we've already added pointed it at us and added it to
 1070          * our list.
 1071          */
 1072         if (sip->si_thread == NULL) {
 1073                 sip->si_thread = selector;
 1074                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1075         } else if (sip->si_thread != selector) {
 1076                 sip->si_flags |= SI_COLL;
 1077         }
 1078 
 1079         mtx_unlock(&sellock);
 1080 }
 1081 
 1082 /* Wake up a selecting thread. */
 1083 void
 1084 selwakeup(sip)
 1085         struct selinfo *sip;
 1086 {
 1087         doselwakeup(sip, -1);
 1088 }
 1089 
 1090 /* Wake up a selecting thread, and set its priority. */
 1091 void
 1092 selwakeuppri(sip, pri)
 1093         struct selinfo *sip;
 1094         int pri;
 1095 {
 1096         doselwakeup(sip, pri);
 1097 }
 1098 
 1099 /*
 1100  * Do a wakeup when a selectable event occurs.
 1101  */
 1102 static void
 1103 doselwakeup(sip, pri)
 1104         struct selinfo *sip;
 1105         int pri;
 1106 {
 1107         struct thread *td;
 1108 
 1109         mtx_lock(&sellock);
 1110         td = sip->si_thread;
 1111         if ((sip->si_flags & SI_COLL) != 0) {
 1112                 nselcoll++;
 1113                 sip->si_flags &= ~SI_COLL;
 1114                 cv_broadcastpri(&selwait, pri);
 1115         }
 1116         if (td == NULL) {
 1117                 mtx_unlock(&sellock);
 1118                 return;
 1119         }
 1120         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1121         sip->si_thread = NULL;
 1122         mtx_lock_spin(&sched_lock);
 1123         td->td_flags &= ~TDF_SELECT;
 1124         mtx_unlock_spin(&sched_lock);
 1125         sleepq_remove(td, &selwait);
 1126         mtx_unlock(&sellock);
 1127 }
 1128 
 1129 static void selectinit(void *);
 1130 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1131 
 1132 /* ARGSUSED*/
 1133 static void
 1134 selectinit(dummy)
 1135         void *dummy;
 1136 {
 1137         cv_init(&selwait, "select");
 1138         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1139 }

Cache object: 1157470bfb0d7d2f2cabdc89f628f59c


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.