The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   39  */
   40 
   41 #include <sys/cdefs.h>
   42 __FBSDID("$FreeBSD: releng/5.2/sys/kern/sys_generic.c 122352 2003-11-09 09:17:26Z tanimura $");
   43 
   44 #include "opt_ktrace.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/systm.h>
   48 #include <sys/sysproto.h>
   49 #include <sys/filedesc.h>
   50 #include <sys/filio.h>
   51 #include <sys/fcntl.h>
   52 #include <sys/file.h>
   53 #include <sys/proc.h>
   54 #include <sys/signalvar.h>
   55 #include <sys/socketvar.h>
   56 #include <sys/uio.h>
   57 #include <sys/kernel.h>
   58 #include <sys/limits.h>
   59 #include <sys/malloc.h>
   60 #include <sys/poll.h>
   61 #include <sys/resourcevar.h>
   62 #include <sys/selinfo.h>
   63 #include <sys/syscallsubr.h>
   64 #include <sys/sysctl.h>
   65 #include <sys/sysent.h>
   66 #include <sys/bio.h>
   67 #include <sys/buf.h>
   68 #include <sys/condvar.h>
   69 #ifdef KTRACE
   70 #include <sys/ktrace.h>
   71 #endif
   72 #include <vm/vm.h>
   73 #include <vm/vm_page.h>
   74 
   75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   76 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   77 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   78 
   79 static int      pollscan(struct thread *, struct pollfd *, u_int);
   80 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   81 static int      dofileread(struct thread *, struct file *, int, void *,
   82                     size_t, off_t, int);
   83 static int      dofilewrite(struct thread *, struct file *, int,
   84                     const void *, size_t, off_t, int);
   85 static void     doselwakeup(struct selinfo *, int);
   86 
   87 /*
   88  * Read system call.
   89  */
   90 #ifndef _SYS_SYSPROTO_H_
   91 struct read_args {
   92         int     fd;
   93         void    *buf;
   94         size_t  nbyte;
   95 };
   96 #endif
   97 /*
   98  * MPSAFE
   99  */
  100 int
  101 read(td, uap)
  102         struct thread *td;
  103         struct read_args *uap;
  104 {
  105         struct file *fp;
  106         int error;
  107 
  108         if ((error = fget_read(td, uap->fd, &fp)) == 0) {
  109                 error = dofileread(td, fp, uap->fd, uap->buf,
  110                             uap->nbyte, (off_t)-1, 0);
  111                 fdrop(fp, td);
  112         }
  113         return(error);
  114 }
  115 
  116 /*
  117  * Pread system call
  118  */
  119 #ifndef _SYS_SYSPROTO_H_
  120 struct pread_args {
  121         int     fd;
  122         void    *buf;
  123         size_t  nbyte;
  124         int     pad;
  125         off_t   offset;
  126 };
  127 #endif
  128 /*
  129  * MPSAFE
  130  */
  131 int
  132 pread(td, uap)
  133         struct thread *td;
  134         struct pread_args *uap;
  135 {
  136         struct file *fp;
  137         int error;
  138 
  139         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  140                 return (error);
  141         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
  142                 error = ESPIPE;
  143         } else {
  144                 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 
  145                             uap->offset, FOF_OFFSET);
  146         }
  147         fdrop(fp, td);
  148         return(error);
  149 }
  150 
  151 /*
  152  * Code common for read and pread
  153  */
  154 static int
  155 dofileread(td, fp, fd, buf, nbyte, offset, flags)
  156         struct thread *td;
  157         struct file *fp;
  158         int fd, flags;
  159         void *buf;
  160         size_t nbyte;
  161         off_t offset;
  162 {
  163         struct uio auio;
  164         struct iovec aiov;
  165         long cnt, error = 0;
  166 #ifdef KTRACE
  167         struct iovec ktriov;
  168         struct uio ktruio;
  169         int didktr = 0;
  170 #endif
  171 
  172         aiov.iov_base = buf;
  173         aiov.iov_len = nbyte;
  174         auio.uio_iov = &aiov;
  175         auio.uio_iovcnt = 1;
  176         auio.uio_offset = offset;
  177         if (nbyte > INT_MAX)
  178                 return (EINVAL);
  179         auio.uio_resid = nbyte;
  180         auio.uio_rw = UIO_READ;
  181         auio.uio_segflg = UIO_USERSPACE;
  182         auio.uio_td = td;
  183 #ifdef KTRACE
  184         /*
  185          * if tracing, save a copy of iovec
  186          */
  187         if (KTRPOINT(td, KTR_GENIO)) {
  188                 ktriov = aiov;
  189                 ktruio = auio;
  190                 didktr = 1;
  191         }
  192 #endif
  193         cnt = nbyte;
  194 
  195         if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
  196                 if (auio.uio_resid != cnt && (error == ERESTART ||
  197                     error == EINTR || error == EWOULDBLOCK))
  198                         error = 0;
  199         }
  200         cnt -= auio.uio_resid;
  201 #ifdef KTRACE
  202         if (didktr && error == 0) {
  203                 ktruio.uio_iov = &ktriov;
  204                 ktruio.uio_resid = cnt;
  205                 ktrgenio(fd, UIO_READ, &ktruio, error);
  206         }
  207 #endif
  208         td->td_retval[0] = cnt;
  209         return (error);
  210 }
  211 
  212 /*
  213  * Scatter read system call.
  214  */
  215 #ifndef _SYS_SYSPROTO_H_
  216 struct readv_args {
  217         int     fd;
  218         struct  iovec *iovp;
  219         u_int   iovcnt;
  220 };
  221 #endif
  222 /*
  223  * MPSAFE
  224  */
  225 int
  226 readv(td, uap)
  227         struct thread *td;
  228         struct readv_args *uap;
  229 {
  230         struct file *fp;
  231         struct uio auio;
  232         struct iovec *iov;
  233         struct iovec *needfree;
  234         struct iovec aiov[UIO_SMALLIOV];
  235         long i, cnt;
  236         int error;
  237         u_int iovlen;
  238 #ifdef KTRACE
  239         struct iovec *ktriov = NULL;
  240         struct uio ktruio;
  241 #endif
  242 
  243         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  244                 return (error);
  245         needfree = NULL;
  246         /* note: can't use iovlen until iovcnt is validated */
  247         iovlen = uap->iovcnt * sizeof (struct iovec);
  248         if (uap->iovcnt > UIO_SMALLIOV) {
  249                 if (uap->iovcnt > UIO_MAXIOV) {
  250                         error = EINVAL;
  251                         goto done;
  252                 }
  253                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  254                 needfree = iov;
  255         } else
  256                 iov = aiov;
  257         auio.uio_iov = iov;
  258         auio.uio_iovcnt = uap->iovcnt;
  259         auio.uio_rw = UIO_READ;
  260         auio.uio_segflg = UIO_USERSPACE;
  261         auio.uio_td = td;
  262         auio.uio_offset = -1;
  263         if ((error = copyin(uap->iovp, iov, iovlen)))
  264                 goto done;
  265         auio.uio_resid = 0;
  266         for (i = 0; i < uap->iovcnt; i++) {
  267                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  268                         error = EINVAL;
  269                         goto done;
  270                 }
  271                 auio.uio_resid += iov->iov_len;
  272                 iov++;
  273         }
  274 #ifdef KTRACE
  275         /*
  276          * if tracing, save a copy of iovec
  277          */
  278         if (KTRPOINT(td, KTR_GENIO))  {
  279                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  280                 bcopy(auio.uio_iov, ktriov, iovlen);
  281                 ktruio = auio;
  282         }
  283 #endif
  284         cnt = auio.uio_resid;
  285         if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
  286                 if (auio.uio_resid != cnt && (error == ERESTART ||
  287                     error == EINTR || error == EWOULDBLOCK))
  288                         error = 0;
  289         }
  290         cnt -= auio.uio_resid;
  291 #ifdef KTRACE
  292         if (ktriov != NULL) {
  293                 if (error == 0) {
  294                         ktruio.uio_iov = ktriov;
  295                         ktruio.uio_resid = cnt;
  296                         ktrgenio(uap->fd, UIO_READ, &ktruio, error);
  297                 }
  298                 FREE(ktriov, M_TEMP);
  299         }
  300 #endif
  301         td->td_retval[0] = cnt;
  302 done:
  303         fdrop(fp, td);
  304         if (needfree)
  305                 FREE(needfree, M_IOV);
  306         return (error);
  307 }
  308 
  309 /*
  310  * Write system call
  311  */
  312 #ifndef _SYS_SYSPROTO_H_
  313 struct write_args {
  314         int     fd;
  315         const void *buf;
  316         size_t  nbyte;
  317 };
  318 #endif
  319 /*
  320  * MPSAFE
  321  */
  322 int
  323 write(td, uap)
  324         struct thread *td;
  325         struct write_args *uap;
  326 {
  327         struct file *fp;
  328         int error;
  329 
  330         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  331                 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
  332                             (off_t)-1, 0);
  333                 fdrop(fp, td);
  334         } else {
  335                 error = EBADF;  /* XXX this can't be right */
  336         }
  337         return(error);
  338 }
  339 
  340 /*
  341  * Pwrite system call
  342  */
  343 #ifndef _SYS_SYSPROTO_H_
  344 struct pwrite_args {
  345         int     fd;
  346         const void *buf;
  347         size_t  nbyte;
  348         int     pad;
  349         off_t   offset;
  350 };
  351 #endif
  352 /*
  353  * MPSAFE
  354  */
  355 int
  356 pwrite(td, uap)
  357         struct thread *td;
  358         struct pwrite_args *uap;
  359 {
  360         struct file *fp;
  361         int error;
  362 
  363         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  364                 if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
  365                         error = ESPIPE;
  366                 } else {
  367                         error = dofilewrite(td, fp, uap->fd, uap->buf,
  368                                     uap->nbyte, uap->offset, FOF_OFFSET);
  369                 }
  370                 fdrop(fp, td);
  371         } else {
  372                 error = EBADF;  /* this can't be right */
  373         }
  374         return(error);
  375 }
  376 
  377 static int
  378 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
  379         struct thread *td;
  380         struct file *fp;
  381         int fd, flags;
  382         const void *buf;
  383         size_t nbyte;
  384         off_t offset;
  385 {
  386         struct uio auio;
  387         struct iovec aiov;
  388         long cnt, error = 0;
  389 #ifdef KTRACE
  390         struct iovec ktriov;
  391         struct uio ktruio;
  392         int didktr = 0;
  393 #endif
  394 
  395         aiov.iov_base = (void *)(uintptr_t)buf;
  396         aiov.iov_len = nbyte;
  397         auio.uio_iov = &aiov;
  398         auio.uio_iovcnt = 1;
  399         auio.uio_offset = offset;
  400         if (nbyte > INT_MAX)
  401                 return (EINVAL);
  402         auio.uio_resid = nbyte;
  403         auio.uio_rw = UIO_WRITE;
  404         auio.uio_segflg = UIO_USERSPACE;
  405         auio.uio_td = td;
  406 #ifdef KTRACE
  407         /*
  408          * if tracing, save a copy of iovec and uio
  409          */
  410         if (KTRPOINT(td, KTR_GENIO)) {
  411                 ktriov = aiov;
  412                 ktruio = auio;
  413                 didktr = 1;
  414         }
  415 #endif
  416         cnt = nbyte;
  417         if (fp->f_type == DTYPE_VNODE)
  418                 bwillwrite();
  419         if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
  420                 if (auio.uio_resid != cnt && (error == ERESTART ||
  421                     error == EINTR || error == EWOULDBLOCK))
  422                         error = 0;
  423                 /* Socket layer is responsible for issuing SIGPIPE. */
  424                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
  425                         PROC_LOCK(td->td_proc);
  426                         psignal(td->td_proc, SIGPIPE);
  427                         PROC_UNLOCK(td->td_proc);
  428                 }
  429         }
  430         cnt -= auio.uio_resid;
  431 #ifdef KTRACE
  432         if (didktr && error == 0) {
  433                 ktruio.uio_iov = &ktriov;
  434                 ktruio.uio_resid = cnt;
  435                 ktrgenio(fd, UIO_WRITE, &ktruio, error);
  436         }
  437 #endif
  438         td->td_retval[0] = cnt;
  439         return (error);
  440 }
  441 
  442 /*
  443  * Gather write system call
  444  */
  445 #ifndef _SYS_SYSPROTO_H_
  446 struct writev_args {
  447         int     fd;
  448         struct  iovec *iovp;
  449         u_int   iovcnt;
  450 };
  451 #endif
  452 /*
  453  * MPSAFE
  454  */
  455 int
  456 writev(td, uap)
  457         struct thread *td;
  458         register struct writev_args *uap;
  459 {
  460         struct file *fp;
  461         struct uio auio;
  462         register struct iovec *iov;
  463         struct iovec *needfree;
  464         struct iovec aiov[UIO_SMALLIOV];
  465         long i, cnt, error = 0;
  466         u_int iovlen;
  467 #ifdef KTRACE
  468         struct iovec *ktriov = NULL;
  469         struct uio ktruio;
  470 #endif
  471 
  472         if ((error = fget_write(td, uap->fd, &fp)) != 0)
  473                 return (EBADF);
  474         needfree = NULL;
  475         /* note: can't use iovlen until iovcnt is validated */
  476         iovlen = uap->iovcnt * sizeof (struct iovec);
  477         if (uap->iovcnt > UIO_SMALLIOV) {
  478                 if (uap->iovcnt > UIO_MAXIOV) {
  479                         error = EINVAL;
  480                         goto done;
  481                 }
  482                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  483                 needfree = iov;
  484         } else
  485                 iov = aiov;
  486         auio.uio_iov = iov;
  487         auio.uio_iovcnt = uap->iovcnt;
  488         auio.uio_rw = UIO_WRITE;
  489         auio.uio_segflg = UIO_USERSPACE;
  490         auio.uio_td = td;
  491         auio.uio_offset = -1;
  492         if ((error = copyin(uap->iovp, iov, iovlen)))
  493                 goto done;
  494         auio.uio_resid = 0;
  495         for (i = 0; i < uap->iovcnt; i++) {
  496                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  497                         error = EINVAL;
  498                         goto done;
  499                 }
  500                 auio.uio_resid += iov->iov_len;
  501                 iov++;
  502         }
  503 #ifdef KTRACE
  504         /*
  505          * if tracing, save a copy of iovec and uio
  506          */
  507         if (KTRPOINT(td, KTR_GENIO))  {
  508                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  509                 bcopy(auio.uio_iov, ktriov, iovlen);
  510                 ktruio = auio;
  511         }
  512 #endif
  513         cnt = auio.uio_resid;
  514         if (fp->f_type == DTYPE_VNODE)
  515                 bwillwrite();
  516         if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
  517                 if (auio.uio_resid != cnt && (error == ERESTART ||
  518                     error == EINTR || error == EWOULDBLOCK))
  519                         error = 0;
  520                 if (error == EPIPE) {
  521                         PROC_LOCK(td->td_proc);
  522                         psignal(td->td_proc, SIGPIPE);
  523                         PROC_UNLOCK(td->td_proc);
  524                 }
  525         }
  526         cnt -= auio.uio_resid;
  527 #ifdef KTRACE
  528         if (ktriov != NULL) {
  529                 if (error == 0) {
  530                         ktruio.uio_iov = ktriov;
  531                         ktruio.uio_resid = cnt;
  532                         ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
  533                 }
  534                 FREE(ktriov, M_TEMP);
  535         }
  536 #endif
  537         td->td_retval[0] = cnt;
  538 done:
  539         fdrop(fp, td);
  540         if (needfree)
  541                 FREE(needfree, M_IOV);
  542         return (error);
  543 }
  544 
  545 /*
  546  * Ioctl system call
  547  */
  548 #ifndef _SYS_SYSPROTO_H_
  549 struct ioctl_args {
  550         int     fd;
  551         u_long  com;
  552         caddr_t data;
  553 };
  554 #endif
  555 /*
  556  * MPSAFE
  557  */
  558 /* ARGSUSED */
  559 int
  560 ioctl(td, uap)
  561         struct thread *td;
  562         register struct ioctl_args *uap;
  563 {
  564         struct file *fp;
  565         register struct filedesc *fdp;
  566         register u_long com;
  567         int error = 0;
  568         register u_int size;
  569         caddr_t data, memp;
  570         int tmp;
  571 #define STK_PARAMS      128
  572         union {
  573             char stkbuf[STK_PARAMS];
  574             long align;
  575         } ubuf;
  576 
  577         if ((error = fget(td, uap->fd, &fp)) != 0)
  578                 return (error);
  579         mtx_lock(&Giant);
  580         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  581                 fdrop(fp, td);
  582                 mtx_unlock(&Giant);
  583                 return (EBADF);
  584         }
  585         fdp = td->td_proc->p_fd;
  586         switch (com = uap->com) {
  587         case FIONCLEX:
  588                 FILEDESC_LOCK(fdp);
  589                 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
  590                 FILEDESC_UNLOCK(fdp);
  591                 fdrop(fp, td);
  592                 mtx_unlock(&Giant);
  593                 return (0);
  594         case FIOCLEX:
  595                 FILEDESC_LOCK(fdp);
  596                 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
  597                 FILEDESC_UNLOCK(fdp);
  598                 fdrop(fp, td);
  599                 mtx_unlock(&Giant);
  600                 return (0);
  601         }
  602 
  603         /*
  604          * Interpret high order word to find amount of data to be
  605          * copied to/from the user's address space.
  606          */
  607         size = IOCPARM_LEN(com);
  608         if (size > IOCPARM_MAX) {
  609                 fdrop(fp, td);
  610                 mtx_unlock(&Giant);
  611                 return (ENOTTY);
  612         }
  613 
  614         memp = NULL;
  615         if (size > sizeof (ubuf.stkbuf)) {
  616                 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  617                 data = memp;
  618         } else {
  619                 data = ubuf.stkbuf;
  620         }
  621         if (com&IOC_IN) {
  622                 if (size) {
  623                         error = copyin(uap->data, data, (u_int)size);
  624                         if (error) {
  625                                 if (memp)
  626                                         free(memp, M_IOCTLOPS);
  627                                 fdrop(fp, td);
  628                                 goto done;
  629                         }
  630                 } else {
  631                         *(caddr_t *)data = uap->data;
  632                 }
  633         } else if ((com&IOC_OUT) && size) {
  634                 /*
  635                  * Zero the buffer so the user always
  636                  * gets back something deterministic.
  637                  */
  638                 bzero(data, size);
  639         } else if (com&IOC_VOID) {
  640                 *(caddr_t *)data = uap->data;
  641         }
  642 
  643         switch (com) {
  644 
  645         case FIONBIO:
  646                 FILE_LOCK(fp);
  647                 if ((tmp = *(int *)data))
  648                         fp->f_flag |= FNONBLOCK;
  649                 else
  650                         fp->f_flag &= ~FNONBLOCK;
  651                 FILE_UNLOCK(fp);
  652                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  653                 break;
  654 
  655         case FIOASYNC:
  656                 FILE_LOCK(fp);
  657                 if ((tmp = *(int *)data))
  658                         fp->f_flag |= FASYNC;
  659                 else
  660                         fp->f_flag &= ~FASYNC;
  661                 FILE_UNLOCK(fp);
  662                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  663                 break;
  664 
  665         default:
  666                 error = fo_ioctl(fp, com, data, td->td_ucred, td);
  667                 /*
  668                  * Copy any data to user, size was
  669                  * already set and checked above.
  670                  */
  671                 if (error == 0 && (com&IOC_OUT) && size)
  672                         error = copyout(data, uap->data, (u_int)size);
  673                 break;
  674         }
  675         if (memp)
  676                 free(memp, M_IOCTLOPS);
  677         fdrop(fp, td);
  678 done:
  679         mtx_unlock(&Giant);
  680         return (error);
  681 }
  682 
  683 /*
  684  * sellock and selwait are initialized in selectinit() via SYSINIT.
  685  */
  686 struct mtx      sellock;
  687 struct cv       selwait;
  688 u_int           nselcoll;       /* Select collisions since boot */
  689 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  690 
  691 /*
  692  * Select system call.
  693  */
  694 #ifndef _SYS_SYSPROTO_H_
  695 struct select_args {
  696         int     nd;
  697         fd_set  *in, *ou, *ex;
  698         struct  timeval *tv;
  699 };
  700 #endif
  701 /*
  702  * MPSAFE
  703  */
  704 int
  705 select(td, uap)
  706         register struct thread *td;
  707         register struct select_args *uap;
  708 {
  709         struct timeval tv, *tvp;
  710         int error;
  711 
  712         if (uap->tv != NULL) {
  713                 error = copyin(uap->tv, &tv, sizeof(tv));
  714                 if (error)
  715                         return (error);
  716                 tvp = &tv;
  717         } else
  718                 tvp = NULL;
  719 
  720         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  721 }
  722 
  723 int
  724 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  725     fd_set *fd_ex, struct timeval *tvp)
  726 {
  727         struct filedesc *fdp;
  728         /*
  729          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  730          * infds with the new FD_SETSIZE of 1024, and more than enough for
  731          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  732          * of 256.
  733          */
  734         fd_mask s_selbits[howmany(2048, NFDBITS)];
  735         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  736         struct timeval atv, rtv, ttv;
  737         int error, timo;
  738         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  739 
  740         if (nd < 0)
  741                 return (EINVAL);
  742         fdp = td->td_proc->p_fd;
  743         mtx_lock(&Giant);
  744         FILEDESC_LOCK(fdp);
  745 
  746         if (nd > td->td_proc->p_fd->fd_nfiles)
  747                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  748         FILEDESC_UNLOCK(fdp);
  749 
  750         /*
  751          * Allocate just enough bits for the non-null fd_sets.  Use the
  752          * preallocated auto buffer if possible.
  753          */
  754         nfdbits = roundup(nd, NFDBITS);
  755         ncpbytes = nfdbits / NBBY;
  756         nbufbytes = 0;
  757         if (fd_in != NULL)
  758                 nbufbytes += 2 * ncpbytes;
  759         if (fd_ou != NULL)
  760                 nbufbytes += 2 * ncpbytes;
  761         if (fd_ex != NULL)
  762                 nbufbytes += 2 * ncpbytes;
  763         if (nbufbytes <= sizeof s_selbits)
  764                 selbits = &s_selbits[0];
  765         else
  766                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  767 
  768         /*
  769          * Assign pointers into the bit buffers and fetch the input bits.
  770          * Put the output buffers together so that they can be bzeroed
  771          * together.
  772          */
  773         sbp = selbits;
  774 #define getbits(name, x) \
  775         do {                                                            \
  776                 if (name == NULL)                                       \
  777                         ibits[x] = NULL;                                \
  778                 else {                                                  \
  779                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  780                         obits[x] = sbp;                                 \
  781                         sbp += ncpbytes / sizeof *sbp;                  \
  782                         error = copyin(name, ibits[x], ncpbytes);       \
  783                         if (error != 0)                                 \
  784                                 goto done_nosellock;                    \
  785                 }                                                       \
  786         } while (0)
  787         getbits(fd_in, 0);
  788         getbits(fd_ou, 1);
  789         getbits(fd_ex, 2);
  790 #undef  getbits
  791         if (nbufbytes != 0)
  792                 bzero(selbits, nbufbytes / 2);
  793 
  794         if (tvp != NULL) {
  795                 atv = *tvp;
  796                 if (itimerfix(&atv)) {
  797                         error = EINVAL;
  798                         goto done_nosellock;
  799                 }
  800                 getmicrouptime(&rtv);
  801                 timevaladd(&atv, &rtv);
  802         } else {
  803                 atv.tv_sec = 0;
  804                 atv.tv_usec = 0;
  805         }
  806         timo = 0;
  807         TAILQ_INIT(&td->td_selq);
  808         mtx_lock(&sellock);
  809 retry:
  810         ncoll = nselcoll;
  811         mtx_lock_spin(&sched_lock);
  812         td->td_flags |= TDF_SELECT;
  813         mtx_unlock_spin(&sched_lock);
  814         mtx_unlock(&sellock);
  815 
  816         error = selscan(td, ibits, obits, nd);
  817         mtx_lock(&sellock);
  818         if (error || td->td_retval[0])
  819                 goto done;
  820         if (atv.tv_sec || atv.tv_usec) {
  821                 getmicrouptime(&rtv);
  822                 if (timevalcmp(&rtv, &atv, >=))
  823                         goto done;
  824                 ttv = atv;
  825                 timevalsub(&ttv, &rtv);
  826                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  827                     24 * 60 * 60 * hz : tvtohz(&ttv);
  828         }
  829 
  830         /*
  831          * An event of interest may occur while we do not hold
  832          * sellock, so check TDF_SELECT and the number of
  833          * collisions and rescan the file descriptors if
  834          * necessary.
  835          */
  836         mtx_lock_spin(&sched_lock);
  837         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  838                 mtx_unlock_spin(&sched_lock);
  839                 goto retry;
  840         }
  841         mtx_unlock_spin(&sched_lock);
  842 
  843         if (timo > 0)
  844                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  845         else
  846                 error = cv_wait_sig(&selwait, &sellock);
  847         
  848         if (error == 0)
  849                 goto retry;
  850 
  851 done:
  852         clear_selinfo_list(td);
  853         mtx_lock_spin(&sched_lock);
  854         td->td_flags &= ~TDF_SELECT;
  855         mtx_unlock_spin(&sched_lock);
  856         mtx_unlock(&sellock);
  857 
  858 done_nosellock:
  859         /* select is not restarted after signals... */
  860         if (error == ERESTART)
  861                 error = EINTR;
  862         if (error == EWOULDBLOCK)
  863                 error = 0;
  864 #define putbits(name, x) \
  865         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  866                 error = error2;
  867         if (error == 0) {
  868                 int error2;
  869 
  870                 putbits(fd_in, 0);
  871                 putbits(fd_ou, 1);
  872                 putbits(fd_ex, 2);
  873 #undef putbits
  874         }
  875         if (selbits != &s_selbits[0])
  876                 free(selbits, M_SELECT);
  877 
  878         mtx_unlock(&Giant);
  879         return (error);
  880 }
  881 
      /*
       * Scan up to three descriptor bit sets and record which descriptors
       * are ready.
       *
       * td    - calling thread; its credentials and the thread itself are
       *         handed to fo_poll(), and the number of ready descriptors is
       *         stored in td->td_retval[0].
       * ibits - three input sets; a NULL entry is skipped.  Set 0 is polled
       *         with POLLRDNORM, set 1 with POLLWRNORM and set 2 with
       *         POLLRDBAND (see flag[] below).
       * obits - three output sets; the bit of every descriptor for which
       *         fo_poll() returns non-zero is set here.  Bits are only set,
       *         never cleared.  NOTE(review): assumes obits[msk] is valid
       *         whenever ibits[msk] is non-NULL and that the caller zeroed
       *         the output sets -- confirm against the caller.
       * nfd   - number of descriptors to examine.
       *
       * Returns 0 on success, or EBADF as soon as a set bit names a
       * descriptor that fget_locked() cannot resolve.  The descriptor table
       * lock is held for the whole scan and released on both return paths.
       */
  882 static int
  883 selscan(td, ibits, obits, nfd)
  884         struct thread *td;
  885         fd_mask **ibits, **obits;
  886         int nfd;
  887 {
  888         int msk, i, fd;
  889         fd_mask bits;
  890         struct file *fp;
  891         int n = 0;      /* count of (set, descriptor) pairs found ready */
  892         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  893         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  894         struct filedesc *fdp = td->td_proc->p_fd;
  895 
  896         FILEDESC_LOCK(fdp);
  897         for (msk = 0; msk < 3; msk++) {
  898                 if (ibits[msk] == NULL)
  899                         continue;
                      /* Walk the set one fd_mask word (NFDBITS descriptors) at a time. */
  900                 for (i = 0; i < nfd; i += NFDBITS) {
  901                         bits = ibits[msk][i/NFDBITS];
  902                         /* ffs(int mask) not portable, fd_mask is long */
                              /*
                               * Shift the word right one bit per descriptor; the
                               * loop ends early once no set bits remain in it.
                               */
  903                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  904                                 if (!(bits & 1))
  905                                         continue;
  906                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  907                                         FILEDESC_UNLOCK(fdp);
  908                                         return (EBADF);
  909                                 }
  910                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  911                                     td)) {
  912                                         obits[msk][(fd)/NFDBITS] |=
  913                                             ((fd_mask)1 << ((fd) % NFDBITS));
  914                                         n++;
  915                                 }
  916                         }
  917                 }
  918         }
  919         FILEDESC_UNLOCK(fdp);
  920         td->td_retval[0] = n;
  921         return (0);
  922 }
  923 
  924 /*
  925  * Poll system call.
       *
       * Copies in the caller's pollfd array, scans it with pollscan() and,
       * if nothing is ready, sleeps on selwait until an event, a signal or
       * the timeout, rescanning after each wakeup.  On success the whole
       * array (with revents filled in) is copied back out; the ready count
       * is whatever the last pollscan() left in td->td_retval[0].
       *
       * Returns 0 on success (including timeout), EINVAL for an oversized
       * nfds or a timeout rejected by itimerfix(), EINTR when the sleep is
       * interrupted (ERESTART is mapped to EINTR), or the error from
       * copyin()/copyout().
  926  */
  927 #ifndef _SYS_SYSPROTO_H_
  928 struct poll_args {
  929         struct pollfd *fds;     /* user array; copied in and back out */
  930         u_int   nfds;           /* number of entries in fds */
  931         int     timeout;        /* milliseconds, or INFTIM for no timeout */
  932 };
  933 #endif
  934 /*
  935  * MPSAFE
  936  */
  937 int
  938 poll(td, uap)
  939         struct thread *td;
  940         struct poll_args *uap;
  941 {
  942         caddr_t bits;
              /* On-stack buffer used when the array has at most 32 entries. */
  943         char smallbits[32 * sizeof(struct pollfd)];
              /* atv: absolute deadline (zero = none); rtv: now; ttv: time left. */
  944         struct timeval atv, rtv, ttv;
  945         int error = 0, timo;
  946         u_int ncoll, nfds;
  947         size_t ni;
  948 
  949         nfds = uap->nfds;
  950 
              /* Giant is held from here until done2. */
  951         mtx_lock(&Giant);
  952         /*
  953          * This is kinda bogus.  We have fd limits, but that is not
  954          * really related to the size of the pollfd array.  Make sure
  955          * we let the process use at least FD_SETSIZE entries and at
  956          * least enough for the current limits.  We want to be reasonably
  957          * safe, but not overly restrictive.
  958          */
  959         if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
  960             (nfds > FD_SETSIZE)) {
  961                 error = EINVAL;
  962                 goto done2;
  963         }
  964         ni = nfds * sizeof(struct pollfd);
  965         if (ni > sizeof(smallbits))
  966                 bits = malloc(ni, M_TEMP, M_WAITOK);
  967         else
  968                 bits = smallbits;
  969         error = copyin(uap->fds, bits, ni);
  970         if (error)
  971                 goto done_nosellock;
  972         if (uap->timeout != INFTIM) {
                      /* Convert milliseconds to a timeval, then to an uptime deadline. */
  973                 atv.tv_sec = uap->timeout / 1000;
  974                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  975                 if (itimerfix(&atv)) {
  976                         error = EINVAL;
  977                         goto done_nosellock;
  978                 }
  979                 getmicrouptime(&rtv);
  980                 timevaladd(&atv, &rtv);
  981         } else {
                      /* A zero atv means "no deadline" in the wait loop below. */
  982                 atv.tv_sec = 0;
  983                 atv.tv_usec = 0;
  984         }
  985         timo = 0;
  986         TAILQ_INIT(&td->td_selq);
  987         mtx_lock(&sellock);
  988 retry:
              /*
               * Sample the collision counter and mark the thread as selecting
               * before dropping sellock for the scan; both are re-checked
               * after the scan to detect events that raced with it.
               */
  989         ncoll = nselcoll;
  990         mtx_lock_spin(&sched_lock);
  991         td->td_flags |= TDF_SELECT;
  992         mtx_unlock_spin(&sched_lock);
  993         mtx_unlock(&sellock);
  994 
  995         error = pollscan(td, (struct pollfd *)bits, nfds);
  996         mtx_lock(&sellock);
              /* Finish on a scan error or as soon as any entry is ready. */
  997         if (error || td->td_retval[0])
  998                 goto done;
  999         if (atv.tv_sec || atv.tv_usec) {
 1000                 getmicrouptime(&rtv);
                      /* Deadline already reached: return without sleeping. */
 1001                 if (timevalcmp(&rtv, &atv, >=))
 1002                         goto done;
 1003                 ttv = atv;
 1004                 timevalsub(&ttv, &rtv);
                      /* Sleep for the time left, capped at 24 hours' worth of ticks. */
 1005                 timo = ttv.tv_sec > 24 * 60 * 60 ?
 1006                     24 * 60 * 60 * hz : tvtohz(&ttv);
 1007         }
 1008         /*
 1009          * An event of interest may occur while we do not hold
 1010          * sellock, so check TDF_SELECT and the number of collisions
 1011          * and rescan the file descriptors if necessary.
 1012          */
 1013         mtx_lock_spin(&sched_lock);
 1014         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
 1015                 mtx_unlock_spin(&sched_lock);
 1016                 goto retry;
 1017         }
 1018         mtx_unlock_spin(&sched_lock);
 1019 
              /* timo stays 0 when there is no deadline: sleep without a timeout. */
 1020         if (timo > 0)
 1021                 error = cv_timedwait_sig(&selwait, &sellock, timo);
 1022         else
 1023                 error = cv_wait_sig(&selwait, &sellock);
 1024 
              /* A zero return from the sleep means rescan; anything else ends the call. */
 1025         if (error == 0)
 1026                 goto retry;
 1027 
 1028 done:
              /* Reached with sellock held: detach from all recorded selinfos. */
 1029         clear_selinfo_list(td);
 1030         mtx_lock_spin(&sched_lock);
 1031         td->td_flags &= ~TDF_SELECT;
 1032         mtx_unlock_spin(&sched_lock);
 1033         mtx_unlock(&sellock);
 1034 
 1035 done_nosellock:
 1036         /* poll is not restarted after signals... */
 1037         if (error == ERESTART)
 1038                 error = EINTR;
              /* EWOULDBLOCK from the sleep is reported to the caller as success. */
 1039         if (error == EWOULDBLOCK)
 1040                 error = 0;
 1041         if (error == 0) {
 1042                 error = copyout(bits, uap->fds, ni);
 1043                 if (error)
 1044                         goto out;
 1045         }
 1046 out:
              /* Only a malloc()ed buffer needs freeing; smallbits is on the stack. */
 1047         if (ni > sizeof(smallbits))
 1048                 free(bits, M_TEMP);
 1049 done2:
 1050         mtx_unlock(&Giant);
 1051         return (error);
 1052 }
 1053 
      /*
       * Fill in revents for each of the nfd entries in fds[].
       *
       * td  - calling thread; its credentials and the thread itself are
       *       passed to fo_poll(), and the number of entries counted as
       *       ready is stored in td->td_retval[0].
       * fds - array of pollfd entries (already in kernel memory when called
       *       from poll()); revents is overwritten in every entry.
       * nfd - number of entries in fds.
       *
       * Per entry:
       *   fd >= fdp->fd_nfiles     revents = POLLNVAL, counted
       *   fd < 0                   revents = 0, not counted
       *   slot in fd_ofiles NULL   revents = POLLNVAL, counted
       *   otherwise                revents = fo_poll(...), counted if non-zero
       *
       * Always returns 0.  The descriptor table lock is held for the whole
       * scan.
       */
 1054 static int
 1055 pollscan(td, fds, nfd)
 1056         struct thread *td;
 1057         struct pollfd *fds;
 1058         u_int nfd;
 1059 {
 1060         register struct filedesc *fdp = td->td_proc->p_fd;
 1061         int i;
 1062         struct file *fp;
 1063         int n = 0;      /* entries with a non-zero revents */
 1064 
 1065         FILEDESC_LOCK(fdp);
 1066         for (i = 0; i < nfd; i++, fds++) {
                      /* The upper-bound test comes first; negative fds fall to the next branch. */
 1067                 if (fds->fd >= fdp->fd_nfiles) {
 1068                         fds->revents = POLLNVAL;
 1069                         n++;
 1070                 } else if (fds->fd < 0) {
 1071                         fds->revents = 0;
 1072                 } else {
 1073                         fp = fdp->fd_ofiles[fds->fd];
 1074                         if (fp == NULL) {
 1075                                 fds->revents = POLLNVAL;
 1076                                 n++;
 1077                         } else {
 1078                                 /*
 1079                                  * Note: backend also returns POLLHUP and
 1080                                  * POLLERR if appropriate.
 1081                                  */
 1082                                 fds->revents = fo_poll(fp, fds->events,
 1083                                     td->td_ucred, td);
 1084                                 if (fds->revents != 0)
 1085                                         n++;
 1086                         }
 1087                 }
 1088         }
 1089         FILEDESC_UNLOCK(fdp);
 1090         td->td_retval[0] = n;
 1091         return (0);
 1092 }
 1093 
 1094 /*
 1095  * OpenBSD poll system call.
 1096  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
       *
       * Implemented by casting the argument structure to struct poll_args
       * and calling poll() directly, so the result and error codes are
       * exactly those of poll().  This relies on openbsd_poll_args and
       * poll_args having the same layout; the fallback definitions in this
       * file declare the same three fields in the same order.
 1097  */
 1098 #ifndef _SYS_SYSPROTO_H_
 1099 struct openbsd_poll_args {
 1100         struct pollfd *fds;     /* user array of pollfd entries */
 1101         u_int   nfds;           /* number of entries in fds */
 1102         int     timeout;        /* passed through to poll() unchanged */
 1103 };
 1104 #endif
 1105 /*
 1106  * MPSAFE
 1107  */
 1108 int
 1109 openbsd_poll(td, uap)
 1110         register struct thread *td;
 1111         register struct openbsd_poll_args *uap;
 1112 {
 1113         return (poll(td, (struct poll_args *)uap));
 1114 }
 1115 
 1116 /*
 1117  * Remove the references to the thread from all of the objects
 1118  * we were polling.
 1119  *
 1120  * This code assumes that the underlying owner of the selinfo
 1121  * structure will hold sellock before it changes it, and that
 1122  * it will unlink itself from our list if it goes away.
       *
       * td - thread whose td_selq list is to be emptied.
       *
       * The caller must hold sellock (asserted below).  Every selinfo on
       * the list has its si_thread pointer cleared, then the list head is
       * reinitialized to empty; the selinfo structures themselves are not
       * freed or otherwise modified.
 1123  */
 1124 void
 1125 clear_selinfo_list(td)
 1126         struct thread *td;
 1127 {
 1128         struct selinfo *si;
 1129 
 1130         mtx_assert(&sellock, MA_OWNED);
 1131         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1132                 si->si_thread = NULL;
              /* Resetting the head drops all entries at once instead of unlinking each. */
 1133         TAILQ_INIT(&td->td_selq);
 1134 }
 1135 
 1136 /*
 1137  * Record a select request.
       *
       * selector - thread that is selecting/polling on the object.
       * sip      - selinfo belonging to the object being polled.
       *
       * Takes and releases sellock internally.  If sip has no recorded
       * thread, selector becomes its owner and sip is appended to
       * selector->td_selq.  If another thread already owns sip, only the
       * SI_COLL flag is set; doselwakeup() reacts to that flag by bumping
       * nselcoll and broadcasting on selwait.
 1138  */
 1139 void
 1140 selrecord(selector, sip)
 1141         struct thread *selector;
 1142         struct selinfo *sip;
 1143 {
 1144 
 1145         mtx_lock(&sellock);
 1146         /*
 1147          * If the selinfo's thread pointer is NULL then take ownership of it.
 1148          *
 1149          * If the thread pointer is not NULL and it points to another
 1150          * thread, then we have a collision.
 1151          *
 1152          * If the thread pointer is not NULL and points back to us then leave
 1153          * it alone as we've already pointed it at us and added it to
 1154          * our list.
 1155          */
 1156         if (sip->si_thread == NULL) {
 1157                 sip->si_thread = selector;
 1158                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1159         } else if (sip->si_thread != selector) {
 1160                 sip->si_flags |= SI_COLL;
 1161         }
 1162 
 1163         mtx_unlock(&sellock);
 1164 }
 1165 
 1166 /* Wake up a selecting thread. */
      /*
       * sip - selinfo of the object on which an event occurred.
       *
       * Thin wrapper around doselwakeup() that passes -1 as the priority.
       * doselwakeup() only changes the thread's priority when the value lies
       * within [PRI_MIN, PRI_MAX]; -1 is presumably outside that range so
       * the priority is left alone -- TODO confirm against PRI_MIN.
       */
 1167 void
 1168 selwakeup(sip)
 1169         struct selinfo *sip;
 1170 {
 1171         doselwakeup(sip, -1);
 1172 }
 1173 
 1174 /* Wake up a selecting thread, and set its priority. */
      /*
       * sip - selinfo of the object on which an event occurred.
       * pri - priority handed to doselwakeup(), which applies it to the
       *       woken thread only when it lies within [PRI_MIN, PRI_MAX] and
       *       is numerically lower than the thread's current td_priority.
       */
 1175 void
 1176 selwakeuppri(sip, pri)
 1177         struct selinfo *sip;
 1178         int pri;
 1179 {
 1180         doselwakeup(sip, pri);
 1181 }
 1182 
 1183 /*
 1184  * Do a wakeup when a selectable event occurs.
       *
       * sip - selinfo of the object on which the event occurred.
       * pri - priority to give the woken thread; ignored unless it lies
       *       within [PRI_MIN, PRI_MAX].
       *
       * Takes sellock for the whole operation and sched_lock around the
       * thread state changes.  Two things happen:
       *  1. If a collision was recorded on sip (SI_COLL), the global
       *     collision counter nselcoll is incremented, the flag is cleared
       *     and selwait is broadcast.
       *  2. If a thread is recorded in sip, sip is unlinked from that
       *     thread's td_selq and the thread is either made runnable (when
       *     it is sleeping on selwait) or has TDF_SELECT cleared, which
       *     poll() checks before sleeping in order to rescan.
 1185  */
 1186 static void
 1187 doselwakeup(sip, pri)
 1188         struct selinfo *sip;
 1189         int pri;
 1190 {
 1191         struct thread *td;
 1192 
 1193         mtx_lock(&sellock);
 1194         td = sip->si_thread;
 1195         if ((sip->si_flags & SI_COLL) != 0) {
 1196                 nselcoll++;
 1197                 sip->si_flags &= ~SI_COLL;
 1198                 cv_broadcastpri(&selwait, pri);
 1199         }
              /* No thread recorded on this selinfo: nothing further to wake. */
 1200         if (td == NULL) {
 1201                 mtx_unlock(&sellock);
 1202                 return;
 1203         }
              /* Detach sip from the recorded thread before touching its state. */
 1204         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1205         sip->si_thread = NULL;
 1206         mtx_lock_spin(&sched_lock);
 1207         if (td->td_wchan == &selwait) {
                      /* Thread is asleep on selwait: pull it off the queue and run it. */
 1208                 cv_waitq_remove(td);
 1209                 TD_CLR_SLEEPING(td);
                      /* Only ever replaces td_priority with a numerically lower value. */
 1210                 if (pri >= PRI_MIN && pri <= PRI_MAX && td->td_priority > pri)
 1211                         td->td_priority = pri;
 1212                 setrunnable(td);
 1213         } else
                      /* Not asleep on selwait: clear the flag so the scanner notices. */
 1214                 td->td_flags &= ~TDF_SELECT;
 1215         mtx_unlock_spin(&sched_lock);
 1216         mtx_unlock(&sellock);
 1217 }
 1218 
 1219 static void selectinit(void *);
      /* Register selectinit() to be run with a NULL argument at SI_SUB_LOCK, SI_ORDER_FIRST. */
 1220 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1221 
      /*
       * Initialize the synchronization objects shared by select/poll and the
       * wakeup routines in this file: the condition variable selwait and the
       * mutex sellock.
       *
       * dummy - unused.
       */
 1222 /* ARGSUSED*/
 1223 static void
 1224 selectinit(dummy)
 1225         void *dummy;
 1226 {
 1227         cv_init(&selwait, "select");
 1228         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1229 }

Cache object: 9e4fe40d49018cf818d1cc0cf190a9f1


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.