The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   39  * $FreeBSD: releng/5.1/sys/kern/sys_generic.c 114216 2003-04-29 13:36:06Z kan $
   40  */
   41 
   42 #include "opt_ktrace.h"
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/sysproto.h>
   47 #include <sys/filedesc.h>
   48 #include <sys/filio.h>
   49 #include <sys/fcntl.h>
   50 #include <sys/file.h>
   51 #include <sys/proc.h>
   52 #include <sys/signalvar.h>
   53 #include <sys/socketvar.h>
   54 #include <sys/uio.h>
   55 #include <sys/kernel.h>
   56 #include <sys/limits.h>
   57 #include <sys/malloc.h>
   58 #include <sys/poll.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/selinfo.h>
   61 #include <sys/syscallsubr.h>
   62 #include <sys/sysctl.h>
   63 #include <sys/sysent.h>
   64 #include <sys/bio.h>
   65 #include <sys/buf.h>
   66 #include <sys/condvar.h>
   67 #ifdef KTRACE
   68 #include <sys/ktrace.h>
   69 #endif
   70 #include <vm/vm.h>
   71 #include <vm/vm_page.h>
   72 
   /*
    * Malloc types used below: M_IOCTLOPS for ioctl() argument buffers that
    * exceed the on-stack buffer, M_SELECT for select() bit buffers that
    * exceed the on-stack array, and M_IOV (the only non-static one) for the
    * iovec arrays readv()/writev() allocate when iovcnt > UIO_SMALLIOV.
    */
   73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   76 
      /* Prototypes for helpers private to this file. */
   77 static int      pollscan(struct thread *, struct pollfd *, u_int);
   78 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   79 static int      dofileread(struct thread *, struct file *, int, void *,
   80                     size_t, off_t, int);
   81 static int      dofilewrite(struct thread *, struct file *, int,
   82                     const void *, size_t, off_t, int);
   83 
   /*
    * read(2) system call entry point.
    */
   #ifndef _SYS_SYSPROTO_H_
   struct read_args {
           int     fd;
           void    *buf;
           size_t  nbyte;
   };
   #endif
   /*
    * MPSAFE
    *
    * Looks the descriptor up for reading and forwards the request to
    * dofileread() with an offset of (off_t)-1 and no flags, i.e. without
    * supplying an explicit file position.  The file reference obtained
    * from fget_read() is dropped before returning.
    *
    * Returns 0 or an errno value; dofileread() stores the byte count in
    * td->td_retval[0].
    */
   int
   read(struct thread *td, struct read_args *uap)
   {
           struct file *fp;
           int error;

           error = fget_read(td, uap->fd, &fp);
           if (error != 0)
                   return (error);

           error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte,
               (off_t)-1, 0);
           fdrop(fp, td);
           return (error);
   }
  112 
  113 /*
  114  * Pread system call
  115  */
  116 #ifndef _SYS_SYSPROTO_H_
  117 struct pread_args {
  118         int     fd;
  119         void    *buf;
  120         size_t  nbyte;
  121         int     pad;
  122         off_t   offset;
  123 };
  124 #endif
  125 /*
  126  * MPSAFE
  127  */
  128 int
  129 pread(td, uap)
  130         struct thread *td;
  131         struct pread_args *uap;
  132 {
  133         struct file *fp;
  134         int error;
  135 
  136         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  137                 return (error);
  138         if (fp->f_type != DTYPE_VNODE) {
  139                 error = ESPIPE;
  140         } else {
  141                 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 
  142                             uap->offset, FOF_OFFSET);
  143         }
  144         fdrop(fp, td);
  145         return(error);
  146 }
  147 
  148 /*
  149  * Code common for read and pread
  150  */
  151 static int
  152 dofileread(td, fp, fd, buf, nbyte, offset, flags)
  153         struct thread *td;
  154         struct file *fp;
  155         int fd, flags;
  156         void *buf;
  157         size_t nbyte;
  158         off_t offset;
  159 {
  160         struct uio auio;
  161         struct iovec aiov;
  162         long cnt, error = 0;
  163 #ifdef KTRACE
  164         struct iovec ktriov;
  165         struct uio ktruio;
  166         int didktr = 0;
  167 #endif
  168 
  169         aiov.iov_base = buf;
  170         aiov.iov_len = nbyte;
  171         auio.uio_iov = &aiov;
  172         auio.uio_iovcnt = 1;
  173         auio.uio_offset = offset;
  174         if (nbyte > INT_MAX)
  175                 return (EINVAL);
  176         auio.uio_resid = nbyte;
  177         auio.uio_rw = UIO_READ;
  178         auio.uio_segflg = UIO_USERSPACE;
  179         auio.uio_td = td;
  180 #ifdef KTRACE
  181         /*
  182          * if tracing, save a copy of iovec
  183          */
  184         if (KTRPOINT(td, KTR_GENIO)) {
  185                 ktriov = aiov;
  186                 ktruio = auio;
  187                 didktr = 1;
  188         }
  189 #endif
  190         cnt = nbyte;
  191 
  192         if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
  193                 if (auio.uio_resid != cnt && (error == ERESTART ||
  194                     error == EINTR || error == EWOULDBLOCK))
  195                         error = 0;
  196         }
  197         cnt -= auio.uio_resid;
  198 #ifdef KTRACE
  199         if (didktr && error == 0) {
  200                 ktruio.uio_iov = &ktriov;
  201                 ktruio.uio_resid = cnt;
  202                 ktrgenio(fd, UIO_READ, &ktruio, error);
  203         }
  204 #endif
  205         td->td_retval[0] = cnt;
  206         return (error);
  207 }
  208 
  209 /*
  210  * Scatter read system call.
  211  */
  212 #ifndef _SYS_SYSPROTO_H_
  213 struct readv_args {
  214         int     fd;
  215         struct  iovec *iovp;
  216         u_int   iovcnt;
  217 };
  218 #endif
  219 /*
  220  * MPSAFE
  221  */
  222 int
  223 readv(td, uap)
  224         struct thread *td;
  225         struct readv_args *uap;
  226 {
  227         struct file *fp;
  228         struct uio auio;
  229         struct iovec *iov;
  230         struct iovec *needfree;
  231         struct iovec aiov[UIO_SMALLIOV];
  232         long i, cnt;
  233         int error;
  234         u_int iovlen;
  235 #ifdef KTRACE
  236         struct iovec *ktriov = NULL;
  237         struct uio ktruio;
  238 #endif
  239 
  240         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  241                 return (error);
  242         needfree = NULL;
  243         /* note: can't use iovlen until iovcnt is validated */
  244         iovlen = uap->iovcnt * sizeof (struct iovec);
  245         if (uap->iovcnt > UIO_SMALLIOV) {
  246                 if (uap->iovcnt > UIO_MAXIOV) {
  247                         error = EINVAL;
  248                         goto done;
  249                 }
  250                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  251                 needfree = iov;
  252         } else
  253                 iov = aiov;
  254         auio.uio_iov = iov;
  255         auio.uio_iovcnt = uap->iovcnt;
  256         auio.uio_rw = UIO_READ;
  257         auio.uio_segflg = UIO_USERSPACE;
  258         auio.uio_td = td;
  259         auio.uio_offset = -1;
  260         if ((error = copyin(uap->iovp, iov, iovlen)))
  261                 goto done;
  262         auio.uio_resid = 0;
  263         for (i = 0; i < uap->iovcnt; i++) {
  264                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  265                         error = EINVAL;
  266                         goto done;
  267                 }
  268                 auio.uio_resid += iov->iov_len;
  269                 iov++;
  270         }
  271 #ifdef KTRACE
  272         /*
  273          * if tracing, save a copy of iovec
  274          */
  275         if (KTRPOINT(td, KTR_GENIO))  {
  276                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  277                 bcopy(auio.uio_iov, ktriov, iovlen);
  278                 ktruio = auio;
  279         }
  280 #endif
  281         cnt = auio.uio_resid;
  282         if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
  283                 if (auio.uio_resid != cnt && (error == ERESTART ||
  284                     error == EINTR || error == EWOULDBLOCK))
  285                         error = 0;
  286         }
  287         cnt -= auio.uio_resid;
  288 #ifdef KTRACE
  289         if (ktriov != NULL) {
  290                 if (error == 0) {
  291                         ktruio.uio_iov = ktriov;
  292                         ktruio.uio_resid = cnt;
  293                         ktrgenio(uap->fd, UIO_READ, &ktruio, error);
  294                 }
  295                 FREE(ktriov, M_TEMP);
  296         }
  297 #endif
  298         td->td_retval[0] = cnt;
  299 done:
  300         fdrop(fp, td);
  301         if (needfree)
  302                 FREE(needfree, M_IOV);
  303         return (error);
  304 }
  305 
  /*
   * write(2) system call entry point.
   */
  #ifndef _SYS_SYSPROTO_H_
  struct write_args {
          int     fd;
          const void *buf;
          size_t  nbyte;
  };
  #endif
  /*
   * MPSAFE
   *
   * Looks the descriptor up for writing and forwards the request to
   * dofilewrite() with an offset of (off_t)-1 and no flags.
   *
   * Every fget_write() failure is reported to the caller as EBADF,
   * whatever error fget_write() itself returned (XXX this can't be right).
   *
   * Returns 0 or an errno value; dofilewrite() stores the byte count in
   * td->td_retval[0].
   */
  int
  write(struct thread *td, struct write_args *uap)
  {
          struct file *fp;
          int error;

          if (fget_write(td, uap->fd, &fp) != 0)
                  return (EBADF);

          error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
              (off_t)-1, 0);
          fdrop(fp, td);
          return (error);
  }
  336 
  337 /*
  338  * Pwrite system call
  339  */
  340 #ifndef _SYS_SYSPROTO_H_
  341 struct pwrite_args {
  342         int     fd;
  343         const void *buf;
  344         size_t  nbyte;
  345         int     pad;
  346         off_t   offset;
  347 };
  348 #endif
  349 /*
  350  * MPSAFE
  351  */
  352 int
  353 pwrite(td, uap)
  354         struct thread *td;
  355         struct pwrite_args *uap;
  356 {
  357         struct file *fp;
  358         int error;
  359 
  360         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  361                 if (fp->f_type == DTYPE_VNODE) {
  362                         error = dofilewrite(td, fp, uap->fd, uap->buf,
  363                                     uap->nbyte, uap->offset, FOF_OFFSET);
  364                 } else {
  365                         error = ESPIPE;
  366                 }
  367                 fdrop(fp, td);
  368         } else {
  369                 error = EBADF;  /* this can't be right */
  370         }
  371         return(error);
  372 }
  373 
  374 static int
  375 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
  376         struct thread *td;
  377         struct file *fp;
  378         int fd, flags;
  379         const void *buf;
  380         size_t nbyte;
  381         off_t offset;
  382 {
  383         struct uio auio;
  384         struct iovec aiov;
  385         long cnt, error = 0;
  386 #ifdef KTRACE
  387         struct iovec ktriov;
  388         struct uio ktruio;
  389         int didktr = 0;
  390 #endif
  391 
  392         aiov.iov_base = (void *)(uintptr_t)buf;
  393         aiov.iov_len = nbyte;
  394         auio.uio_iov = &aiov;
  395         auio.uio_iovcnt = 1;
  396         auio.uio_offset = offset;
  397         if (nbyte > INT_MAX)
  398                 return (EINVAL);
  399         auio.uio_resid = nbyte;
  400         auio.uio_rw = UIO_WRITE;
  401         auio.uio_segflg = UIO_USERSPACE;
  402         auio.uio_td = td;
  403 #ifdef KTRACE
  404         /*
  405          * if tracing, save a copy of iovec and uio
  406          */
  407         if (KTRPOINT(td, KTR_GENIO)) {
  408                 ktriov = aiov;
  409                 ktruio = auio;
  410                 didktr = 1;
  411         }
  412 #endif
  413         cnt = nbyte;
  414         if (fp->f_type == DTYPE_VNODE)
  415                 bwillwrite();
  416         if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
  417                 if (auio.uio_resid != cnt && (error == ERESTART ||
  418                     error == EINTR || error == EWOULDBLOCK))
  419                         error = 0;
  420                 /* Socket layer is responsible for issuing SIGPIPE. */
  421                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
  422                         PROC_LOCK(td->td_proc);
  423                         psignal(td->td_proc, SIGPIPE);
  424                         PROC_UNLOCK(td->td_proc);
  425                 }
  426         }
  427         cnt -= auio.uio_resid;
  428 #ifdef KTRACE
  429         if (didktr && error == 0) {
  430                 ktruio.uio_iov = &ktriov;
  431                 ktruio.uio_resid = cnt;
  432                 ktrgenio(fd, UIO_WRITE, &ktruio, error);
  433         }
  434 #endif
  435         td->td_retval[0] = cnt;
  436         return (error);
  437 }
  438 
  439 /*
  440  * Gather write system call
  441  */
  442 #ifndef _SYS_SYSPROTO_H_
  443 struct writev_args {
  444         int     fd;
  445         struct  iovec *iovp;
  446         u_int   iovcnt;
  447 };
  448 #endif
  449 /*
  450  * MPSAFE
  451  */
  452 int
  453 writev(td, uap)
  454         struct thread *td;
  455         register struct writev_args *uap;
  456 {
  457         struct file *fp;
  458         struct uio auio;
  459         register struct iovec *iov;
  460         struct iovec *needfree;
  461         struct iovec aiov[UIO_SMALLIOV];
  462         long i, cnt, error = 0;
  463         u_int iovlen;
  464 #ifdef KTRACE
  465         struct iovec *ktriov = NULL;
  466         struct uio ktruio;
  467 #endif
  468 
  469         mtx_lock(&Giant);
  470         if ((error = fget_write(td, uap->fd, &fp)) != 0) {
  471                 error = EBADF;
  472                 goto done2;
  473         }
  474         /* note: can't use iovlen until iovcnt is validated */
  475         iovlen = uap->iovcnt * sizeof (struct iovec);
  476         if (uap->iovcnt > UIO_SMALLIOV) {
  477                 if (uap->iovcnt > UIO_MAXIOV) {
  478                         needfree = NULL;
  479                         error = EINVAL;
  480                         goto done;
  481                 }
  482                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  483                 needfree = iov;
  484         } else {
  485                 iov = aiov;
  486                 needfree = NULL;
  487         }
  488         auio.uio_iov = iov;
  489         auio.uio_iovcnt = uap->iovcnt;
  490         auio.uio_rw = UIO_WRITE;
  491         auio.uio_segflg = UIO_USERSPACE;
  492         auio.uio_td = td;
  493         auio.uio_offset = -1;
  494         if ((error = copyin(uap->iovp, iov, iovlen)))
  495                 goto done;
  496         auio.uio_resid = 0;
  497         for (i = 0; i < uap->iovcnt; i++) {
  498                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  499                         error = EINVAL;
  500                         goto done;
  501                 }
  502                 auio.uio_resid += iov->iov_len;
  503                 iov++;
  504         }
  505 #ifdef KTRACE
  506         /*
  507          * if tracing, save a copy of iovec and uio
  508          */
  509         if (KTRPOINT(td, KTR_GENIO))  {
  510                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  511                 bcopy(auio.uio_iov, ktriov, iovlen);
  512                 ktruio = auio;
  513         }
  514 #endif
  515         cnt = auio.uio_resid;
  516         if (fp->f_type == DTYPE_VNODE)
  517                 bwillwrite();
  518         if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
  519                 if (auio.uio_resid != cnt && (error == ERESTART ||
  520                     error == EINTR || error == EWOULDBLOCK))
  521                         error = 0;
  522                 if (error == EPIPE) {
  523                         PROC_LOCK(td->td_proc);
  524                         psignal(td->td_proc, SIGPIPE);
  525                         PROC_UNLOCK(td->td_proc);
  526                 }
  527         }
  528         cnt -= auio.uio_resid;
  529 #ifdef KTRACE
  530         if (ktriov != NULL) {
  531                 if (error == 0) {
  532                         ktruio.uio_iov = ktriov;
  533                         ktruio.uio_resid = cnt;
  534                         ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
  535                 }
  536                 FREE(ktriov, M_TEMP);
  537         }
  538 #endif
  539         td->td_retval[0] = cnt;
  540 done:
  541         fdrop(fp, td);
  542         if (needfree)
  543                 FREE(needfree, M_IOV);
  544 done2:
  545         mtx_unlock(&Giant);
  546         return (error);
  547 }
  548 
  549 /*
  550  * Ioctl system call
       *
       * Validates the descriptor, services FIONCLEX/FIOCLEX directly on the
       * descriptor table, and for every other command stages the argument in
       * a kernel buffer sized by IOCPARM_LEN(com) before dispatching through
       * fo_ioctl().  Returns 0 or an errno value.
  551  */
  552 #ifndef _SYS_SYSPROTO_H_
  553 struct ioctl_args {
  554         int     fd;
  555         u_long  com;
  556         caddr_t data;
  557 };
  558 #endif
  559 /*
  560  * MPSAFE
       * (Giant is taken right after the descriptor lookup and released on
       * every later return path.)
  561  */
  562 /* ARGSUSED */
  563 int
  564 ioctl(td, uap)
  565         struct thread *td;
  566         register struct ioctl_args *uap;
  567 {
  568         struct file *fp;
  569         register struct filedesc *fdp;
  570         register u_long com;
  571         int error = 0;
  572         register u_int size;
  573         caddr_t data, memp;
  574         int tmp;
              /*
               * Arguments of up to STK_PARAMS bytes are staged in ubuf on the
               * stack; larger ones are malloc'ed.  The long member presumably
               * exists only to force suitable alignment of the buffer.
               */
  575 #define STK_PARAMS      128
  576         union {
  577             char stkbuf[STK_PARAMS];
  578             long align;
  579         } ubuf;
  580 
  581         if ((error = fget(td, uap->fd, &fp)) != 0)
  582                 return (error);
  583         mtx_lock(&Giant);
              /* The file must be open for reading or writing. */
  584         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  585                 fdrop(fp, td);
  586                 mtx_unlock(&Giant);
  587                 return (EBADF);
  588         }
  589         fdp = td->td_proc->p_fd;
              /*
               * The close-on-exec commands only touch the per-descriptor flag
               * word, so they are completed here without calling fo_ioctl().
               */
  590         switch (com = uap->com) {
  591         case FIONCLEX:
  592                 FILEDESC_LOCK(fdp);
  593                 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
  594                 FILEDESC_UNLOCK(fdp);
  595                 fdrop(fp, td);
  596                 mtx_unlock(&Giant);
  597                 return (0);
  598         case FIOCLEX:
  599                 FILEDESC_LOCK(fdp);
  600                 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
  601                 FILEDESC_UNLOCK(fdp);
  602                 fdrop(fp, td);
  603                 mtx_unlock(&Giant);
  604                 return (0);
  605         }
  606 
  607         /*
  608          * Interpret high order word to find amount of data to be
  609          * copied to/from the user's address space.
               * Sizes above IOCPARM_MAX are rejected with ENOTTY.
  610          */
  611         size = IOCPARM_LEN(com);
  612         if (size > IOCPARM_MAX) {
  613                 fdrop(fp, td);
  614                 mtx_unlock(&Giant);
  615                 return (ENOTTY);
  616         }
  617 
              /* memp stays NULL when the stack buffer is used; it is freed below. */
  618         memp = NULL;
  619         if (size > sizeof (ubuf.stkbuf)) {
  620                 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  621                 data = memp;
  622         } else {
  623                 data = ubuf.stkbuf;
  624         }
              /*
               * Stage the argument: IOC_IN copies size bytes in from user
               * space; with a zero size (or for IOC_VOID) the user pointer
               * value itself is stored in the buffer instead.
               */
  625         if (com&IOC_IN) {
  626                 if (size) {
  627                         error = copyin(uap->data, data, (u_int)size);
  628                         if (error) {
  629                                 if (memp)
  630                                         free(memp, M_IOCTLOPS);
  631                                 fdrop(fp, td);
  632                                 goto done;
  633                         }
  634                 } else {
  635                         *(caddr_t *)data = uap->data;
  636                 }
  637         } else if ((com&IOC_OUT) && size) {
  638                 /*
  639                  * Zero the buffer so the user always
  640                  * gets back something deterministic.
  641                  */
  642                 bzero(data, size);
  643         } else if (com&IOC_VOID) {
  644                 *(caddr_t *)data = uap->data;
  645         }
  646 
  647         switch (com) {
  648 
              /*
               * FIONBIO and FIOASYNC first update the matching f_flag bit
               * under the file lock, then hand the int value to fo_ioctl().
               * Nothing is copied back out for these two commands.
               */
  649         case FIONBIO:
  650                 FILE_LOCK(fp);
  651                 if ((tmp = *(int *)data))
  652                         fp->f_flag |= FNONBLOCK;
  653                 else
  654                         fp->f_flag &= ~FNONBLOCK;
  655                 FILE_UNLOCK(fp);
  656                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  657                 break;
  658 
  659         case FIOASYNC:
  660                 FILE_LOCK(fp);
  661                 if ((tmp = *(int *)data))
  662                         fp->f_flag |= FASYNC;
  663                 else
  664                         fp->f_flag &= ~FASYNC;
  665                 FILE_UNLOCK(fp);
  666                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  667                 break;
  668 
  669         default:
  670                 error = fo_ioctl(fp, com, data, td->td_ucred, td);
  671                 /*
  672                  * Copy any data to user, size was
  673                  * already set and checked above.
  674                  */
  675                 if (error == 0 && (com&IOC_OUT) && size)
  676                         error = copyout(data, uap->data, (u_int)size);
  677                 break;
  678         }
              /* Common exit: release the heap buffer (if any) and the file. */
  679         if (memp)
  680                 free(memp, M_IOCTLOPS);
  681         fdrop(fp, td);
  682 done:
  683         mtx_unlock(&Giant);
  684         return (error);
  685 }
  686 
  687 /*
  688  * sellock and selwait are initialized in selectinit() via SYSINIT.
       *
       * kern_select() holds sellock while it updates TDF_SELECT, while it
       * compares nselcoll against its saved snapshot, and while it sleeps
       * on the selwait condition variable.  nselcoll is exported read-only
       * through the sysctl declared below.
  689  */
  690 struct mtx      sellock;
  691 struct cv       selwait;
  692 u_int           nselcoll;       /* Select collisions since boot */
  693 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  694 
  695 /*
  696  * Select system call.
  697  */
  698 #ifndef _SYS_SYSPROTO_H_
  699 struct select_args {
  700         int     nd;
  701         fd_set  *in, *ou, *ex;
  702         struct  timeval *tv;
  703 };
  704 #endif
  705 /*
  706  * MPSAFE
  707  */
  708 int
  709 select(td, uap)
  710         register struct thread *td;
  711         register struct select_args *uap;
  712 {
  713         struct timeval tv, *tvp;
  714         int error;
  715 
  716         if (uap->tv != NULL) {
  717                 error = copyin(uap->tv, &tv, sizeof(tv));
  718                 if (error)
  719                         return (error);
  720                 tvp = &tv;
  721         } else
  722                 tvp = NULL;
  723 
  724         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  725 }
  726 
  /*
   * Kernel implementation of select().
   *
   * nd is the number of descriptors to examine; fd_in, fd_ou and fd_ex are
   * user-space fd_set pointers (any of them may be NULL) and tvp is an
   * optional timeout already copied into kernel space (NULL means none).
   *
   * Returns 0 or an errno value.  On success the result sets are copied
   * back over the user's fd_sets; the ready count is left in
   * td->td_retval[0] by selscan().  ERESTART is turned into EINTR and
   * EWOULDBLOCK into success before returning.  Giant is held for nearly
   * the whole call.
   */
  727 int
  728 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  729     fd_set *fd_ex, struct timeval *tvp)
  730 {
  731         struct filedesc *fdp;
  732         /*
  733          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  734          * infds with the new FD_SETSIZE of 1024, and more than enough for
  735          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  736          * of 256.
  737          */
  738         fd_mask s_selbits[howmany(2048, NFDBITS)];
  739         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  740         struct timeval atv, rtv, ttv;
  741         int error, timo;
  742         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  743 
  744         if (nd < 0)
  745                 return (EINVAL);
  746         fdp = td->td_proc->p_fd;
  747         mtx_lock(&Giant);
  748         FILEDESC_LOCK(fdp);
  749 
              /* Clamp nd to the size of the descriptor table instead of failing. */
  750         if (nd > td->td_proc->p_fd->fd_nfiles)
  751                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  752         FILEDESC_UNLOCK(fdp);
  753 
  754         /*
  755          * Allocate just enough bits for the non-null fd_sets.  Use the
  756          * preallocated auto buffer if possible.
               * Each non-null set needs 2 * ncpbytes: one copy for the input
               * bits and one for the output bits.
  757          */
  758         nfdbits = roundup(nd, NFDBITS);
  759         ncpbytes = nfdbits / NBBY;
  760         nbufbytes = 0;
  761         if (fd_in != NULL)
  762                 nbufbytes += 2 * ncpbytes;
  763         if (fd_ou != NULL)
  764                 nbufbytes += 2 * ncpbytes;
  765         if (fd_ex != NULL)
  766                 nbufbytes += 2 * ncpbytes;
  767         if (nbufbytes <= sizeof s_selbits)
  768                 selbits = &s_selbits[0];
  769         else
  770                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  771 
  772         /*
  773          * Assign pointers into the bit buffers and fetch the input bits.
  774          * Put the output buffers together so that they can be bzeroed
  775          * together.
               * The output sets occupy the first half of selbits and the
               * input sets the second half.  A copyin() failure inside
               * getbits() jumps straight to done_nosellock.
  776          */
  777         sbp = selbits;
  778 #define getbits(name, x) \
  779         do {                                                            \
  780                 if (name == NULL)                                       \
  781                         ibits[x] = NULL;                                \
  782                 else {                                                  \
  783                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  784                         obits[x] = sbp;                                 \
  785                         sbp += ncpbytes / sizeof *sbp;                  \
  786                         error = copyin(name, ibits[x], ncpbytes);       \
  787                         if (error != 0)                                 \
  788                                 goto done_nosellock;                    \
  789                 }                                                       \
  790         } while (0)
  791         getbits(fd_in, 0);
  792         getbits(fd_ou, 1);
  793         getbits(fd_ex, 2);
  794 #undef  getbits
  795         if (nbufbytes != 0)
  796                 bzero(selbits, nbufbytes / 2);
  797 
              /*
               * Turn the relative timeout into an absolute uptime deadline in
               * atv.  An all-zero atv (no timeout supplied) makes the code
               * below skip the deadline check and sleep without a timeout.
               */
  798         if (tvp != NULL) {
  799                 atv = *tvp;
  800                 if (itimerfix(&atv)) {
  801                         error = EINVAL;
  802                         goto done_nosellock;
  803                 }
  804                 getmicrouptime(&rtv);
  805                 timevaladd(&atv, &rtv);
  806         } else {
  807                 atv.tv_sec = 0;
  808                 atv.tv_usec = 0;
  809         }
  810         timo = 0;
  811         TAILQ_INIT(&td->td_selq);
  812         mtx_lock(&sellock);
              /*
               * Scan loop: snapshot the collision counter, mark the thread
               * with TDF_SELECT, then scan the descriptors with sellock
               * released.
               */
  813 retry:
  814         ncoll = nselcoll;
  815         mtx_lock_spin(&sched_lock);
  816         td->td_flags |= TDF_SELECT;
  817         mtx_unlock_spin(&sched_lock);
  818         mtx_unlock(&sellock);
  819 
  820         error = selscan(td, ibits, obits, nd);
  821         mtx_lock(&sellock);
              /* Finished on error or as soon as any descriptor is ready. */
  822         if (error || td->td_retval[0])
  823                 goto done;
              /*
               * With a deadline set, stop once it has passed; otherwise
               * compute the sleep time in ticks, capped at 24 hours.
               */
  824         if (atv.tv_sec || atv.tv_usec) {
  825                 getmicrouptime(&rtv);
  826                 if (timevalcmp(&rtv, &atv, >=))
  827                         goto done;
  828                 ttv = atv;
  829                 timevalsub(&ttv, &rtv);
  830                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  831                     24 * 60 * 60 * hz : tvtohz(&ttv);
  832         }
  833 
  834         /*
  835          * An event of interest may occur while we do not hold
  836          * sellock, so check TDF_SELECT and the number of
  837          * collisions and rescan the file descriptors if
  838          * necessary.
  839          */
  840         mtx_lock_spin(&sched_lock);
  841         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  842                 mtx_unlock_spin(&sched_lock);
  843                 goto retry;
  844         }
  845         mtx_unlock_spin(&sched_lock);
  846 
              /* Sleep on selwait; a zero return means "woken up, rescan". */
  847         if (timo > 0)
  848                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  849         else
  850                 error = cv_wait_sig(&selwait, &sellock);
  851         
  852         if (error == 0)
  853                 goto retry;
  854 
              /* Reached with sellock held: undo the select bookkeeping. */
  855 done:
  856         clear_selinfo_list(td);
  857         mtx_lock_spin(&sched_lock);
  858         td->td_flags &= ~TDF_SELECT;
  859         mtx_unlock_spin(&sched_lock);
  860         mtx_unlock(&sellock);
  861 
  862 done_nosellock:
  863         /* select is not restarted after signals... */
  864         if (error == ERESTART)
  865                 error = EINTR;
              /* EWOULDBLOCK is reported to the caller as success. */
  866         if (error == EWOULDBLOCK)
  867                 error = 0;
              /*
               * On success copy each requested result set back to user space;
               * a copyout() failure becomes the return value.
               */
  868 #define putbits(name, x) \
  869         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  870                 error = error2;
  871         if (error == 0) {
  872                 int error2;
  873 
  874                 putbits(fd_in, 0);
  875                 putbits(fd_ou, 1);
  876                 putbits(fd_ex, 2);
  877 #undef putbits
  878         }
              /* Only free the bit buffer if it did not fit on the stack. */
  879         if (selbits != &s_selbits[0])
  880                 free(selbits, M_SELECT);
  881 
  882         mtx_unlock(&Giant);
  883         return (error);
  884 }
  885 
  886 static int
  887 selscan(td, ibits, obits, nfd)
  888         struct thread *td;
  889         fd_mask **ibits, **obits;
  890         int nfd;
  891 {
  892         int msk, i, fd;
  893         fd_mask bits;
  894         struct file *fp;
  895         int n = 0;
  896         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  897         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  898         struct filedesc *fdp = td->td_proc->p_fd;
  899 
  900         FILEDESC_LOCK(fdp);
  901         for (msk = 0; msk < 3; msk++) {
  902                 if (ibits[msk] == NULL)
  903                         continue;
  904                 for (i = 0; i < nfd; i += NFDBITS) {
  905                         bits = ibits[msk][i/NFDBITS];
  906                         /* ffs(int mask) not portable, fd_mask is long */
  907                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  908                                 if (!(bits & 1))
  909                                         continue;
  910                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  911                                         FILEDESC_UNLOCK(fdp);
  912                                         return (EBADF);
  913                                 }
  914                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  915                                     td)) {
  916                                         obits[msk][(fd)/NFDBITS] |=
  917                                             ((fd_mask)1 << ((fd) % NFDBITS));
  918                                         n++;
  919                                 }
  920                         }
  921                 }
  922         }
  923         FILEDESC_UNLOCK(fdp);
  924         td->td_retval[0] = n;
  925         return (0);
  926 }
  927 
  928 /*
  929  * Poll system call.
  930  */
  931 #ifndef _SYS_SYSPROTO_H_
  932 struct poll_args {
  933         struct pollfd *fds;
  934         u_int   nfds;
  935         int     timeout;
  936 };
  937 #endif
  938 /*
  939  * MPSAFE
  940  */
  941 int
  942 poll(td, uap)
  943         struct thread *td;
  944         struct poll_args *uap;
  945 {
  946         caddr_t bits;
  947         char smallbits[32 * sizeof(struct pollfd)];
  948         struct timeval atv, rtv, ttv;
  949         int error = 0, timo;
  950         u_int ncoll, nfds;
  951         size_t ni;
  952 
  953         nfds = uap->nfds;
  954 
  955         mtx_lock(&Giant);
  956         /*
  957          * This is kinda bogus.  We have fd limits, but that is not
  958          * really related to the size of the pollfd array.  Make sure
  959          * we let the process use at least FD_SETSIZE entries and at
  960          * least enough for the current limits.  We want to be reasonably
  961          * safe, but not overly restrictive.
  962          */
  963         if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
  964             (nfds > FD_SETSIZE)) {
  965                 error = EINVAL;
  966                 goto done2;
  967         }
  968         ni = nfds * sizeof(struct pollfd);
  969         if (ni > sizeof(smallbits))
  970                 bits = malloc(ni, M_TEMP, M_WAITOK);
  971         else
  972                 bits = smallbits;
  973         error = copyin(uap->fds, bits, ni);
  974         if (error)
  975                 goto done_nosellock;
  976         if (uap->timeout != INFTIM) {
  977                 atv.tv_sec = uap->timeout / 1000;
  978                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  979                 if (itimerfix(&atv)) {
  980                         error = EINVAL;
  981                         goto done_nosellock;
  982                 }
  983                 getmicrouptime(&rtv);
  984                 timevaladd(&atv, &rtv);
  985         } else {
  986                 atv.tv_sec = 0;
  987                 atv.tv_usec = 0;
  988         }
  989         timo = 0;
  990         TAILQ_INIT(&td->td_selq);
  991         mtx_lock(&sellock);
  992 retry:
  993         ncoll = nselcoll;
  994         mtx_lock_spin(&sched_lock);
  995         td->td_flags |= TDF_SELECT;
  996         mtx_unlock_spin(&sched_lock);
  997         mtx_unlock(&sellock);
  998 
  999         error = pollscan(td, (struct pollfd *)bits, nfds);
 1000         mtx_lock(&sellock);
 1001         if (error || td->td_retval[0])
 1002                 goto done;
 1003         if (atv.tv_sec || atv.tv_usec) {
 1004                 getmicrouptime(&rtv);
 1005                 if (timevalcmp(&rtv, &atv, >=))
 1006                         goto done;
 1007                 ttv = atv;
 1008                 timevalsub(&ttv, &rtv);
 1009                 timo = ttv.tv_sec > 24 * 60 * 60 ?
 1010                     24 * 60 * 60 * hz : tvtohz(&ttv);
 1011         }
 1012         /*
 1013          * An event of interest may occur while we do not hold
 1014          * sellock, so check TDF_SELECT and the number of collisions
 1015          * and rescan the file descriptors if necessary.
 1016          */
 1017         mtx_lock_spin(&sched_lock);
 1018         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
 1019                 mtx_unlock_spin(&sched_lock);
 1020                 goto retry;
 1021         }
 1022         mtx_unlock_spin(&sched_lock);
 1023 
 1024         if (timo > 0)
 1025                 error = cv_timedwait_sig(&selwait, &sellock, timo);
 1026         else
 1027                 error = cv_wait_sig(&selwait, &sellock);
 1028 
 1029         if (error == 0)
 1030                 goto retry;
 1031 
 1032 done:
 1033         clear_selinfo_list(td);
 1034         mtx_lock_spin(&sched_lock);
 1035         td->td_flags &= ~TDF_SELECT;
 1036         mtx_unlock_spin(&sched_lock);
 1037         mtx_unlock(&sellock);
 1038 
 1039 done_nosellock:
 1040         /* poll is not restarted after signals... */
 1041         if (error == ERESTART)
 1042                 error = EINTR;
 1043         if (error == EWOULDBLOCK)
 1044                 error = 0;
 1045         if (error == 0) {
 1046                 error = copyout(bits, uap->fds, ni);
 1047                 if (error)
 1048                         goto out;
 1049         }
 1050 out:
 1051         if (ni > sizeof(smallbits))
 1052                 free(bits, M_TEMP);
 1053 done2:
 1054         mtx_unlock(&Giant);
 1055         return (error);
 1056 }
 1057 
 1058 static int
 1059 pollscan(td, fds, nfd)
 1060         struct thread *td;
 1061         struct pollfd *fds;
 1062         u_int nfd;
 1063 {
 1064         register struct filedesc *fdp = td->td_proc->p_fd;
 1065         int i;
 1066         struct file *fp;
 1067         int n = 0;
 1068 
 1069         FILEDESC_LOCK(fdp);
 1070         for (i = 0; i < nfd; i++, fds++) {
 1071                 if (fds->fd >= fdp->fd_nfiles) {
 1072                         fds->revents = POLLNVAL;
 1073                         n++;
 1074                 } else if (fds->fd < 0) {
 1075                         fds->revents = 0;
 1076                 } else {
 1077                         fp = fdp->fd_ofiles[fds->fd];
 1078                         if (fp == NULL) {
 1079                                 fds->revents = POLLNVAL;
 1080                                 n++;
 1081                         } else {
 1082                                 /*
 1083                                  * Note: backend also returns POLLHUP and
 1084                                  * POLLERR if appropriate.
 1085                                  */
 1086                                 fds->revents = fo_poll(fp, fds->events,
 1087                                     td->td_ucred, td);
 1088                                 if (fds->revents != 0)
 1089                                         n++;
 1090                         }
 1091                 }
 1092         }
 1093         FILEDESC_UNLOCK(fdp);
 1094         td->td_retval[0] = n;
 1095         return (0);
 1096 }
 1097 
 1098 /*
 1099  * OpenBSD poll system call.
 1100  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1101  */
 1102 #ifndef _SYS_SYSPROTO_H_
 1103 struct openbsd_poll_args {
 1104         struct pollfd *fds;
 1105         u_int   nfds;
 1106         int     timeout;
 1107 };
 1108 #endif
 1109 /*
 1110  * MPSAFE
 1111  */
 1112 int
 1113 openbsd_poll(td, uap)
 1114         register struct thread *td;
 1115         register struct openbsd_poll_args *uap;
 1116 {
 1117         return (poll(td, (struct poll_args *)uap));
 1118 }
 1119 
 1120 /*
 1121  * Remove the references to the thread from all of the objects
 1122  * we were polling.
 1123  *
 1124  * This code assumes that the underlying owner of the selinfo
 1125  * structure will hold sellock before it changes it, and that
 1126  * it will unlink itself from our list if it goes away.
 1127  */
 1128 void
 1129 clear_selinfo_list(td)
 1130         struct thread *td;
 1131 {
 1132         struct selinfo *si;
 1133 
 1134         mtx_assert(&sellock, MA_OWNED);
 1135         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1136                 si->si_thread = NULL;
 1137         TAILQ_INIT(&td->td_selq);
 1138 }
 1139 
 1140 /*ARGSUSED*/
 1141 int
 1142 seltrue(dev, events, td)
 1143         dev_t dev;
 1144         int events;
 1145         struct thread *td;
 1146 {
 1147 
 1148         return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 1149 }
 1150 
 1151 /*
 1152  * Record a select request.
 1153  */
 1154 void
 1155 selrecord(selector, sip)
 1156         struct thread *selector;
 1157         struct selinfo *sip;
 1158 {
 1159 
 1160         mtx_lock(&sellock);
 1161         /*
 1162          * If the selinfo's thread pointer is NULL then take ownership of it.
 1163          *
 1164          * If the thread pointer is not NULL and it points to another
 1165          * thread, then we have a collision.
 1166          *
 1167          * If the thread pointer is not NULL and points back to us then leave
 1168          * it alone as we've already added pointed it at us and added it to
 1169          * our list.
 1170          */
 1171         if (sip->si_thread == NULL) {
 1172                 sip->si_thread = selector;
 1173                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1174         } else if (sip->si_thread != selector) {
 1175                 sip->si_flags |= SI_COLL;
 1176         }
 1177 
 1178         mtx_unlock(&sellock);
 1179 }
 1180 
 1181 /*
 1182  * Do a wakeup when a selectable event occurs.
 1183  */
 1184 void
 1185 selwakeup(sip)
 1186         struct selinfo *sip;
 1187 {
 1188         struct thread *td;
 1189 
 1190         mtx_lock(&sellock);
 1191         td = sip->si_thread;
 1192         if ((sip->si_flags & SI_COLL) != 0) {
 1193                 nselcoll++;
 1194                 sip->si_flags &= ~SI_COLL;
 1195                 cv_broadcast(&selwait);
 1196         }
 1197         if (td == NULL) {
 1198                 mtx_unlock(&sellock);
 1199                 return;
 1200         }
 1201         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1202         sip->si_thread = NULL;
 1203         mtx_lock_spin(&sched_lock);
 1204         if (td->td_wchan == &selwait) {
 1205                 cv_waitq_remove(td);
 1206                 TD_CLR_SLEEPING(td);
 1207                 setrunnable(td);
 1208         } else
 1209                 td->td_flags &= ~TDF_SELECT;
 1210         mtx_unlock_spin(&sched_lock);
 1211         mtx_unlock(&sellock);
 1212 }
 1213 
 1214 static void selectinit(void *);
 1215 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1216 
 1217 /* ARGSUSED*/
 1218 static void
 1219 selectinit(dummy)
 1220         void *dummy;
 1221 {
 1222         cv_init(&selwait, "select");
 1223         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1224 }

Cache object: f997c7871ec531d38642e438db2ab5f4


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.