The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   39  * $FreeBSD: releng/5.0/sys/kern/sys_generic.c 108086 2002-12-19 09:40:13Z alfred $
   40  */
   41 
   42 #include "opt_ktrace.h"
   43 
   44 #include <sys/param.h>
   45 #include <sys/systm.h>
   46 #include <sys/sysproto.h>
   47 #include <sys/filedesc.h>
   48 #include <sys/filio.h>
   49 #include <sys/fcntl.h>
   50 #include <sys/file.h>
   51 #include <sys/proc.h>
   52 #include <sys/signalvar.h>
   53 #include <sys/socketvar.h>
   54 #include <sys/uio.h>
   55 #include <sys/kernel.h>
   56 #include <sys/malloc.h>
   57 #include <sys/poll.h>
   58 #include <sys/resourcevar.h>
   59 #include <sys/selinfo.h>
   60 #include <sys/syscallsubr.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/sysent.h>
   63 #include <sys/bio.h>
   64 #include <sys/buf.h>
   65 #include <sys/condvar.h>
   66 #ifdef KTRACE
   67 #include <sys/ktrace.h>
   68 #endif
   69 #include <vm/vm.h>
   70 #include <vm/vm_page.h>
   71 
   72 #include <machine/limits.h>
   73 
   74 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   75 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   76 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   77 
   78 static int      pollscan(struct thread *, struct pollfd *, u_int);
   79 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   80 static int      dofileread(struct thread *, struct file *, int, void *,
   81                     size_t, off_t, int);
   82 static int      dofilewrite(struct thread *, struct file *, int,
   83                     const void *, size_t, off_t, int);
   84 
   85 /*
   86  * Read system call.
   87  */
   88 #ifndef _SYS_SYSPROTO_H_
   89 struct read_args {
   90         int     fd;
   91         void    *buf;
   92         size_t  nbyte;
   93 };
   94 #endif
   95 /*
   96  * MPSAFE
   97  */
   98 int
   99 read(td, uap)
  100         struct thread *td;
  101         struct read_args *uap;
  102 {
  103         struct file *fp;
  104         int error;
  105 
  106         if ((error = fget_read(td, uap->fd, &fp)) == 0) {
  107                 error = dofileread(td, fp, uap->fd, uap->buf,
  108                             uap->nbyte, (off_t)-1, 0);
  109                 fdrop(fp, td);
  110         }
  111         return(error);
  112 }
  113 
  114 /*
  115  * Pread system call
  116  */
  117 #ifndef _SYS_SYSPROTO_H_
  118 struct pread_args {
  119         int     fd;
  120         void    *buf;
  121         size_t  nbyte;
  122         int     pad;
  123         off_t   offset;
  124 };
  125 #endif
  126 /*
  127  * MPSAFE
  128  */
  129 int
  130 pread(td, uap)
  131         struct thread *td;
  132         struct pread_args *uap;
  133 {
  134         struct file *fp;
  135         int error;
  136 
  137         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  138                 return (error);
  139         if (fp->f_type != DTYPE_VNODE) {
  140                 error = ESPIPE;
  141         } else {
  142                 error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, 
  143                             uap->offset, FOF_OFFSET);
  144         }
  145         fdrop(fp, td);
  146         return(error);
  147 }
  148 
  149 /*
  150  * Code common for read and pread
  151  */
  152 static int
  153 dofileread(td, fp, fd, buf, nbyte, offset, flags)
  154         struct thread *td;
  155         struct file *fp;
  156         int fd, flags;
  157         void *buf;
  158         size_t nbyte;
  159         off_t offset;
  160 {
  161         struct uio auio;
  162         struct iovec aiov;
  163         long cnt, error = 0;
  164 #ifdef KTRACE
  165         struct iovec ktriov;
  166         struct uio ktruio;
  167         int didktr = 0;
  168 #endif
  169 
  170         aiov.iov_base = buf;
  171         aiov.iov_len = nbyte;
  172         auio.uio_iov = &aiov;
  173         auio.uio_iovcnt = 1;
  174         auio.uio_offset = offset;
  175         if (nbyte > INT_MAX)
  176                 return (EINVAL);
  177         auio.uio_resid = nbyte;
  178         auio.uio_rw = UIO_READ;
  179         auio.uio_segflg = UIO_USERSPACE;
  180         auio.uio_td = td;
  181 #ifdef KTRACE
  182         /*
  183          * if tracing, save a copy of iovec
  184          */
  185         if (KTRPOINT(td, KTR_GENIO)) {
  186                 ktriov = aiov;
  187                 ktruio = auio;
  188                 didktr = 1;
  189         }
  190 #endif
  191         cnt = nbyte;
  192 
  193         if ((error = fo_read(fp, &auio, td->td_ucred, flags, td))) {
  194                 if (auio.uio_resid != cnt && (error == ERESTART ||
  195                     error == EINTR || error == EWOULDBLOCK))
  196                         error = 0;
  197         }
  198         cnt -= auio.uio_resid;
  199 #ifdef KTRACE
  200         if (didktr && error == 0) {
  201                 ktruio.uio_iov = &ktriov;
  202                 ktruio.uio_resid = cnt;
  203                 ktrgenio(fd, UIO_READ, &ktruio, error);
  204         }
  205 #endif
  206         td->td_retval[0] = cnt;
  207         return (error);
  208 }
  209 
  210 /*
  211  * Scatter read system call.
  212  */
  213 #ifndef _SYS_SYSPROTO_H_
  214 struct readv_args {
  215         int     fd;
  216         struct  iovec *iovp;
  217         u_int   iovcnt;
  218 };
  219 #endif
  220 /*
  221  * MPSAFE
  222  */
  223 int
  224 readv(td, uap)
  225         struct thread *td;
  226         struct readv_args *uap;
  227 {
  228         struct file *fp;
  229         struct uio auio;
  230         struct iovec *iov;
  231         struct iovec *needfree;
  232         struct iovec aiov[UIO_SMALLIOV];
  233         long i, cnt;
  234         int error;
  235         u_int iovlen;
  236 #ifdef KTRACE
  237         struct iovec *ktriov = NULL;
  238         struct uio ktruio;
  239 #endif
  240 
  241         if ((error = fget_read(td, uap->fd, &fp)) != 0)
  242                 return (error);
  243         needfree = NULL;
  244         /* note: can't use iovlen until iovcnt is validated */
  245         iovlen = uap->iovcnt * sizeof (struct iovec);
  246         if (uap->iovcnt > UIO_SMALLIOV) {
  247                 if (uap->iovcnt > UIO_MAXIOV) {
  248                         error = EINVAL;
  249                         goto done;
  250                 }
  251                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  252                 needfree = iov;
  253         } else
  254                 iov = aiov;
  255         auio.uio_iov = iov;
  256         auio.uio_iovcnt = uap->iovcnt;
  257         auio.uio_rw = UIO_READ;
  258         auio.uio_segflg = UIO_USERSPACE;
  259         auio.uio_td = td;
  260         auio.uio_offset = -1;
  261         if ((error = copyin(uap->iovp, iov, iovlen)))
  262                 goto done;
  263         auio.uio_resid = 0;
  264         for (i = 0; i < uap->iovcnt; i++) {
  265                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  266                         error = EINVAL;
  267                         goto done;
  268                 }
  269                 auio.uio_resid += iov->iov_len;
  270                 iov++;
  271         }
  272 #ifdef KTRACE
  273         /*
  274          * if tracing, save a copy of iovec
  275          */
  276         if (KTRPOINT(td, KTR_GENIO))  {
  277                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  278                 bcopy(auio.uio_iov, ktriov, iovlen);
  279                 ktruio = auio;
  280         }
  281 #endif
  282         cnt = auio.uio_resid;
  283         if ((error = fo_read(fp, &auio, td->td_ucred, 0, td))) {
  284                 if (auio.uio_resid != cnt && (error == ERESTART ||
  285                     error == EINTR || error == EWOULDBLOCK))
  286                         error = 0;
  287         }
  288         cnt -= auio.uio_resid;
  289 #ifdef KTRACE
  290         if (ktriov != NULL) {
  291                 if (error == 0) {
  292                         ktruio.uio_iov = ktriov;
  293                         ktruio.uio_resid = cnt;
  294                         ktrgenio(uap->fd, UIO_READ, &ktruio, error);
  295                 }
  296                 FREE(ktriov, M_TEMP);
  297         }
  298 #endif
  299         td->td_retval[0] = cnt;
  300 done:
  301         fdrop(fp, td);
  302         if (needfree)
  303                 FREE(needfree, M_IOV);
  304         return (error);
  305 }
  306 
  307 /*
  308  * Write system call
  309  */
  310 #ifndef _SYS_SYSPROTO_H_
  311 struct write_args {
  312         int     fd;
  313         const void *buf;
  314         size_t  nbyte;
  315 };
  316 #endif
  317 /*
  318  * MPSAFE
  319  */
  320 int
  321 write(td, uap)
  322         struct thread *td;
  323         struct write_args *uap;
  324 {
  325         struct file *fp;
  326         int error;
  327 
  328         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  329                 error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte,
  330                             (off_t)-1, 0);
  331                 fdrop(fp, td);
  332         } else {
  333                 error = EBADF;  /* XXX this can't be right */
  334         }
  335         return(error);
  336 }
  337 
  338 /*
  339  * Pwrite system call
  340  */
  341 #ifndef _SYS_SYSPROTO_H_
  342 struct pwrite_args {
  343         int     fd;
  344         const void *buf;
  345         size_t  nbyte;
  346         int     pad;
  347         off_t   offset;
  348 };
  349 #endif
  350 /*
  351  * MPSAFE
  352  */
  353 int
  354 pwrite(td, uap)
  355         struct thread *td;
  356         struct pwrite_args *uap;
  357 {
  358         struct file *fp;
  359         int error;
  360 
  361         if ((error = fget_write(td, uap->fd, &fp)) == 0) {
  362                 if (fp->f_type == DTYPE_VNODE) {
  363                         error = dofilewrite(td, fp, uap->fd, uap->buf,
  364                                     uap->nbyte, uap->offset, FOF_OFFSET);
  365                 } else {
  366                         error = ESPIPE;
  367                 }
  368                 fdrop(fp, td);
  369         } else {
  370                 error = EBADF;  /* this can't be right */
  371         }
  372         return(error);
  373 }
  374 
  375 static int
  376 dofilewrite(td, fp, fd, buf, nbyte, offset, flags)
  377         struct thread *td;
  378         struct file *fp;
  379         int fd, flags;
  380         const void *buf;
  381         size_t nbyte;
  382         off_t offset;
  383 {
  384         struct uio auio;
  385         struct iovec aiov;
  386         long cnt, error = 0;
  387 #ifdef KTRACE
  388         struct iovec ktriov;
  389         struct uio ktruio;
  390         int didktr = 0;
  391 #endif
  392 
  393         aiov.iov_base = (void *)(uintptr_t)buf;
  394         aiov.iov_len = nbyte;
  395         auio.uio_iov = &aiov;
  396         auio.uio_iovcnt = 1;
  397         auio.uio_offset = offset;
  398         if (nbyte > INT_MAX)
  399                 return (EINVAL);
  400         auio.uio_resid = nbyte;
  401         auio.uio_rw = UIO_WRITE;
  402         auio.uio_segflg = UIO_USERSPACE;
  403         auio.uio_td = td;
  404 #ifdef KTRACE
  405         /*
  406          * if tracing, save a copy of iovec and uio
  407          */
  408         if (KTRPOINT(td, KTR_GENIO)) {
  409                 ktriov = aiov;
  410                 ktruio = auio;
  411                 didktr = 1;
  412         }
  413 #endif
  414         cnt = nbyte;
  415         if (fp->f_type == DTYPE_VNODE)
  416                 bwillwrite();
  417         if ((error = fo_write(fp, &auio, td->td_ucred, flags, td))) {
  418                 if (auio.uio_resid != cnt && (error == ERESTART ||
  419                     error == EINTR || error == EWOULDBLOCK))
  420                         error = 0;
  421                 /* Socket layer is responsible for issuing SIGPIPE. */
  422                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET) {
  423                         PROC_LOCK(td->td_proc);
  424                         psignal(td->td_proc, SIGPIPE);
  425                         PROC_UNLOCK(td->td_proc);
  426                 }
  427         }
  428         cnt -= auio.uio_resid;
  429 #ifdef KTRACE
  430         if (didktr && error == 0) {
  431                 ktruio.uio_iov = &ktriov;
  432                 ktruio.uio_resid = cnt;
  433                 ktrgenio(fd, UIO_WRITE, &ktruio, error);
  434         }
  435 #endif
  436         td->td_retval[0] = cnt;
  437         return (error);
  438 }
  439 
  440 /*
  441  * Gather write system call
  442  */
  443 #ifndef _SYS_SYSPROTO_H_
  444 struct writev_args {
  445         int     fd;
  446         struct  iovec *iovp;
  447         u_int   iovcnt;
  448 };
  449 #endif
  450 /*
  451  * MPSAFE
  452  */
  453 int
  454 writev(td, uap)
  455         struct thread *td;
  456         register struct writev_args *uap;
  457 {
  458         struct file *fp;
  459         struct uio auio;
  460         register struct iovec *iov;
  461         struct iovec *needfree;
  462         struct iovec aiov[UIO_SMALLIOV];
  463         long i, cnt, error = 0;
  464         u_int iovlen;
  465 #ifdef KTRACE
  466         struct iovec *ktriov = NULL;
  467         struct uio ktruio;
  468 #endif
  469 
  470         mtx_lock(&Giant);
  471         if ((error = fget_write(td, uap->fd, &fp)) != 0) {
  472                 error = EBADF;
  473                 goto done2;
  474         }
  475         /* note: can't use iovlen until iovcnt is validated */
  476         iovlen = uap->iovcnt * sizeof (struct iovec);
  477         if (uap->iovcnt > UIO_SMALLIOV) {
  478                 if (uap->iovcnt > UIO_MAXIOV) {
  479                         needfree = NULL;
  480                         error = EINVAL;
  481                         goto done;
  482                 }
  483                 MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
  484                 needfree = iov;
  485         } else {
  486                 iov = aiov;
  487                 needfree = NULL;
  488         }
  489         auio.uio_iov = iov;
  490         auio.uio_iovcnt = uap->iovcnt;
  491         auio.uio_rw = UIO_WRITE;
  492         auio.uio_segflg = UIO_USERSPACE;
  493         auio.uio_td = td;
  494         auio.uio_offset = -1;
  495         if ((error = copyin(uap->iovp, iov, iovlen)))
  496                 goto done;
  497         auio.uio_resid = 0;
  498         for (i = 0; i < uap->iovcnt; i++) {
  499                 if (iov->iov_len > INT_MAX - auio.uio_resid) {
  500                         error = EINVAL;
  501                         goto done;
  502                 }
  503                 auio.uio_resid += iov->iov_len;
  504                 iov++;
  505         }
  506 #ifdef KTRACE
  507         /*
  508          * if tracing, save a copy of iovec and uio
  509          */
  510         if (KTRPOINT(td, KTR_GENIO))  {
  511                 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
  512                 bcopy(auio.uio_iov, ktriov, iovlen);
  513                 ktruio = auio;
  514         }
  515 #endif
  516         cnt = auio.uio_resid;
  517         if (fp->f_type == DTYPE_VNODE)
  518                 bwillwrite();
  519         if ((error = fo_write(fp, &auio, td->td_ucred, 0, td))) {
  520                 if (auio.uio_resid != cnt && (error == ERESTART ||
  521                     error == EINTR || error == EWOULDBLOCK))
  522                         error = 0;
  523                 if (error == EPIPE) {
  524                         PROC_LOCK(td->td_proc);
  525                         psignal(td->td_proc, SIGPIPE);
  526                         PROC_UNLOCK(td->td_proc);
  527                 }
  528         }
  529         cnt -= auio.uio_resid;
  530 #ifdef KTRACE
  531         if (ktriov != NULL) {
  532                 if (error == 0) {
  533                         ktruio.uio_iov = ktriov;
  534                         ktruio.uio_resid = cnt;
  535                         ktrgenio(uap->fd, UIO_WRITE, &ktruio, error);
  536                 }
  537                 FREE(ktriov, M_TEMP);
  538         }
  539 #endif
  540         td->td_retval[0] = cnt;
  541 done:
  542         fdrop(fp, td);
  543         if (needfree)
  544                 FREE(needfree, M_IOV);
  545 done2:
  546         mtx_unlock(&Giant);
  547         return (error);
  548 }
  549 
  550 /*
  551  * Ioctl system call
  552  */
  553 #ifndef _SYS_SYSPROTO_H_
  554 struct ioctl_args {
  555         int     fd;
  556         u_long  com;
  557         caddr_t data;
  558 };
  559 #endif
  560 /*
  561  * MPSAFE
  562  */
  563 /* ARGSUSED */
  564 int
  565 ioctl(td, uap)
  566         struct thread *td;
  567         register struct ioctl_args *uap;
  568 {
  569         struct file *fp;
  570         register struct filedesc *fdp;
  571         register u_long com;
  572         int error = 0;
  573         register u_int size;
  574         caddr_t data, memp;
  575         int tmp;
  576 #define STK_PARAMS      128
  577         union {
  578             char stkbuf[STK_PARAMS];
  579             long align;
  580         } ubuf;
  581 
  582         if ((error = fget(td, uap->fd, &fp)) != 0)
  583                 return (error);
  584         mtx_lock(&Giant);
  585         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  586                 fdrop(fp, td);
  587                 mtx_unlock(&Giant);
  588                 return (EBADF);
  589         }
  590         fdp = td->td_proc->p_fd;
  591         switch (com = uap->com) {
  592         case FIONCLEX:
  593                 FILEDESC_LOCK(fdp);
  594                 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
  595                 FILEDESC_UNLOCK(fdp);
  596                 fdrop(fp, td);
  597                 mtx_unlock(&Giant);
  598                 return (0);
  599         case FIOCLEX:
  600                 FILEDESC_LOCK(fdp);
  601                 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
  602                 FILEDESC_UNLOCK(fdp);
  603                 fdrop(fp, td);
  604                 mtx_unlock(&Giant);
  605                 return (0);
  606         }
  607 
  608         /*
  609          * Interpret high order word to find amount of data to be
  610          * copied to/from the user's address space.
  611          */
  612         size = IOCPARM_LEN(com);
  613         if (size > IOCPARM_MAX) {
  614                 fdrop(fp, td);
  615                 mtx_unlock(&Giant);
  616                 return (ENOTTY);
  617         }
  618 
  619         memp = NULL;
  620         if (size > sizeof (ubuf.stkbuf)) {
  621                 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  622                 data = memp;
  623         } else {
  624                 data = ubuf.stkbuf;
  625         }
  626         if (com&IOC_IN) {
  627                 if (size) {
  628                         error = copyin(uap->data, data, (u_int)size);
  629                         if (error) {
  630                                 if (memp)
  631                                         free(memp, M_IOCTLOPS);
  632                                 fdrop(fp, td);
  633                                 goto done;
  634                         }
  635                 } else {
  636                         *(caddr_t *)data = uap->data;
  637                 }
  638         } else if ((com&IOC_OUT) && size) {
  639                 /*
  640                  * Zero the buffer so the user always
  641                  * gets back something deterministic.
  642                  */
  643                 bzero(data, size);
  644         } else if (com&IOC_VOID) {
  645                 *(caddr_t *)data = uap->data;
  646         }
  647 
  648         switch (com) {
  649 
  650         case FIONBIO:
  651                 FILE_LOCK(fp);
  652                 if ((tmp = *(int *)data))
  653                         fp->f_flag |= FNONBLOCK;
  654                 else
  655                         fp->f_flag &= ~FNONBLOCK;
  656                 FILE_UNLOCK(fp);
  657                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  658                 break;
  659 
  660         case FIOASYNC:
  661                 FILE_LOCK(fp);
  662                 if ((tmp = *(int *)data))
  663                         fp->f_flag |= FASYNC;
  664                 else
  665                         fp->f_flag &= ~FASYNC;
  666                 FILE_UNLOCK(fp);
  667                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  668                 break;
  669 
  670         default:
  671                 error = fo_ioctl(fp, com, data, td->td_ucred, td);
  672                 /*
  673                  * Copy any data to user, size was
  674                  * already set and checked above.
  675                  */
  676                 if (error == 0 && (com&IOC_OUT) && size)
  677                         error = copyout(data, uap->data, (u_int)size);
  678                 break;
  679         }
  680         if (memp)
  681                 free(memp, M_IOCTLOPS);
  682         fdrop(fp, td);
  683 done:
  684         mtx_unlock(&Giant);
  685         return (error);
  686 }
  687 
  688 /*
  689  * sellock and selwait are initialized in selectinit() via SYSINIT.
  690  */
  691 struct mtx      sellock;
  692 struct cv       selwait;
  693 u_int           nselcoll;       /* Select collisions since boot */
  694 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  695 
  696 /*
  697  * Select system call.
  698  */
  699 #ifndef _SYS_SYSPROTO_H_
  700 struct select_args {
  701         int     nd;
  702         fd_set  *in, *ou, *ex;
  703         struct  timeval *tv;
  704 };
  705 #endif
  706 /*
  707  * MPSAFE
  708  */
  709 int
  710 select(td, uap)
  711         register struct thread *td;
  712         register struct select_args *uap;
  713 {
  714         struct timeval tv, *tvp;
  715         int error;
  716 
  717         if (uap->tv != NULL) {
  718                 error = copyin(uap->tv, &tv, sizeof(tv));
  719                 if (error)
  720                         return (error);
  721                 tvp = &tv;
  722         } else
  723                 tvp = NULL;
  724 
  725         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  726 }
  727 
/*
 * Kernel side of select().
 *
 * td     - calling thread; the ready count is left in td->td_retval[0]
 *          by selscan().
 * nd     - number of descriptors to examine; negative values are
 *          rejected with EINVAL and values above the process's
 *          fd_nfiles are silently clamped.
 * fd_in, fd_ou, fd_ex
 *        - user-space descriptor sets, each of which may be NULL.  They
 *          are read with copyin() and, on success, overwritten with the
 *          result bits via copyout().
 * tvp    - kernel-space timeout, or NULL to wait without a timeout.
 *
 * Giant is held from just after the nd check until return.  Returns 0 on
 * success (including expiry of the timeout), EINVAL for a bad nd or a
 * timeout rejected by itimerfix(), EINTR when the sleep reports ERESTART,
 * or an error from copyin(), selscan(), the sleep, or copyout().
 */
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
    fd_set *fd_ex, struct timeval *tvp)
{
        struct filedesc *fdp;
        /*
         * The magic 2048 here is chosen to be just enough for FD_SETSIZE
         * infds with the new FD_SETSIZE of 1024, and more than enough for
         * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
         * of 256.
         */
        fd_mask s_selbits[howmany(2048, NFDBITS)];
        fd_mask *ibits[3], *obits[3], *selbits, *sbp;
        struct timeval atv, rtv, ttv;
        int error, timo;
        u_int ncoll, nbufbytes, ncpbytes, nfdbits;

        if (nd < 0)
                return (EINVAL);
        fdp = td->td_proc->p_fd;
        mtx_lock(&Giant);
        FILEDESC_LOCK(fdp);

        if (nd > td->td_proc->p_fd->fd_nfiles)
                nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
        FILEDESC_UNLOCK(fdp);

        /*
         * Allocate just enough bits for the non-null fd_sets.  Use the
         * preallocated auto buffer if possible.
         */
        nfdbits = roundup(nd, NFDBITS);
        ncpbytes = nfdbits / NBBY;
        /* Each non-NULL set needs one input and one output copy. */
        nbufbytes = 0;
        if (fd_in != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ou != NULL)
                nbufbytes += 2 * ncpbytes;
        if (fd_ex != NULL)
                nbufbytes += 2 * ncpbytes;
        if (nbufbytes <= sizeof s_selbits)
                selbits = &s_selbits[0];
        else
                selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);

        /*
         * Assign pointers into the bit buffers and fetch the input bits.
         * Put the output buffers together so that they can be bzeroed
         * together.
         */
        sbp = selbits;
        /*
         * getbits(name, x): for a non-NULL user set, point obits[x] at the
         * next slot in the first half of the buffer and ibits[x] at the
         * matching slot in the second half, then copy the user's bits into
         * ibits[x].  A copyin() failure jumps straight to done_nosellock.
         */
#define getbits(name, x) \
        do {                                                            \
                if (name == NULL)                                       \
                        ibits[x] = NULL;                                \
                else {                                                  \
                        ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
                        obits[x] = sbp;                                 \
                        sbp += ncpbytes / sizeof *sbp;                  \
                        error = copyin(name, ibits[x], ncpbytes);       \
                        if (error != 0)                                 \
                                goto done_nosellock;                    \
                }                                                       \
        } while (0)
        getbits(fd_in, 0);
        getbits(fd_ou, 1);
        getbits(fd_ex, 2);
#undef  getbits
        /* Clear the output half of the buffer. */
        if (nbufbytes != 0)
                bzero(selbits, nbufbytes / 2);

        /*
         * Convert the relative timeout into an absolute uptime deadline
         * in atv.  A zero atv means no timeout was supplied.
         */
        if (tvp != NULL) {
                atv = *tvp;
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_nosellock;
                }
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                atv.tv_sec = 0;
                atv.tv_usec = 0;
        }
        timo = 0;
        TAILQ_INIT(&td->td_selq);
        mtx_lock(&sellock);
retry:
        /*
         * Snapshot the collision counter and mark the thread as selecting
         * before dropping sellock for the scan; both are re-checked below.
         */
        ncoll = nselcoll;
        mtx_lock_spin(&sched_lock);
        td->td_flags |= TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

        error = selscan(td, ibits, obits, nd);
        mtx_lock(&sellock);
        /* Finish on a scan error or as soon as any descriptor is ready. */
        if (error || td->td_retval[0])
                goto done;
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                /* Deadline already reached: return with nothing ready. */
                if (timevalcmp(&rtv, &atv, >=))
                        goto done;
                ttv = atv;
                timevalsub(&ttv, &rtv);
                /* Sleep for the remaining time, capped at 24 hours. */
                timo = ttv.tv_sec > 24 * 60 * 60 ?
                    24 * 60 * 60 * hz : tvtohz(&ttv);
        }

        /*
         * An event of interest may occur while we do not hold
         * sellock, so check TDF_SELECT and the number of
         * collisions and rescan the file descriptors if
         * necessary.
         */
        mtx_lock_spin(&sched_lock);
        if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
                mtx_unlock_spin(&sched_lock);
                goto retry;
        }
        mtx_unlock_spin(&sched_lock);

        if (timo > 0)
                error = cv_timedwait_sig(&selwait, &sellock, timo);
        else
                error = cv_wait_sig(&selwait, &sellock);

        /* A wakeup without error means something changed: scan again. */
        if (error == 0)
                goto retry;

done:
        /* Reached with sellock held. */
        clear_selinfo_list(td);
        mtx_lock_spin(&sched_lock);
        td->td_flags &= ~TDF_SELECT;
        mtx_unlock_spin(&sched_lock);
        mtx_unlock(&sellock);

done_nosellock:
        /* select is not restarted after signals... */
        if (error == ERESTART)
                error = EINTR;
        /* EWOULDBLOCK is reported to the caller as success. */
        if (error == EWOULDBLOCK)
                error = 0;
        /*
         * putbits(name, x): copy result set x back to the user if the
         * caller supplied that set; a copyout() failure becomes the
         * return value.
         */
#define putbits(name, x) \
        if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
                error = error2;
        if (error == 0) {
                int error2;

                putbits(fd_in, 0);
                putbits(fd_ou, 1);
                putbits(fd_ex, 2);
#undef putbits
        }
        /* Free the bit buffer only if it was not the on-stack one. */
        if (selbits != &s_selbits[0])
                free(selbits, M_SELECT);

        mtx_unlock(&Giant);
        return (error);
}
  886 
  887 static int
  888 selscan(td, ibits, obits, nfd)
  889         struct thread *td;
  890         fd_mask **ibits, **obits;
  891         int nfd;
  892 {
              /*
               * Poll every descriptor whose bit is set in the (up to) three
               * input bit sets and set the matching bit in the output set for
               * each one that reports an event.
               *
               * Set index msk is polled with flag[msk]: 0 -> POLLRDNORM,
               * 1 -> POLLWRNORM, 2 -> POLLRDBAND.  A NULL ibits[msk] is
               * skipped.  Only descriptors below nfd are examined.
               *
               * Returns EBADF as soon as fget_locked() yields no file for a
               * selected descriptor; otherwise returns 0 and stores the number
               * of (descriptor, set) hits in td->td_retval[0].  The descriptor
               * table lock is taken for the whole scan and dropped on both
               * exit paths.
               *
               * NOTE(review): obits[msk] is written whenever ibits[msk] is
               * non-NULL; presumably the caller always sets both up together
               * -- confirm against select().
               */
  893         int msk, i, fd;
  894         fd_mask bits;
  895         struct file *fp;
  896         int n = 0;
  897         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  898         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  899         struct filedesc *fdp = td->td_proc->p_fd;
  900 
  901         FILEDESC_LOCK(fdp);
  902         for (msk = 0; msk < 3; msk++) {
  903                 if (ibits[msk] == NULL)
  904                         continue;
                      /* Process the set one fd_mask word (NFDBITS descriptors) at a time. */
  905                 for (i = 0; i < nfd; i += NFDBITS) {
  906                         bits = ibits[msk][i/NFDBITS];
  907                         /* ffs(int mask) not portable, fd_mask is long */
                              /* Shift the word right bit by bit; stop once no set bits remain. */
  908                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  909                                 if (!(bits & 1))
  910                                         continue;
  911                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  912                                         FILEDESC_UNLOCK(fdp);
  913                                         return (EBADF);
  914                                 }
  915                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  916                                     td)) {
  917                                         obits[msk][(fd)/NFDBITS] |=
  918                                             ((fd_mask)1 << ((fd) % NFDBITS));
  919                                         n++;
  920                                 }
  921                         }
  922                 }
  923         }
  924         FILEDESC_UNLOCK(fdp);
              /* Report the hit count through the thread's first return value. */
  925         td->td_retval[0] = n;
  926         return (0);
  927 }
  928 
  929 /*
  930  * Poll system call.
  931  */
  932 #ifndef _SYS_SYSPROTO_H_
      /* Argument block for poll(); declared here only when _SYS_SYSPROTO_H_ is not defined. */
  933 struct poll_args {
  934         struct pollfd *fds;     /* pollfd array; poll() copies it in and back out */
  935         u_int   nfds;           /* number of entries in fds */
  936         int     timeout;        /* milliseconds, or INFTIM for no timeout */
  937 };
  938 #endif
  939 /*
  940  * MPSAFE
       *
       * poll() system call handler.
       *
       * Copies uap->nfds pollfd entries in from uap->fds, evaluates them with
       * pollscan(), and, while nothing is ready, sleeps on selwait until it is
       * woken, the deadline passes, or the sleep returns an error.  When the
       * final error is 0 the whole array is copied back out to uap->fds; the
       * count of entries with events is whatever pollscan() left in
       * td->td_retval[0].
       *
       * uap->timeout is in milliseconds; INFTIM means no deadline.
       *
       * Returns 0 on success or when the deadline passes, EINVAL if nfds
       * exceeds both the current RLIMIT_NOFILE limit and FD_SETSIZE or if
       * itimerfix() rejects the timeout, EINTR if the sleep returned ERESTART
       * (poll is never restarted), or the error from copyin(), copyout(),
       * pollscan() or the sleep.
       *
       * Giant is acquired on entry and released on every return path.
  941  */
  942 int
  943 poll(td, uap)
  944         struct thread *td;
  945         struct poll_args *uap;
  946 {
  947         caddr_t bits;
  948         char smallbits[32 * sizeof(struct pollfd)];
  949         struct timeval atv, rtv, ttv;
  950         int error = 0, timo;
  951         u_int ncoll, nfds;
  952         size_t ni;
  953 
  954         nfds = uap->nfds;
  955 
  956         mtx_lock(&Giant);
  957         /*
  958          * This is kinda bogus.  We have fd limits, but that is not
  959          * really related to the size of the pollfd array.  Make sure
  960          * we let the process use at least FD_SETSIZE entries and at
  961          * least enough for the current limits.  We want to be reasonably
  962          * safe, but not overly restrictive.
  963          */
  964         if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) &&
  965             (nfds > FD_SETSIZE)) {
  966                 error = EINVAL;
  967                 goto done2;
  968         }
              /* Up to 32 entries fit in the on-stack buffer; larger arrays are allocated. */
  969         ni = nfds * sizeof(struct pollfd);
  970         if (ni > sizeof(smallbits))
  971                 bits = malloc(ni, M_TEMP, M_WAITOK);
  972         else
  973                 bits = smallbits;
  974         error = copyin(uap->fds, bits, ni);
  975         if (error)
  976                 goto done_nosellock;
              /* Turn the millisecond timeout into an absolute uptime deadline in atv. */
  977         if (uap->timeout != INFTIM) {
  978                 atv.tv_sec = uap->timeout / 1000;
  979                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  980                 if (itimerfix(&atv)) {
  981                         error = EINVAL;
  982                         goto done_nosellock;
  983                 }
  984                 getmicrouptime(&rtv);
  985                 timevaladd(&atv, &rtv);
  986         } else {
                      /* An all-zero atv means "no deadline" in the check after the scan. */
  987                 atv.tv_sec = 0;
  988                 atv.tv_usec = 0;
  989         }
  990         timo = 0;
  991         TAILQ_INIT(&td->td_selq);
  992         mtx_lock(&sellock);
  993 retry:
              /*
               * Snapshot the collision counter and mark the thread as
               * selecting, then drop sellock for the scan.  Presumably the
               * lock is dropped because fo_poll() backends call selrecord(),
               * which takes sellock itself -- confirm.
               */
  994         ncoll = nselcoll;
  995         mtx_lock_spin(&sched_lock);
  996         td->td_flags |= TDF_SELECT;
  997         mtx_unlock_spin(&sched_lock);
  998         mtx_unlock(&sellock);
  999 
 1000         error = pollscan(td, (struct pollfd *)bits, nfds);
 1001         mtx_lock(&sellock);
 1002         if (error || td->td_retval[0])
 1003                 goto done;
 1004         if (atv.tv_sec || atv.tv_usec) {
 1005                 getmicrouptime(&rtv);
 1006                 if (timevalcmp(&rtv, &atv, >=))
 1007                         goto done;
                      /* Remaining time in ticks; a single sleep is capped at 24 hours. */
 1008                 ttv = atv;
 1009                 timevalsub(&ttv, &rtv);
 1010                 timo = ttv.tv_sec > 24 * 60 * 60 ?
 1011                     24 * 60 * 60 * hz : tvtohz(&ttv);
 1012         }
 1013         /*
 1014          * An event of interest may occur while we do not hold
 1015          * sellock, so check TDF_SELECT and the number of collisions
 1016          * and rescan the file descriptors if necessary.
 1017          */
 1018         mtx_lock_spin(&sched_lock);
 1019         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
 1020                 mtx_unlock_spin(&sched_lock);
 1021                 goto retry;
 1022         }
 1023         mtx_unlock_spin(&sched_lock);
 1024 
              /* timo stays 0 when there is no deadline, selecting the untimed wait. */
 1025         if (timo > 0)
 1026                 error = cv_timedwait_sig(&selwait, &sellock, timo);
 1027         else
 1028                 error = cv_wait_sig(&selwait, &sellock);
 1029 
              /* Woken without error: scan again. */
 1030         if (error == 0)
 1031                 goto retry;
 1032 
 1033 done:
              /* Detach from every selinfo recorded during the scans and stop selecting. */
 1034         clear_selinfo_list(td);
 1035         mtx_lock_spin(&sched_lock);
 1036         td->td_flags &= ~TDF_SELECT;
 1037         mtx_unlock_spin(&sched_lock);
 1038         mtx_unlock(&sellock);
 1039 
 1040 done_nosellock:
 1041         /* poll is not restarted after signals... */
 1042         if (error == ERESTART)
 1043                 error = EINTR;
              /* EWOULDBLOCK from the timed wait is reported as success. */
 1044         if (error == EWOULDBLOCK)
 1045                 error = 0;
 1046         if (error == 0) {
 1047                 error = copyout(bits, uap->fds, ni);
                      /* NOTE(review): this goto is redundant; control reaches out: either way. */
 1048                 if (error)
 1049                         goto out;
 1050         }
 1051 out:
              /* Free the buffer only when it was allocated rather than smallbits. */
 1052         if (ni > sizeof(smallbits))
 1053                 free(bits, M_TEMP);
 1054 done2:
 1055         mtx_unlock(&Giant);
 1056         return (error);
 1057 }
 1058 
 1059 static int
 1060 pollscan(td, fds, nfd)
 1061         struct thread *td;
 1062         struct pollfd *fds;
 1063         u_int nfd;
 1064 {
              /*
               * Fill in revents for each of the nfd entries in fds and count
               * the entries that have something to report.
               *
               *   fd >= fdp->fd_nfiles, or empty table slot: revents = POLLNVAL,
               *       counted.
               *   fd < 0: revents = 0, not counted.
               *   otherwise: revents = fo_poll(...), counted when non-zero.
               *
               * The count is stored in td->td_retval[0]; the function itself
               * always returns 0.  The descriptor table lock is held for the
               * whole scan.
               */
 1065         register struct filedesc *fdp = td->td_proc->p_fd;
 1066         int i;
 1067         struct file *fp;
 1068         int n = 0;
 1069 
 1070         FILEDESC_LOCK(fdp);
 1071         for (i = 0; i < nfd; i++, fds++) {
 1072                 if (fds->fd >= fdp->fd_nfiles) {
 1073                         fds->revents = POLLNVAL;
 1074                         n++;
 1075                 } else if (fds->fd < 0) {
 1076                         fds->revents = 0;
 1077                 } else {
 1078                         fp = fdp->fd_ofiles[fds->fd];
 1079                         if (fp == NULL) {
 1080                                 fds->revents = POLLNVAL;
 1081                                 n++;
 1082                         } else {
 1083                                 /*
 1084                                  * Note: backend also returns POLLHUP and
 1085                                  * POLLERR if appropriate.
 1086                                  */
 1087                                 fds->revents = fo_poll(fp, fds->events,
 1088                                     td->td_ucred, td);
 1089                                 if (fds->revents != 0)
 1090                                         n++;
 1091                         }
 1092                 }
 1093         }
 1094         FILEDESC_UNLOCK(fdp);
 1095         td->td_retval[0] = n;
 1096         return (0);
 1097 }
 1098 
 1099 /*
 1100  * OpenBSD poll system call.
 1101  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1102  */
 1103 #ifndef _SYS_SYSPROTO_H_
      /*
       * Argument block for openbsd_poll(); declared here only when
       * _SYS_SYSPROTO_H_ is not defined.  Same member list as struct poll_args,
       * which openbsd_poll() casts it to.
       */
 1104 struct openbsd_poll_args {
 1105         struct pollfd *fds;     /* pollfd array */
 1106         u_int   nfds;           /* number of entries in fds */
 1107         int     timeout;        /* passed through to poll() unchanged */
 1108 };
 1109 #endif
 1110 /*
 1111  * MPSAFE
       *
       * Thin wrapper: reinterprets the argument block as struct poll_args,
       * forwards to poll() and returns its result unchanged.
 1112  */
 1113 int
 1114 openbsd_poll(td, uap)
 1115         register struct thread *td;
 1116         register struct openbsd_poll_args *uap;
 1117 {
 1118         return (poll(td, (struct poll_args *)uap));
 1119 }
 1120 
 1121 /*
 1122  * Remove the references to the thread from all of the objects
 1123  * we were polling.
 1124  *
 1125  * This code assumes that the underlying owner of the selinfo
 1126  * structure will hold sellock before it changes it, and that
 1127  * it will unlink itself from our list if it goes away.
       *
       * The caller must hold sellock (asserted below).  On return every
       * selinfo that was on td->td_selq has si_thread set to NULL and the
       * queue is empty.
 1128  */
 1129 void
 1130 clear_selinfo_list(td)
 1131         struct thread *td;
 1132 {
 1133         struct selinfo *si;
 1134 
 1135         mtx_assert(&sellock, MA_OWNED);
 1136         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1137                 si->si_thread = NULL;
              /* Reset the queue head in one step instead of unlinking entries one by one. */
 1138         TAILQ_INIT(&td->td_selq);
 1139 }
 1140 
 1141 /*ARGSUSED*/
      /*
       * Returns the subset of the requested events that are ordinary read or
       * write readiness bits (POLLIN, POLLOUT, POLLRDNORM, POLLWRNORM); every
       * other requested bit is dropped.  dev and td are not used.
       */
 1142 int
 1143 seltrue(dev, events, td)
 1144         dev_t dev;
 1145         int events;
 1146         struct thread *td;
 1147 {
 1148 
 1149         return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 1150 }
 1151 
 1152 /*
 1153  * Record a select request.
       *
       * Associates the selinfo sip with the thread selector, or flags a
       * collision when sip is already owned by a different thread.  sellock
       * is taken and released internally.
 1154  */
 1155 void
 1156 selrecord(selector, sip)
 1157         struct thread *selector;
 1158         struct selinfo *sip;
 1159 {
 1160 
 1161         mtx_lock(&sellock);
 1162         /*
 1163          * If the selinfo's thread pointer is NULL then take ownership of it.
 1164          *
 1165          * If the thread pointer is not NULL and it points to another
 1166          * thread, then we have a collision.
 1167          *
 1168          * If the thread pointer is not NULL and points back to us then leave
 1169          * it alone as we've already pointed it at us and added it to
 1170          * our list.
 1171          */
 1172         if (sip->si_thread == NULL) {
 1173                 sip->si_thread = selector;
 1174                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1175         } else if (sip->si_thread != selector) {
                      /* selwakeup() checks this flag to decide whether to wake every waiter. */
 1176                 sip->si_flags |= SI_COLL;
 1177         }
 1178 
 1179         mtx_unlock(&sellock);
 1180 }
 1181 
 1182 /*
 1183  * Do a wakeup when a selectable event occurs.
       *
       * If a collision was recorded on sip, the global collision counter is
       * bumped, the flag is cleared and every thread waiting on selwait is
       * woken.  If sip has an owning thread, sip is unlinked from that
       * thread's td_selq and the thread is either made runnable (when it is
       * sleeping on selwait) or has TDF_SELECT cleared, which poll() checks
       * before sleeping in order to rescan.  sellock is held throughout.
 1184  */
 1185 void
 1186 selwakeup(sip)
 1187         struct selinfo *sip;
 1188 {
 1189         struct thread *td;
 1190 
 1191         mtx_lock(&sellock);
 1192         td = sip->si_thread;
 1193         if ((sip->si_flags & SI_COLL) != 0) {
                      /* Waiters compare nselcoll with their snapshot and rescan on a change. */
 1194                 nselcoll++;
 1195                 sip->si_flags &= ~SI_COLL;
 1196                 cv_broadcast(&selwait);
 1197         }
 1198         if (td == NULL) {
 1199                 mtx_unlock(&sellock);
 1200                 return;
 1201         }
 1202         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1203         sip->si_thread = NULL;
 1204         mtx_lock_spin(&sched_lock);
 1205         if (td->td_wchan == &selwait) {
                      /* The owner is asleep on selwait: take it off the wait queue and run it. */
 1206                 cv_waitq_remove(td);
 1207                 TD_CLR_SLEEPING(td);
 1208                 setrunnable(td);
 1209         } else
                      /* Not asleep on selwait: clear the flag so its pre-sleep check rescans. */
 1210                 td->td_flags &= ~TDF_SELECT;
 1211         mtx_unlock_spin(&sched_lock);
 1212         mtx_unlock(&sellock);
 1213 }
 1214 
 1215 static void selectinit(void *);
      /* Register selectinit() with SYSINIT at SI_SUB_LOCK / SI_ORDER_FIRST, with a NULL argument. */
 1216 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1217 
 1218 /* ARGSUSED*/
      /*
       * Initialize the synchronization objects used by select/poll: the
       * selwait condition variable and the sellock mutex (a MTX_DEF mutex).
       * The dummy argument is not used.
       */
 1219 static void
 1220 selectinit(dummy)
 1221         void *dummy;
 1222 {
 1223         cv_init(&selwait, "select");
 1224         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1225 }

Cache object: 8b00ecfd95f004ec0c535763e49abc47


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.