The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_compat.h"
   41 #include "opt_ktrace.h"
   42 
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/sysproto.h>
   46 #include <sys/filedesc.h>
   47 #include <sys/filio.h>
   48 #include <sys/fcntl.h>
   49 #include <sys/file.h>
   50 #include <sys/proc.h>
   51 #include <sys/signalvar.h>
   52 #include <sys/socketvar.h>
   53 #include <sys/uio.h>
   54 #include <sys/kernel.h>
   55 #include <sys/limits.h>
   56 #include <sys/malloc.h>
   57 #include <sys/poll.h>
   58 #include <sys/resourcevar.h>
   59 #include <sys/selinfo.h>
   60 #include <sys/sleepqueue.h>
   61 #include <sys/syscallsubr.h>
   62 #include <sys/sysctl.h>
   63 #include <sys/sysent.h>
   64 #include <sys/vnode.h>
   65 #include <sys/bio.h>
   66 #include <sys/buf.h>
   67 #include <sys/condvar.h>
   68 #ifdef KTRACE
   69 #include <sys/ktrace.h>
   70 #endif
   71 
   72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   75 
   76 static int      pollout(struct pollfd *, struct pollfd *, u_int);
   77 static int      pollscan(struct thread *, struct pollfd *, u_int);
   78 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   79 static int      dofileread(struct thread *, int, struct file *, struct uio *,
   80                     off_t, int);
   81 static int      dofilewrite(struct thread *, int, struct file *, struct uio *,
   82                     off_t, int);
   83 static void     doselwakeup(struct selinfo *, int);
   84 
   85 #ifndef _SYS_SYSPROTO_H_
   86 struct read_args {
   87         int     fd;
   88         void    *buf;
   89         size_t  nbyte;
   90 };
   91 #endif
   92 int
   93 read(td, uap)
   94         struct thread *td;
   95         struct read_args *uap;
   96 {
   97         struct uio auio;
   98         struct iovec aiov;
   99         int error;
  100 
  101         if (uap->nbyte > INT_MAX)
  102                 return (EINVAL);
  103         aiov.iov_base = uap->buf;
  104         aiov.iov_len = uap->nbyte;
  105         auio.uio_iov = &aiov;
  106         auio.uio_iovcnt = 1;
  107         auio.uio_resid = uap->nbyte;
  108         auio.uio_segflg = UIO_USERSPACE;
  109         error = kern_readv(td, uap->fd, &auio);
  110         return(error);
  111 }
  112 
  113 /*
  114  * Positioned read system call
  115  */
  116 #ifndef _SYS_SYSPROTO_H_
  117 struct pread_args {
  118         int     fd;
  119         void    *buf;
  120         size_t  nbyte;
  121         int     pad;
  122         off_t   offset;
  123 };
  124 #endif
  125 int
  126 pread(td, uap)
  127         struct thread *td;
  128         struct pread_args *uap;
  129 {
  130         struct uio auio;
  131         struct iovec aiov;
  132         int error;
  133 
  134         if (uap->nbyte > INT_MAX)
  135                 return (EINVAL);
  136         aiov.iov_base = uap->buf;
  137         aiov.iov_len = uap->nbyte;
  138         auio.uio_iov = &aiov;
  139         auio.uio_iovcnt = 1;
  140         auio.uio_resid = uap->nbyte;
  141         auio.uio_segflg = UIO_USERSPACE;
  142         error = kern_preadv(td, uap->fd, &auio, uap->offset);
  143         return(error);
  144 }
  145 
  146 int
  147 freebsd6_pread(td, uap)
  148         struct thread *td;
  149         struct freebsd6_pread_args *uap;
  150 {
  151         struct pread_args oargs;
  152 
  153         oargs.fd = uap->fd;
  154         oargs.buf = uap->buf;
  155         oargs.nbyte = uap->nbyte;
  156         oargs.offset = uap->offset;
  157         return (pread(td, &oargs));
  158 }
  159 
  160 /*
  161  * Scatter read system call.
  162  */
  163 #ifndef _SYS_SYSPROTO_H_
  164 struct readv_args {
  165         int     fd;
  166         struct  iovec *iovp;
  167         u_int   iovcnt;
  168 };
  169 #endif
  170 int
  171 readv(struct thread *td, struct readv_args *uap)
  172 {
  173         struct uio *auio;
  174         int error;
  175 
  176         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  177         if (error)
  178                 return (error);
  179         error = kern_readv(td, uap->fd, auio);
  180         free(auio, M_IOV);
  181         return (error);
  182 }
  183 
  184 int
  185 kern_readv(struct thread *td, int fd, struct uio *auio)
  186 {
  187         struct file *fp;
  188         int error;
  189 
  190         error = fget_read(td, fd, &fp);
  191         if (error)
  192                 return (error);
  193         error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
  194         fdrop(fp, td);
  195         return (error);
  196 }
  197 
  198 /*
  199  * Scatter positioned read system call.
  200  */
  201 #ifndef _SYS_SYSPROTO_H_
  202 struct preadv_args {
  203         int     fd;
  204         struct  iovec *iovp;
  205         u_int   iovcnt;
  206         off_t   offset;
  207 };
  208 #endif
  209 int
  210 preadv(struct thread *td, struct preadv_args *uap)
  211 {
  212         struct uio *auio;
  213         int error;
  214 
  215         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  216         if (error)
  217                 return (error);
  218         error = kern_preadv(td, uap->fd, auio, uap->offset);
  219         free(auio, M_IOV);
  220         return (error);
  221 }
  222 
  223 int
  224 kern_preadv(td, fd, auio, offset)
  225         struct thread *td;
  226         int fd;
  227         struct uio *auio;
  228         off_t offset;
  229 {
  230         struct file *fp;
  231         int error;
  232 
  233         error = fget_read(td, fd, &fp);
  234         if (error)
  235                 return (error);
  236         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  237                 error = ESPIPE;
  238         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  239                 error = EINVAL;
  240         else
  241                 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
  242         fdrop(fp, td);
  243         return (error);
  244 }
  245 
  246 /*
  247  * Common code for readv and preadv that reads data in
  248  * from a file using the passed in uio, offset, and flags.
  249  */
  250 static int
  251 dofileread(td, fd, fp, auio, offset, flags)
  252         struct thread *td;
  253         int fd;
  254         struct file *fp;
  255         struct uio *auio;
  256         off_t offset;
  257         int flags;
  258 {
  259         ssize_t cnt;
  260         int error;
  261 #ifdef KTRACE
  262         struct uio *ktruio = NULL;
  263 #endif
  264 
  265         /* Finish zero length reads right here */
  266         if (auio->uio_resid == 0) {
  267                 td->td_retval[0] = 0;
  268                 return(0);
  269         }
  270         auio->uio_rw = UIO_READ;
  271         auio->uio_offset = offset;
  272         auio->uio_td = td;
  273 #ifdef KTRACE
  274         if (KTRPOINT(td, KTR_GENIO)) 
  275                 ktruio = cloneuio(auio);
  276 #endif
  277         cnt = auio->uio_resid;
  278         if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
  279                 if (auio->uio_resid != cnt && (error == ERESTART ||
  280                     error == EINTR || error == EWOULDBLOCK))
  281                         error = 0;
  282         }
  283         cnt -= auio->uio_resid;
  284 #ifdef KTRACE
  285         if (ktruio != NULL) {
  286                 ktruio->uio_resid = cnt;
  287                 ktrgenio(fd, UIO_READ, ktruio, error);
  288         }
  289 #endif
  290         td->td_retval[0] = cnt;
  291         return (error);
  292 }
  293 
  294 #ifndef _SYS_SYSPROTO_H_
  295 struct write_args {
  296         int     fd;
  297         const void *buf;
  298         size_t  nbyte;
  299 };
  300 #endif
  301 int
  302 write(td, uap)
  303         struct thread *td;
  304         struct write_args *uap;
  305 {
  306         struct uio auio;
  307         struct iovec aiov;
  308         int error;
  309 
  310         if (uap->nbyte > INT_MAX)
  311                 return (EINVAL);
  312         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  313         aiov.iov_len = uap->nbyte;
  314         auio.uio_iov = &aiov;
  315         auio.uio_iovcnt = 1;
  316         auio.uio_resid = uap->nbyte;
  317         auio.uio_segflg = UIO_USERSPACE;
  318         error = kern_writev(td, uap->fd, &auio);
  319         return(error);
  320 }
  321 
  322 /*
  323  * Positioned write system call.
  324  */
  325 #ifndef _SYS_SYSPROTO_H_
  326 struct pwrite_args {
  327         int     fd;
  328         const void *buf;
  329         size_t  nbyte;
  330         int     pad;
  331         off_t   offset;
  332 };
  333 #endif
  334 int
  335 pwrite(td, uap)
  336         struct thread *td;
  337         struct pwrite_args *uap;
  338 {
  339         struct uio auio;
  340         struct iovec aiov;
  341         int error;
  342 
  343         if (uap->nbyte > INT_MAX)
  344                 return (EINVAL);
  345         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  346         aiov.iov_len = uap->nbyte;
  347         auio.uio_iov = &aiov;
  348         auio.uio_iovcnt = 1;
  349         auio.uio_resid = uap->nbyte;
  350         auio.uio_segflg = UIO_USERSPACE;
  351         error = kern_pwritev(td, uap->fd, &auio, uap->offset);
  352         return(error);
  353 }
  354 
  355 int
  356 freebsd6_pwrite(td, uap)
  357         struct thread *td;
  358         struct freebsd6_pwrite_args *uap;
  359 {
  360         struct pwrite_args oargs;
  361 
  362         oargs.fd = uap->fd;
  363         oargs.buf = uap->buf;
  364         oargs.nbyte = uap->nbyte;
  365         oargs.offset = uap->offset;
  366         return (pwrite(td, &oargs));
  367 }
  368 
  369 /*
  370  * Gather write system call.
  371  */
  372 #ifndef _SYS_SYSPROTO_H_
  373 struct writev_args {
  374         int     fd;
  375         struct  iovec *iovp;
  376         u_int   iovcnt;
  377 };
  378 #endif
  379 int
  380 writev(struct thread *td, struct writev_args *uap)
  381 {
  382         struct uio *auio;
  383         int error;
  384 
  385         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  386         if (error)
  387                 return (error);
  388         error = kern_writev(td, uap->fd, auio);
  389         free(auio, M_IOV);
  390         return (error);
  391 }
  392 
  393 int
  394 kern_writev(struct thread *td, int fd, struct uio *auio)
  395 {
  396         struct file *fp;
  397         int error;
  398 
  399         error = fget_write(td, fd, &fp);
  400         if (error)
  401                 return (error);
  402         error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
  403         fdrop(fp, td);
  404         return (error);
  405 }
  406 
  407 /*
  408  * Gather positioned write system call.
  409  */
  410 #ifndef _SYS_SYSPROTO_H_
  411 struct pwritev_args {
  412         int     fd;
  413         struct  iovec *iovp;
  414         u_int   iovcnt;
  415         off_t   offset;
  416 };
  417 #endif
  418 int
  419 pwritev(struct thread *td, struct pwritev_args *uap)
  420 {
  421         struct uio *auio;
  422         int error;
  423 
  424         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  425         if (error)
  426                 return (error);
  427         error = kern_pwritev(td, uap->fd, auio, uap->offset);
  428         free(auio, M_IOV);
  429         return (error);
  430 }
  431 
  432 int
  433 kern_pwritev(td, fd, auio, offset)
  434         struct thread *td;
  435         struct uio *auio;
  436         int fd;
  437         off_t offset;
  438 {
  439         struct file *fp;
  440         int error;
  441 
  442         error = fget_write(td, fd, &fp);
  443         if (error)
  444                 return (error);
  445         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  446                 error = ESPIPE;
  447         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  448                 error = EINVAL;
  449         else
  450                 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
  451         fdrop(fp, td);
  452         return (error);
  453 }
  454 
  455 /*
  456  * Common code for writev and pwritev that writes data to
  457  * a file using the passed in uio, offset, and flags.
  458  */
  459 static int
  460 dofilewrite(td, fd, fp, auio, offset, flags)
  461         struct thread *td;
  462         int fd;
  463         struct file *fp;
  464         struct uio *auio;
  465         off_t offset;
  466         int flags;
  467 {
  468         ssize_t cnt;
  469         int error;
  470 #ifdef KTRACE
  471         struct uio *ktruio = NULL;
  472 #endif
  473 
  474         auio->uio_rw = UIO_WRITE;
  475         auio->uio_td = td;
  476         auio->uio_offset = offset;
  477 #ifdef KTRACE
  478         if (KTRPOINT(td, KTR_GENIO))
  479                 ktruio = cloneuio(auio);
  480 #endif
  481         cnt = auio->uio_resid;
  482         if (fp->f_type == DTYPE_VNODE)
  483                 bwillwrite();
  484         if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
  485                 if (auio->uio_resid != cnt && (error == ERESTART ||
  486                     error == EINTR || error == EWOULDBLOCK))
  487                         error = 0;
  488                 /* Socket layer is responsible for issuing SIGPIPE. */
  489                 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
  490                         PROC_LOCK(td->td_proc);
  491                         psignal(td->td_proc, SIGPIPE);
  492                         PROC_UNLOCK(td->td_proc);
  493                 }
  494         }
  495         cnt -= auio->uio_resid;
  496 #ifdef KTRACE
  497         if (ktruio != NULL) {
  498                 ktruio->uio_resid = cnt;
  499                 ktrgenio(fd, UIO_WRITE, ktruio, error);
  500         }
  501 #endif
  502         td->td_retval[0] = cnt;
  503         return (error);
  504 }
  505 
  506 #ifndef _SYS_SYSPROTO_H_
  507 struct ioctl_args {
  508         int     fd;
  509         u_long  com;
  510         caddr_t data;
  511 };
  512 #endif
  513 /* ARGSUSED */
  514 int
  515 ioctl(struct thread *td, struct ioctl_args *uap)
  516 {
  517         u_long com;
  518         int arg, error;
  519         u_int size;
  520         caddr_t data;
  521 
  522         if (uap->com > 0xffffffff) {
  523                 printf(
  524                     "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
  525                     td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
  526                 uap->com &= 0xffffffff;
  527         }
  528         com = uap->com;
  529 
  530         /*
  531          * Interpret high order word to find amount of data to be
  532          * copied to/from the user's address space.
  533          */
  534         size = IOCPARM_LEN(com);
  535         if ((size > IOCPARM_MAX) ||
  536             ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
  537 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
  538             ((com & IOC_OUT) && size == 0) ||
  539 #else
  540             ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
  541 #endif
  542             ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
  543                 return (ENOTTY);
  544 
  545         if (size > 0) {
  546                 if (!(com & IOC_VOID))
  547                         data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  548                 else {
  549                         /* Integer argument. */
  550                         arg = (intptr_t)uap->data;
  551                         data = (void *)&arg;
  552                         size = 0;
  553                 }
  554         } else
  555                 data = (void *)&uap->data;
  556         if (com & IOC_IN) {
  557                 error = copyin(uap->data, data, (u_int)size);
  558                 if (error) {
  559                         if (size > 0)
  560                                 free(data, M_IOCTLOPS);
  561                         return (error);
  562                 }
  563         } else if (com & IOC_OUT) {
  564                 /*
  565                  * Zero the buffer so the user always
  566                  * gets back something deterministic.
  567                  */
  568                 bzero(data, size);
  569         }
  570 
  571         error = kern_ioctl(td, uap->fd, com, data);
  572 
  573         if (error == 0 && (com & IOC_OUT))
  574                 error = copyout(data, uap->data, (u_int)size);
  575 
  576         if (size > 0)
  577                 free(data, M_IOCTLOPS);
  578         return (error);
  579 }
  580 
  581 int
  582 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
  583 {
  584         struct file *fp;
  585         struct filedesc *fdp;
  586         int error;
  587         int tmp;
  588 
  589         if ((error = fget(td, fd, &fp)) != 0)
  590                 return (error);
  591         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  592                 fdrop(fp, td);
  593                 return (EBADF);
  594         }
  595         fdp = td->td_proc->p_fd;
  596         switch (com) {
  597         case FIONCLEX:
  598                 FILEDESC_XLOCK(fdp);
  599                 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
  600                 FILEDESC_XUNLOCK(fdp);
  601                 goto out;
  602         case FIOCLEX:
  603                 FILEDESC_XLOCK(fdp);
  604                 fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
  605                 FILEDESC_XUNLOCK(fdp);
  606                 goto out;
  607         case FIONBIO:
  608                 FILE_LOCK(fp);
  609                 if ((tmp = *(int *)data))
  610                         fp->f_flag |= FNONBLOCK;
  611                 else
  612                         fp->f_flag &= ~FNONBLOCK;
  613                 FILE_UNLOCK(fp);
  614                 data = (void *)&tmp;
  615                 break;
  616         case FIOASYNC:
  617                 FILE_LOCK(fp);
  618                 if ((tmp = *(int *)data))
  619                         fp->f_flag |= FASYNC;
  620                 else
  621                         fp->f_flag &= ~FASYNC;
  622                 FILE_UNLOCK(fp);
  623                 data = (void *)&tmp;
  624                 break;
  625         }
  626 
  627         error = fo_ioctl(fp, com, data, td->td_ucred, td);
  628 out:
  629         fdrop(fp, td);
  630         return (error);
  631 }
  632 
  633 /*
  634  * sellock and selwait are initialized in selectinit() via SYSINIT.
  635  */
  636 struct mtx      sellock;
  637 struct cv       selwait;
  638 u_int           nselcoll;       /* Select collisions since boot */
  639 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  640 
  641 int
  642 poll_no_poll(int events)
  643 {
  644         /*
  645          * Return true for read/write.  If the user asked for something
  646          * special, return POLLNVAL, so that clients have a way of
  647          * determining reliably whether or not the extended
  648          * functionality is present without hard-coding knowledge
  649          * of specific filesystem implementations.
  650          */
  651         if (events & ~POLLSTANDARD)
  652                 return (POLLNVAL);
  653 
  654         return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
  655 }
  656 
  657 #ifndef _SYS_SYSPROTO_H_
  658 struct select_args {
  659         int     nd;
  660         fd_set  *in, *ou, *ex;
  661         struct  timeval *tv;
  662 };
  663 #endif
  664 int
  665 select(td, uap)
  666         register struct thread *td;
  667         register struct select_args *uap;
  668 {
  669         struct timeval tv, *tvp;
  670         int error;
  671 
  672         if (uap->tv != NULL) {
  673                 error = copyin(uap->tv, &tv, sizeof(tv));
  674                 if (error)
  675                         return (error);
  676                 tvp = &tv;
  677         } else
  678                 tvp = NULL;
  679 
  680         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  681 }
  682 
  683 int
  684 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  685     fd_set *fd_ex, struct timeval *tvp)
  686 {
  687         struct filedesc *fdp;
  688         /*
  689          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  690          * infds with the new FD_SETSIZE of 1024, and more than enough for
  691          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  692          * of 256.
  693          */
  694         fd_mask s_selbits[howmany(2048, NFDBITS)];
  695         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  696         struct timeval atv, rtv, ttv;
  697         int error, timo;
  698         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  699 
  700         if (nd < 0)
  701                 return (EINVAL);
  702         fdp = td->td_proc->p_fd;
  703         
  704         FILEDESC_SLOCK(fdp);
  705         if (nd > td->td_proc->p_fd->fd_nfiles)
  706                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  707         FILEDESC_SUNLOCK(fdp);
  708 
  709         /*
  710          * Allocate just enough bits for the non-null fd_sets.  Use the
  711          * preallocated auto buffer if possible.
  712          */
  713         nfdbits = roundup(nd, NFDBITS);
  714         ncpbytes = nfdbits / NBBY;
  715         nbufbytes = 0;
  716         if (fd_in != NULL)
  717                 nbufbytes += 2 * ncpbytes;
  718         if (fd_ou != NULL)
  719                 nbufbytes += 2 * ncpbytes;
  720         if (fd_ex != NULL)
  721                 nbufbytes += 2 * ncpbytes;
  722         if (nbufbytes <= sizeof s_selbits)
  723                 selbits = &s_selbits[0];
  724         else
  725                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  726 
  727         /*
  728          * Assign pointers into the bit buffers and fetch the input bits.
  729          * Put the output buffers together so that they can be bzeroed
  730          * together.
  731          */
  732         sbp = selbits;
  733 #define getbits(name, x) \
  734         do {                                                            \
  735                 if (name == NULL)                                       \
  736                         ibits[x] = NULL;                                \
  737                 else {                                                  \
  738                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  739                         obits[x] = sbp;                                 \
  740                         sbp += ncpbytes / sizeof *sbp;                  \
  741                         error = copyin(name, ibits[x], ncpbytes);       \
  742                         if (error != 0)                                 \
  743                                 goto done_nosellock;                    \
  744                 }                                                       \
  745         } while (0)
  746         getbits(fd_in, 0);
  747         getbits(fd_ou, 1);
  748         getbits(fd_ex, 2);
  749 #undef  getbits
  750         if (nbufbytes != 0)
  751                 bzero(selbits, nbufbytes / 2);
  752 
  753         if (tvp != NULL) {
  754                 atv = *tvp;
  755                 if (itimerfix(&atv)) {
  756                         error = EINVAL;
  757                         goto done_nosellock;
  758                 }
  759                 getmicrouptime(&rtv);
  760                 timevaladd(&atv, &rtv);
  761         } else {
  762                 atv.tv_sec = 0;
  763                 atv.tv_usec = 0;
  764         }
  765         timo = 0;
  766         TAILQ_INIT(&td->td_selq);
  767         mtx_lock(&sellock);
  768 retry:
  769         ncoll = nselcoll;
  770         thread_lock(td);
  771         td->td_flags |= TDF_SELECT;
  772         thread_unlock(td);
  773         mtx_unlock(&sellock);
  774 
  775         error = selscan(td, ibits, obits, nd);
  776         mtx_lock(&sellock);
  777         if (error || td->td_retval[0])
  778                 goto done;
  779         if (atv.tv_sec || atv.tv_usec) {
  780                 getmicrouptime(&rtv);
  781                 if (timevalcmp(&rtv, &atv, >=))
  782                         goto done;
  783                 ttv = atv;
  784                 timevalsub(&ttv, &rtv);
  785                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  786                     24 * 60 * 60 * hz : tvtohz(&ttv);
  787         }
  788 
  789         /*
  790          * An event of interest may occur while we do not hold
  791          * sellock, so check TDF_SELECT and the number of
  792          * collisions and rescan the file descriptors if
  793          * necessary.
  794          */
  795         thread_lock(td);
  796         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  797                 thread_unlock(td);
  798                 goto retry;
  799         }
  800         thread_unlock(td);
  801 
  802         if (timo > 0)
  803                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  804         else
  805                 error = cv_wait_sig(&selwait, &sellock);
  806         
  807         if (error == 0)
  808                 goto retry;
  809 
  810 done:
  811         clear_selinfo_list(td);
  812         thread_lock(td);
  813         td->td_flags &= ~TDF_SELECT;
  814         thread_unlock(td);
  815         mtx_unlock(&sellock);
  816 
  817 done_nosellock:
  818         /* select is not restarted after signals... */
  819         if (error == ERESTART)
  820                 error = EINTR;
  821         if (error == EWOULDBLOCK)
  822                 error = 0;
  823 #define putbits(name, x) \
  824         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  825                 error = error2;
  826         if (error == 0) {
  827                 int error2;
  828 
  829                 putbits(fd_in, 0);
  830                 putbits(fd_ou, 1);
  831                 putbits(fd_ex, 2);
  832 #undef putbits
  833         }
  834         if (selbits != &s_selbits[0])
  835                 free(selbits, M_SELECT);
  836 
  837         return (error);
  838 }
  839 
  840 static int
  841 selscan(td, ibits, obits, nfd)
  842         struct thread *td;
  843         fd_mask **ibits, **obits;
  844         int nfd;
  845 {
  846         int msk, i, fd;
  847         fd_mask bits;
  848         struct file *fp;
  849         int n = 0;
  850         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  851         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  852         struct filedesc *fdp = td->td_proc->p_fd;
  853 
  854         FILEDESC_SLOCK(fdp);
  855         for (msk = 0; msk < 3; msk++) {
  856                 if (ibits[msk] == NULL)
  857                         continue;
  858                 for (i = 0; i < nfd; i += NFDBITS) {
  859                         bits = ibits[msk][i/NFDBITS];
  860                         /* ffs(int mask) not portable, fd_mask is long */
  861                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  862                                 if (!(bits & 1))
  863                                         continue;
  864                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  865                                         FILEDESC_SUNLOCK(fdp);
  866                                         return (EBADF);
  867                                 }
  868                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  869                                     td)) {
  870                                         obits[msk][(fd)/NFDBITS] |=
  871                                             ((fd_mask)1 << ((fd) % NFDBITS));
  872                                         n++;
  873                                 }
  874                         }
  875                 }
  876         }
  877         FILEDESC_SUNLOCK(fdp);
  878         td->td_retval[0] = n;
  879         return (0);
  880 }
  881 
  882 #ifndef _SYS_SYSPROTO_H_
  883 struct poll_args {
  884         struct pollfd *fds;
  885         u_int   nfds;
  886         int     timeout;
  887 };
  888 #endif
  889 int
  890 poll(td, uap)
  891         struct thread *td;
  892         struct poll_args *uap;
  893 {
  894         struct pollfd *bits;
  895         struct pollfd smallbits[32];
  896         struct timeval atv, rtv, ttv;
  897         int error = 0, timo;
  898         u_int ncoll, nfds;
  899         size_t ni;
  900 
  901         nfds = uap->nfds;
  902 
  903         /*
  904          * This is kinda bogus.  We have fd limits, but that is not
  905          * really related to the size of the pollfd array.  Make sure
  906          * we let the process use at least FD_SETSIZE entries and at
  907          * least enough for the current limits.  We want to be reasonably
  908          * safe, but not overly restrictive.
  909          */
  910         PROC_LOCK(td->td_proc);
  911         if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
  912             (nfds > FD_SETSIZE)) {
  913                 PROC_UNLOCK(td->td_proc);
  914                 error = EINVAL;
  915                 goto done2;
  916         }
  917         PROC_UNLOCK(td->td_proc);
  918         ni = nfds * sizeof(struct pollfd);
  919         if (ni > sizeof(smallbits))
  920                 bits = malloc(ni, M_TEMP, M_WAITOK);
  921         else
  922                 bits = smallbits;
  923         error = copyin(uap->fds, bits, ni);
  924         if (error)
  925                 goto done_nosellock;
  926         if (uap->timeout != INFTIM) {
  927                 atv.tv_sec = uap->timeout / 1000;
  928                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  929                 if (itimerfix(&atv)) {
  930                         error = EINVAL;
  931                         goto done_nosellock;
  932                 }
  933                 getmicrouptime(&rtv);
  934                 timevaladd(&atv, &rtv);
  935         } else {
  936                 atv.tv_sec = 0;
  937                 atv.tv_usec = 0;
  938         }
  939         timo = 0;
  940         TAILQ_INIT(&td->td_selq);
  941         mtx_lock(&sellock);
  942 retry:
  943         ncoll = nselcoll;
  944         thread_lock(td);
  945         td->td_flags |= TDF_SELECT;
  946         thread_unlock(td);
  947         mtx_unlock(&sellock);
  948 
  949         error = pollscan(td, bits, nfds);
  950         mtx_lock(&sellock);
  951         if (error || td->td_retval[0])
  952                 goto done;
  953         if (atv.tv_sec || atv.tv_usec) {
  954                 getmicrouptime(&rtv);
  955                 if (timevalcmp(&rtv, &atv, >=))
  956                         goto done;
  957                 ttv = atv;
  958                 timevalsub(&ttv, &rtv);
  959                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  960                     24 * 60 * 60 * hz : tvtohz(&ttv);
  961         }
  962         /*
  963          * An event of interest may occur while we do not hold
  964          * sellock, so check TDF_SELECT and the number of collisions
  965          * and rescan the file descriptors if necessary.
  966          */
  967         thread_lock(td);
  968         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  969                 thread_unlock(td);
  970                 goto retry;
  971         }
  972         thread_unlock(td);
  973 
  974         if (timo > 0)
  975                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  976         else
  977                 error = cv_wait_sig(&selwait, &sellock);
  978 
  979         if (error == 0)
  980                 goto retry;
  981 
  982 done:
  983         clear_selinfo_list(td);
  984         thread_lock(td);
  985         td->td_flags &= ~TDF_SELECT;
  986         thread_unlock(td);
  987         mtx_unlock(&sellock);
  988 
  989 done_nosellock:
  990         /* poll is not restarted after signals... */
  991         if (error == ERESTART)
  992                 error = EINTR;
  993         if (error == EWOULDBLOCK)
  994                 error = 0;
  995         if (error == 0) {
  996                 error = pollout(bits, uap->fds, nfds);
  997                 if (error)
  998                         goto out;
  999         }
 1000 out:
 1001         if (ni > sizeof(smallbits))
 1002                 free(bits, M_TEMP);
 1003 done2:
 1004         return (error);
 1005 }
 1006 
 1007 static int
 1008 pollout(fds, ufds, nfd)
 1009         struct pollfd *fds;
 1010         struct pollfd *ufds;
 1011         u_int nfd;
 1012 {
 1013         int error = 0;
 1014         u_int i = 0;
 1015 
 1016         for (i = 0; i < nfd; i++) {
 1017                 error = copyout(&fds->revents, &ufds->revents,
 1018                     sizeof(ufds->revents));
 1019                 if (error)
 1020                         return (error);
 1021                 fds++;
 1022                 ufds++;
 1023         }
 1024         return (0);
 1025 }
 1026 
 1027 static int
 1028 pollscan(td, fds, nfd)
 1029         struct thread *td;
 1030         struct pollfd *fds;
 1031         u_int nfd;
 1032 {
 1033         register struct filedesc *fdp = td->td_proc->p_fd;
 1034         int i;
 1035         struct file *fp;
 1036         int n = 0;
 1037 
 1038         FILEDESC_SLOCK(fdp);
 1039         for (i = 0; i < nfd; i++, fds++) {
 1040                 if (fds->fd >= fdp->fd_nfiles) {
 1041                         fds->revents = POLLNVAL;
 1042                         n++;
 1043                 } else if (fds->fd < 0) {
 1044                         fds->revents = 0;
 1045                 } else {
 1046                         fp = fdp->fd_ofiles[fds->fd];
 1047                         if (fp == NULL) {
 1048                                 fds->revents = POLLNVAL;
 1049                                 n++;
 1050                         } else {
 1051                                 /*
 1052                                  * Note: backend also returns POLLHUP and
 1053                                  * POLLERR if appropriate.
 1054                                  */
 1055                                 fds->revents = fo_poll(fp, fds->events,
 1056                                     td->td_ucred, td);
 1057                                 if (fds->revents != 0)
 1058                                         n++;
 1059                         }
 1060                 }
 1061         }
 1062         FILEDESC_SUNLOCK(fdp);
 1063         td->td_retval[0] = n;
 1064         return (0);
 1065 }
 1066 
 1067 /*
 1068  * OpenBSD poll system call.
 1069  *
 1070  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1071  */
 1072 #ifndef _SYS_SYSPROTO_H_
 1073 struct openbsd_poll_args {
 1074         struct pollfd *fds;
 1075         u_int   nfds;
 1076         int     timeout;
 1077 };
 1078 #endif
 1079 int
 1080 openbsd_poll(td, uap)
 1081         register struct thread *td;
 1082         register struct openbsd_poll_args *uap;
 1083 {
 1084         return (poll(td, (struct poll_args *)uap));
 1085 }
 1086 
 1087 /*
 1088  * Remove the references to the thread from all of the objects we were
 1089  * polling.
 1090  *
 1091  * This code assumes that the underlying owner of the selinfo structure will
 1092  * hold sellock before it changes it, and that it will unlink itself from our
 1093  * list if it goes away.
 1094  */
 1095 void
 1096 clear_selinfo_list(td)
 1097         struct thread *td;
 1098 {
 1099         struct selinfo *si;
 1100 
 1101         mtx_assert(&sellock, MA_OWNED);
 1102         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1103                 si->si_thread = NULL;
 1104         TAILQ_INIT(&td->td_selq);
 1105 }
 1106 
 1107 /*
 1108  * Record a select request.
 1109  */
 1110 void
 1111 selrecord(selector, sip)
 1112         struct thread *selector;
 1113         struct selinfo *sip;
 1114 {
 1115 
 1116         mtx_lock(&sellock);
 1117         /*
 1118          * If the selinfo's thread pointer is NULL then take ownership of it.
 1119          *
 1120          * If the thread pointer is not NULL and it points to another
 1121          * thread, then we have a collision.
 1122          *
 1123          * If the thread pointer is not NULL and points back to us then leave
 1124          * it alone as we've already added pointed it at us and added it to
 1125          * our list.
 1126          */
 1127         if (sip->si_thread == NULL) {
 1128                 sip->si_thread = selector;
 1129                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1130         } else if (sip->si_thread != selector) {
 1131                 sip->si_flags |= SI_COLL;
 1132         }
 1133 
 1134         mtx_unlock(&sellock);
 1135 }
 1136 
 1137 /* Wake up a selecting thread. */
 1138 void
 1139 selwakeup(sip)
 1140         struct selinfo *sip;
 1141 {
 1142         doselwakeup(sip, -1);
 1143 }
 1144 
 1145 /* Wake up a selecting thread, and set its priority. */
 1146 void
 1147 selwakeuppri(sip, pri)
 1148         struct selinfo *sip;
 1149         int pri;
 1150 {
 1151         doselwakeup(sip, pri);
 1152 }
 1153 
 1154 /*
 1155  * Do a wakeup when a selectable event occurs.
 1156  */
 1157 static void
 1158 doselwakeup(sip, pri)
 1159         struct selinfo *sip;
 1160         int pri;
 1161 {
 1162         struct thread *td;
 1163 
 1164         mtx_lock(&sellock);
 1165         td = sip->si_thread;
 1166         if ((sip->si_flags & SI_COLL) != 0) {
 1167                 nselcoll++;
 1168                 sip->si_flags &= ~SI_COLL;
 1169                 cv_broadcastpri(&selwait, pri);
 1170         }
 1171         if (td == NULL) {
 1172                 mtx_unlock(&sellock);
 1173                 return;
 1174         }
 1175         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1176         sip->si_thread = NULL;
 1177         thread_lock(td);
 1178         td->td_flags &= ~TDF_SELECT;
 1179         thread_unlock(td);
 1180         sleepq_remove(td, &selwait);
 1181         mtx_unlock(&sellock);
 1182 }
 1183 
 1184 static void selectinit(void *);
 1185 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1186 
 1187 /* ARGSUSED*/
 1188 static void
 1189 selectinit(dummy)
 1190         void *dummy;
 1191 {
 1192         cv_init(&selwait, "select");
 1193         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1194 }

Cache object: b4b989f9ab5ac49408ddac3a4dd1d550


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.