The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_ktrace.h"
   41 
   42 #include <sys/param.h>
   43 #include <sys/systm.h>
   44 #include <sys/sysproto.h>
   45 #include <sys/filedesc.h>
   46 #include <sys/filio.h>
   47 #include <sys/fcntl.h>
   48 #include <sys/file.h>
   49 #include <sys/proc.h>
   50 #include <sys/signalvar.h>
   51 #include <sys/socketvar.h>
   52 #include <sys/uio.h>
   53 #include <sys/kernel.h>
   54 #include <sys/limits.h>
   55 #include <sys/malloc.h>
   56 #include <sys/poll.h>
   57 #include <sys/resourcevar.h>
   58 #include <sys/selinfo.h>
   59 #include <sys/sleepqueue.h>
   60 #include <sys/syscallsubr.h>
   61 #include <sys/sysctl.h>
   62 #include <sys/sysent.h>
   63 #include <sys/vnode.h>
   64 #include <sys/bio.h>
   65 #include <sys/buf.h>
   66 #include <sys/condvar.h>
   67 #ifdef KTRACE
   68 #include <sys/ktrace.h>
   69 #endif
   70 #include <vm/vm.h>
   71 #include <vm/vm_page.h>
   72 
   73 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   74 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   75 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   76 
   77 static int      pollscan(struct thread *, struct pollfd *, u_int);
   78 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   79 static int      dofileread(struct thread *, int, struct file *, struct uio *,
   80                     off_t, int);
   81 static int      dofilewrite(struct thread *, int, struct file *, struct uio *,
   82                     off_t, int);
   83 static void     doselwakeup(struct selinfo *, int);
   84 
   85 /*
   86  * Read system call.
   87  */
   88 #ifndef _SYS_SYSPROTO_H_
   89 struct read_args {
   90         int     fd;
   91         void    *buf;
   92         size_t  nbyte;
   93 };
   94 #endif
   95 /*
   96  * MPSAFE
   97  */
   98 int
   99 read(td, uap)
  100         struct thread *td;
  101         struct read_args *uap;
  102 {
  103         struct uio auio;
  104         struct iovec aiov;
  105         int error;
  106 
  107         if (uap->nbyte > INT_MAX)
  108                 return (EINVAL);
  109         aiov.iov_base = uap->buf;
  110         aiov.iov_len = uap->nbyte;
  111         auio.uio_iov = &aiov;
  112         auio.uio_iovcnt = 1;
  113         auio.uio_resid = uap->nbyte;
  114         auio.uio_segflg = UIO_USERSPACE;
  115         error = kern_readv(td, uap->fd, &auio);
  116         return(error);
  117 }
  118 
  119 /*
  120  * Positioned read system call
  121  */
  122 #ifndef _SYS_SYSPROTO_H_
  123 struct pread_args {
  124         int     fd;
  125         void    *buf;
  126         size_t  nbyte;
  127         int     pad;
  128         off_t   offset;
  129 };
  130 #endif
  131 /*
  132  * MPSAFE
  133  */
  134 int
  135 pread(td, uap)
  136         struct thread *td;
  137         struct pread_args *uap;
  138 {
  139         struct uio auio;
  140         struct iovec aiov;
  141         int error;
  142 
  143         if (uap->nbyte > INT_MAX)
  144                 return (EINVAL);
  145         aiov.iov_base = uap->buf;
  146         aiov.iov_len = uap->nbyte;
  147         auio.uio_iov = &aiov;
  148         auio.uio_iovcnt = 1;
  149         auio.uio_resid = uap->nbyte;
  150         auio.uio_segflg = UIO_USERSPACE;
  151         error = kern_preadv(td, uap->fd, &auio, uap->offset);
  152         return(error);
  153 }
  154 
  155 /*
  156  * Scatter read system call.
  157  */
  158 #ifndef _SYS_SYSPROTO_H_
  159 struct readv_args {
  160         int     fd;
  161         struct  iovec *iovp;
  162         u_int   iovcnt;
  163 };
  164 #endif
  165 /*
  166  * MPSAFE
  167  */
  168 int
  169 readv(struct thread *td, struct readv_args *uap)
  170 {
  171         struct uio *auio;
  172         int error;
  173 
  174         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  175         if (error)
  176                 return (error);
  177         error = kern_readv(td, uap->fd, auio);
  178         free(auio, M_IOV);
  179         return (error);
  180 }
  181 
  182 int
  183 kern_readv(struct thread *td, int fd, struct uio *auio)
  184 {
  185         struct file *fp;
  186         int error;
  187 
  188         error = fget_read(td, fd, &fp);
  189         if (error)
  190                 return (error);
  191         error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
  192         fdrop(fp, td);
  193         return (error);
  194 }
  195 
  196 /*
  197  * Scatter positioned read system call.
  198  */
  199 #ifndef _SYS_SYSPROTO_H_
  200 struct preadv_args {
  201         int     fd;
  202         struct  iovec *iovp;
  203         u_int   iovcnt;
  204         off_t   offset;
  205 };
  206 #endif
  207 /*
  208  * MPSAFE
  209  */
  210 int
  211 preadv(struct thread *td, struct preadv_args *uap)
  212 {
  213         struct uio *auio;
  214         int error;
  215 
  216         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  217         if (error)
  218                 return (error);
  219         error = kern_preadv(td, uap->fd, auio, uap->offset);
  220         free(auio, M_IOV);
  221         return (error);
  222 }
  223 
  224 int
  225 kern_preadv(td, fd, auio, offset)
  226         struct thread *td;
  227         int fd;
  228         struct uio *auio;
  229         off_t offset;
  230 {
  231         struct file *fp;
  232         int error;
  233 
  234         error = fget_read(td, fd, &fp);
  235         if (error)
  236                 return (error);
  237         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  238                 error = ESPIPE;
  239         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  240                 error = EINVAL;
  241         else
  242                 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
  243         fdrop(fp, td);
  244         return (error);
  245 }
  246 
  247 /*
  248  * Common code for readv and preadv that reads data in
  249  * from a file using the passed in uio, offset, and flags.
  250  */
  251 static int
  252 dofileread(td, fd, fp, auio, offset, flags)
  253         struct thread *td;
  254         int fd;
  255         struct file *fp;
  256         struct uio *auio;
  257         off_t offset;
  258         int flags;
  259 {
  260         ssize_t cnt;
  261         int error;
  262 #ifdef KTRACE
  263         struct uio *ktruio = NULL;
  264 #endif
  265 
  266         /* Finish zero length reads right here */
  267         if (auio->uio_resid == 0) {
  268                 td->td_retval[0] = 0;
  269                 return(0);
  270         }
  271         auio->uio_rw = UIO_READ;
  272         auio->uio_offset = offset;
  273         auio->uio_td = td;
  274 #ifdef KTRACE
  275         if (KTRPOINT(td, KTR_GENIO)) 
  276                 ktruio = cloneuio(auio);
  277 #endif
  278         cnt = auio->uio_resid;
  279         if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
  280                 if (auio->uio_resid != cnt && (error == ERESTART ||
  281                     error == EINTR || error == EWOULDBLOCK))
  282                         error = 0;
  283         }
  284         cnt -= auio->uio_resid;
  285 #ifdef KTRACE
  286         if (ktruio != NULL) {
  287                 ktruio->uio_resid = cnt;
  288                 ktrgenio(fd, UIO_READ, ktruio, error);
  289         }
  290 #endif
  291         td->td_retval[0] = cnt;
  292         return (error);
  293 }
  294 
  295 /*
  296  * Write system call
  297  */
  298 #ifndef _SYS_SYSPROTO_H_
  299 struct write_args {
  300         int     fd;
  301         const void *buf;
  302         size_t  nbyte;
  303 };
  304 #endif
  305 /*
  306  * MPSAFE
  307  */
  308 int
  309 write(td, uap)
  310         struct thread *td;
  311         struct write_args *uap;
  312 {
  313         struct uio auio;
  314         struct iovec aiov;
  315         int error;
  316 
  317         if (uap->nbyte > INT_MAX)
  318                 return (EINVAL);
  319         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  320         aiov.iov_len = uap->nbyte;
  321         auio.uio_iov = &aiov;
  322         auio.uio_iovcnt = 1;
  323         auio.uio_resid = uap->nbyte;
  324         auio.uio_segflg = UIO_USERSPACE;
  325         error = kern_writev(td, uap->fd, &auio);
  326         return(error);
  327 }
  328 
  329 /*
  330  * Positioned write system call
  331  */
  332 #ifndef _SYS_SYSPROTO_H_
  333 struct pwrite_args {
  334         int     fd;
  335         const void *buf;
  336         size_t  nbyte;
  337         int     pad;
  338         off_t   offset;
  339 };
  340 #endif
  341 /*
  342  * MPSAFE
  343  */
  344 int
  345 pwrite(td, uap)
  346         struct thread *td;
  347         struct pwrite_args *uap;
  348 {
  349         struct uio auio;
  350         struct iovec aiov;
  351         int error;
  352 
  353         if (uap->nbyte > INT_MAX)
  354                 return (EINVAL);
  355         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  356         aiov.iov_len = uap->nbyte;
  357         auio.uio_iov = &aiov;
  358         auio.uio_iovcnt = 1;
  359         auio.uio_resid = uap->nbyte;
  360         auio.uio_segflg = UIO_USERSPACE;
  361         error = kern_pwritev(td, uap->fd, &auio, uap->offset);
  362         return(error);
  363 }
  364 
  365 /*
  366  * Gather write system call
  367  */
  368 #ifndef _SYS_SYSPROTO_H_
  369 struct writev_args {
  370         int     fd;
  371         struct  iovec *iovp;
  372         u_int   iovcnt;
  373 };
  374 #endif
  375 /*
  376  * MPSAFE
  377  */
  378 int
  379 writev(struct thread *td, struct writev_args *uap)
  380 {
  381         struct uio *auio;
  382         int error;
  383 
  384         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  385         if (error)
  386                 return (error);
  387         error = kern_writev(td, uap->fd, auio);
  388         free(auio, M_IOV);
  389         return (error);
  390 }
  391 
  392 int
  393 kern_writev(struct thread *td, int fd, struct uio *auio)
  394 {
  395         struct file *fp;
  396         int error;
  397 
  398         error = fget_write(td, fd, &fp);
  399         if (error)
  400                 return (EBADF); /* XXX this can't be right */
  401         error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
  402         fdrop(fp, td);
  403         return (error);
  404 }
  405 
  406 /*
  407  * Gather positioned write system call
  408  */
  409 #ifndef _SYS_SYSPROTO_H_
  410 struct pwritev_args {
  411         int     fd;
  412         struct  iovec *iovp;
  413         u_int   iovcnt;
  414         off_t   offset;
  415 };
  416 #endif
  417 /*
  418  * MPSAFE
  419  */
  420 int
  421 pwritev(struct thread *td, struct pwritev_args *uap)
  422 {
  423         struct uio *auio;
  424         int error;
  425 
  426         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  427         if (error)
  428                 return (error);
  429         error = kern_pwritev(td, uap->fd, auio, uap->offset);
  430         free(auio, M_IOV);
  431         return (error);
  432 }
  433 
  434 int
  435 kern_pwritev(td, fd, auio, offset)
  436         struct thread *td;
  437         struct uio *auio;
  438         int fd;
  439         off_t offset;
  440 {
  441         struct file *fp;
  442         int error;
  443 
  444         error = fget_write(td, fd, &fp);
  445         if (error)
  446                 return (EBADF); /* XXX this can't be right */
  447         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  448                 error = ESPIPE;
  449         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  450                 error = EINVAL;
  451         else
  452                 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
  453         fdrop(fp, td);
  454         return (error);
  455 }
  456 
  457 /*
  458  * Common code for writev and pwritev that writes data to
  459  * a file using the passed in uio, offset, and flags.
  460  */
  461 static int
  462 dofilewrite(td, fd, fp, auio, offset, flags)
  463         struct thread *td;
  464         int fd;
  465         struct file *fp;
  466         struct uio *auio;
  467         off_t offset;
  468         int flags;
  469 {
  470         ssize_t cnt;
  471         int error;
  472 #ifdef KTRACE
  473         struct uio *ktruio = NULL;
  474 #endif
  475 
  476         auio->uio_rw = UIO_WRITE;
  477         auio->uio_td = td;
  478         auio->uio_offset = offset;
  479 #ifdef KTRACE
  480         if (KTRPOINT(td, KTR_GENIO))
  481                 ktruio = cloneuio(auio);
  482 #endif
  483         cnt = auio->uio_resid;
  484         if (fp->f_type == DTYPE_VNODE)
  485                 bwillwrite();
  486         if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
  487                 if (auio->uio_resid != cnt && (error == ERESTART ||
  488                     error == EINTR || error == EWOULDBLOCK))
  489                         error = 0;
  490                 /* Socket layer is responsible for issuing SIGPIPE. */
  491                 if (error == EPIPE) {
  492                         PROC_LOCK(td->td_proc);
  493                         psignal(td->td_proc, SIGPIPE);
  494                         PROC_UNLOCK(td->td_proc);
  495                 }
  496         }
  497         cnt -= auio->uio_resid;
  498 #ifdef KTRACE
  499         if (ktruio != NULL) {
  500                 ktruio->uio_resid = cnt;
  501                 ktrgenio(fd, UIO_WRITE, ktruio, error);
  502         }
  503 #endif
  504         td->td_retval[0] = cnt;
  505         return (error);
  506 }
  507 
  508 /*
  509  * Ioctl system call
  510  */
  511 #ifndef _SYS_SYSPROTO_H_
  512 struct ioctl_args {
  513         int     fd;
  514         u_long  com;
  515         caddr_t data;
  516 };
  517 #endif
  518 /*
  519  * MPSAFE
  520  */
  521 /* ARGSUSED */
  522 int
  523 ioctl(struct thread *td, struct ioctl_args *uap)
  524 {
  525         struct file *fp;
  526         struct filedesc *fdp;
  527         u_long com;
  528         int error = 0;
  529         u_int size;
  530         caddr_t data, memp;
  531         int tmp;
  532 
  533         if ((error = fget(td, uap->fd, &fp)) != 0)
  534                 return (error);
  535         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  536                 fdrop(fp, td);
  537                 return (EBADF);
  538         }
  539         fdp = td->td_proc->p_fd;
  540         switch (com = uap->com) {
  541         case FIONCLEX:
  542                 FILEDESC_LOCK_FAST(fdp);
  543                 fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
  544                 FILEDESC_UNLOCK_FAST(fdp);
  545                 fdrop(fp, td);
  546                 return (0);
  547         case FIOCLEX:
  548                 FILEDESC_LOCK_FAST(fdp);
  549                 fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
  550                 FILEDESC_UNLOCK_FAST(fdp);
  551                 fdrop(fp, td);
  552                 return (0);
  553         }
  554 
  555         /*
  556          * Interpret high order word to find amount of data to be
  557          * copied to/from the user's address space.
  558          */
  559         size = IOCPARM_LEN(com);
  560         if ((size > IOCPARM_MAX) ||
  561             ((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0)) {
  562                 fdrop(fp, td);
  563                 return (ENOTTY);
  564         }
  565 
  566         if (size > 0) {
  567                 memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  568                 data = memp;
  569         } else {
  570                 memp = NULL;
  571                 data = (void *)&uap->data;
  572         }
  573         if (com & IOC_IN) {
  574                 error = copyin(uap->data, data, (u_int)size);
  575                 if (error) {
  576                         free(memp, M_IOCTLOPS);
  577                         fdrop(fp, td);
  578                         return (error);
  579                 }
  580         } else if (com & IOC_OUT) {
  581                 /*
  582                  * Zero the buffer so the user always
  583                  * gets back something deterministic.
  584                  */
  585                 bzero(data, size);
  586         }
  587 
  588         if (com == FIONBIO) {
  589                 FILE_LOCK(fp);
  590                 if ((tmp = *(int *)data))
  591                         fp->f_flag |= FNONBLOCK;
  592                 else
  593                         fp->f_flag &= ~FNONBLOCK;
  594                 FILE_UNLOCK(fp);
  595                 data = (void *)&tmp;
  596         } else if (com == FIOASYNC) {
  597                 FILE_LOCK(fp);
  598                 if ((tmp = *(int *)data))
  599                         fp->f_flag |= FASYNC;
  600                 else
  601                         fp->f_flag &= ~FASYNC;
  602                 FILE_UNLOCK(fp);
  603                 data = (void *)&tmp;
  604         }
  605 
  606         error = fo_ioctl(fp, com, data, td->td_ucred, td);
  607 
  608         if (error == 0 && (com & IOC_OUT))
  609                 error = copyout(data, uap->data, (u_int)size);
  610 
  611         if (memp != NULL)
  612                 free(memp, M_IOCTLOPS);
  613         fdrop(fp, td);
  614         return (error);
  615 }
  616 
  617 /*
  618  * sellock and selwait are initialized in selectinit() via SYSINIT.
  619  */
  620 struct mtx      sellock;
  621 struct cv       selwait;
  622 u_int           nselcoll;       /* Select collisions since boot */
  623 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  624 
  625 /*
  626  * Select system call.
  627  */
  628 #ifndef _SYS_SYSPROTO_H_
  629 struct select_args {
  630         int     nd;
  631         fd_set  *in, *ou, *ex;
  632         struct  timeval *tv;
  633 };
  634 #endif
  635 /*
  636  * MPSAFE
  637  */
  638 int
  639 select(td, uap)
  640         register struct thread *td;
  641         register struct select_args *uap;
  642 {
  643         struct timeval tv, *tvp;
  644         int error;
  645 
  646         if (uap->tv != NULL) {
  647                 error = copyin(uap->tv, &tv, sizeof(tv));
  648                 if (error)
  649                         return (error);
  650                 tvp = &tv;
  651         } else
  652                 tvp = NULL;
  653 
  654         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  655 }
  656 
  657 int
  658 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  659     fd_set *fd_ex, struct timeval *tvp)
  660 {
  661         struct filedesc *fdp;
  662         /*
  663          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  664          * infds with the new FD_SETSIZE of 1024, and more than enough for
  665          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  666          * of 256.
  667          */
  668         fd_mask s_selbits[howmany(2048, NFDBITS)];
  669         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  670         struct timeval atv, rtv, ttv;
  671         int error, timo;
  672         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  673 
  674         if (nd < 0)
  675                 return (EINVAL);
  676         fdp = td->td_proc->p_fd;
  677         
  678         FILEDESC_LOCK_FAST(fdp);
  679 
  680         if (nd > td->td_proc->p_fd->fd_nfiles)
  681                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  682         FILEDESC_UNLOCK_FAST(fdp);
  683 
  684         /*
  685          * Allocate just enough bits for the non-null fd_sets.  Use the
  686          * preallocated auto buffer if possible.
  687          */
  688         nfdbits = roundup(nd, NFDBITS);
  689         ncpbytes = nfdbits / NBBY;
  690         nbufbytes = 0;
  691         if (fd_in != NULL)
  692                 nbufbytes += 2 * ncpbytes;
  693         if (fd_ou != NULL)
  694                 nbufbytes += 2 * ncpbytes;
  695         if (fd_ex != NULL)
  696                 nbufbytes += 2 * ncpbytes;
  697         if (nbufbytes <= sizeof s_selbits)
  698                 selbits = &s_selbits[0];
  699         else
  700                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  701 
  702         /*
  703          * Assign pointers into the bit buffers and fetch the input bits.
  704          * Put the output buffers together so that they can be bzeroed
  705          * together.
  706          */
  707         sbp = selbits;
  708 #define getbits(name, x) \
  709         do {                                                            \
  710                 if (name == NULL)                                       \
  711                         ibits[x] = NULL;                                \
  712                 else {                                                  \
  713                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  714                         obits[x] = sbp;                                 \
  715                         sbp += ncpbytes / sizeof *sbp;                  \
  716                         error = copyin(name, ibits[x], ncpbytes);       \
  717                         if (error != 0)                                 \
  718                                 goto done_nosellock;                    \
  719                 }                                                       \
  720         } while (0)
  721         getbits(fd_in, 0);
  722         getbits(fd_ou, 1);
  723         getbits(fd_ex, 2);
  724 #undef  getbits
  725         if (nbufbytes != 0)
  726                 bzero(selbits, nbufbytes / 2);
  727 
  728         if (tvp != NULL) {
  729                 atv = *tvp;
  730                 if (itimerfix(&atv)) {
  731                         error = EINVAL;
  732                         goto done_nosellock;
  733                 }
  734                 getmicrouptime(&rtv);
  735                 timevaladd(&atv, &rtv);
  736         } else {
  737                 atv.tv_sec = 0;
  738                 atv.tv_usec = 0;
  739         }
  740         timo = 0;
  741         TAILQ_INIT(&td->td_selq);
  742         mtx_lock(&sellock);
  743 retry:
  744         ncoll = nselcoll;
  745         mtx_lock_spin(&sched_lock);
  746         td->td_flags |= TDF_SELECT;
  747         mtx_unlock_spin(&sched_lock);
  748         mtx_unlock(&sellock);
  749 
  750         error = selscan(td, ibits, obits, nd);
  751         mtx_lock(&sellock);
  752         if (error || td->td_retval[0])
  753                 goto done;
  754         if (atv.tv_sec || atv.tv_usec) {
  755                 getmicrouptime(&rtv);
  756                 if (timevalcmp(&rtv, &atv, >=))
  757                         goto done;
  758                 ttv = atv;
  759                 timevalsub(&ttv, &rtv);
  760                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  761                     24 * 60 * 60 * hz : tvtohz(&ttv);
  762         }
  763 
  764         /*
  765          * An event of interest may occur while we do not hold
  766          * sellock, so check TDF_SELECT and the number of
  767          * collisions and rescan the file descriptors if
  768          * necessary.
  769          */
  770         mtx_lock_spin(&sched_lock);
  771         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  772                 mtx_unlock_spin(&sched_lock);
  773                 goto retry;
  774         }
  775         mtx_unlock_spin(&sched_lock);
  776 
  777         if (timo > 0)
  778                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  779         else
  780                 error = cv_wait_sig(&selwait, &sellock);
  781         
  782         if (error == 0)
  783                 goto retry;
  784 
  785 done:
  786         clear_selinfo_list(td);
  787         mtx_lock_spin(&sched_lock);
  788         td->td_flags &= ~TDF_SELECT;
  789         mtx_unlock_spin(&sched_lock);
  790         mtx_unlock(&sellock);
  791 
  792 done_nosellock:
  793         /* select is not restarted after signals... */
  794         if (error == ERESTART)
  795                 error = EINTR;
  796         if (error == EWOULDBLOCK)
  797                 error = 0;
  798 #define putbits(name, x) \
  799         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  800                 error = error2;
  801         if (error == 0) {
  802                 int error2;
  803 
  804                 putbits(fd_in, 0);
  805                 putbits(fd_ou, 1);
  806                 putbits(fd_ex, 2);
  807 #undef putbits
  808         }
  809         if (selbits != &s_selbits[0])
  810                 free(selbits, M_SELECT);
  811 
  812         return (error);
  813 }
  814 
  815 static int
  816 selscan(td, ibits, obits, nfd)
  817         struct thread *td;
  818         fd_mask **ibits, **obits;
  819         int nfd;
  820 {
  821         int msk, i, fd;
  822         fd_mask bits;
  823         struct file *fp;
  824         int n = 0;
  825         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  826         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  827         struct filedesc *fdp = td->td_proc->p_fd;
  828 
  829         FILEDESC_LOCK(fdp);
  830         for (msk = 0; msk < 3; msk++) {
  831                 if (ibits[msk] == NULL)
  832                         continue;
  833                 for (i = 0; i < nfd; i += NFDBITS) {
  834                         bits = ibits[msk][i/NFDBITS];
  835                         /* ffs(int mask) not portable, fd_mask is long */
  836                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  837                                 if (!(bits & 1))
  838                                         continue;
  839                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  840                                         FILEDESC_UNLOCK(fdp);
  841                                         return (EBADF);
  842                                 }
  843                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  844                                     td)) {
  845                                         obits[msk][(fd)/NFDBITS] |=
  846                                             ((fd_mask)1 << ((fd) % NFDBITS));
  847                                         n++;
  848                                 }
  849                         }
  850                 }
  851         }
  852         FILEDESC_UNLOCK(fdp);
  853         td->td_retval[0] = n;
  854         return (0);
  855 }
  856 
  857 /*
  858  * Poll system call.
  859  */
  860 #ifndef _SYS_SYSPROTO_H_
  861 struct poll_args {
  862         struct pollfd *fds;
  863         u_int   nfds;
  864         int     timeout;
  865 };
  866 #endif
  867 /*
  868  * MPSAFE
       *
       * Copy in the caller's pollfd array, scan it with pollscan() and, if
       * nothing is ready, sleep on selwait until woken, until the deadline
       * passes or until the sleep fails.  On success the whole array is
       * copied back out to uap->fds; the count of ready entries is whatever
       * pollscan() stored in td->td_retval[0].
       *
       * Returns 0 or an errno value: EINVAL when nfds exceeds both the
       * RLIMIT_NOFILE soft limit and FD_SETSIZE or when itimerfix() rejects
       * the timeout, the error from copyin()/copyout()/pollscan(), or the
       * error from the sleep (ERESTART is reported as EINTR, EWOULDBLOCK
       * as success).
  869  */
  870 int
  871 poll(td, uap)
  872         struct thread *td;
  873         struct poll_args *uap;
  874 {
  875         struct pollfd *bits;            /* working copy of the caller's array */
  876         struct pollfd smallbits[32];    /* stack buffer used when the array fits */
  877         struct timeval atv, rtv, ttv;   /* deadline, current uptime, time left */
  878         int error = 0, timo;
  879         u_int ncoll, nfds;
  880         size_t ni;                      /* size of the pollfd array in bytes */
  881 
  882         nfds = uap->nfds;
  883 
  884         /*
  885          * This is kinda bogus.  We have fd limits, but that is not
  886          * really related to the size of the pollfd array.  Make sure
  887          * we let the process use at least FD_SETSIZE entries and at
  888          * least enough for the current limits.  We want to be reasonably
  889          * safe, but not overly restrictive.
  890          */
  891         PROC_LOCK(td->td_proc);
  892         if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
  893             (nfds > FD_SETSIZE)) {
  894                 PROC_UNLOCK(td->td_proc);
  895                 error = EINVAL;
  896                 goto done2;
  897         }
  898         PROC_UNLOCK(td->td_proc);
              /* Use the stack buffer when the array fits, otherwise allocate. */
  899         ni = nfds * sizeof(struct pollfd);
  900         if (ni > sizeof(smallbits))
  901                 bits = malloc(ni, M_TEMP, M_WAITOK);
  902         else
  903                 bits = smallbits;
  904         error = copyin(uap->fds, bits, ni);
  905         if (error)
  906                 goto done_nosellock;
              /*
               * Turn the millisecond timeout into an absolute uptime deadline
               * in atv.  With INFTIM atv is left zeroed, which the code below
               * treats as "no deadline".
               */
  907         if (uap->timeout != INFTIM) {
  908                 atv.tv_sec = uap->timeout / 1000;
  909                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  910                 if (itimerfix(&atv)) {
  911                         error = EINVAL;
  912                         goto done_nosellock;
  913                 }
  914                 getmicrouptime(&rtv);
  915                 timevaladd(&atv, &rtv);
  916         } else {
  917                 atv.tv_sec = 0;
  918                 atv.tv_usec = 0;
  919         }
  920         timo = 0;
  921         TAILQ_INIT(&td->td_selq);
  922         mtx_lock(&sellock);
  923 retry:
              /*
               * Remember the collision count and set TDF_SELECT before
               * dropping sellock for the scan; both are rechecked below.
               */
  924         ncoll = nselcoll;
  925         mtx_lock_spin(&sched_lock);
  926         td->td_flags |= TDF_SELECT;
  927         mtx_unlock_spin(&sched_lock);
  928         mtx_unlock(&sellock);
  929 
  930         error = pollscan(td, bits, nfds);
  931         mtx_lock(&sellock);
              /* Finish on a scan error or as soon as any entry is ready. */
  932         if (error || td->td_retval[0])
  933                 goto done;
  934         if (atv.tv_sec || atv.tv_usec) {
  935                 getmicrouptime(&rtv);
                      /* Deadline already reached: return without sleeping. */
  936                 if (timevalcmp(&rtv, &atv, >=))
  937                         goto done;
  938                 ttv = atv;
  939                 timevalsub(&ttv, &rtv);
                      /* Sleep for the time left, capped at 24 * 60 * 60 * hz. */
  940                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  941                     24 * 60 * 60 * hz : tvtohz(&ttv);
  942         }
  943         /*
  944          * An event of interest may occur while we do not hold
  945          * sellock, so check TDF_SELECT and the number of collisions
  946          * and rescan the file descriptors if necessary.
  947          */
  948         mtx_lock_spin(&sched_lock);
  949         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  950                 mtx_unlock_spin(&sched_lock);
  951                 goto retry;
  952         }
  953         mtx_unlock_spin(&sched_lock);
  954 
              /* timo is still 0 when no deadline was set, so wait untimed. */
  955         if (timo > 0)
  956                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  957         else
  958                 error = cv_wait_sig(&selwait, &sellock);
  959 
              /* Woken without an error: scan again. */
  960         if (error == 0)
  961                 goto retry;
  962 
  963 done:
              /* Reached with sellock held: detach from all selinfos. */
  964         clear_selinfo_list(td);
  965         mtx_lock_spin(&sched_lock);
  966         td->td_flags &= ~TDF_SELECT;
  967         mtx_unlock_spin(&sched_lock);
  968         mtx_unlock(&sellock);
  969 
  970 done_nosellock:
  971         /* poll is not restarted after signals... */
  972         if (error == ERESTART)
  973                 error = EINTR;
              /* NOTE(review): presumably the timed wait expiring -- confirm. */
  974         if (error == EWOULDBLOCK)
  975                 error = 0;
  976         if (error == 0) {
  977                 error = copyout(bits, uap->fds, ni);
  978                 if (error)
  979                         goto out;
  980         }
  981 out:
              /* bits came from malloc() only when it did not fit in smallbits. */
  982         if (ni > sizeof(smallbits))
  983                 free(bits, M_TEMP);
  984 done2:
  985         return (error);
  986 }
  987 
  988 /*
  989  * Scan nfd pollfd entries once, filling in revents for each.  The count of
  990  * entries with something to report goes to td->td_retval[0]; returns 0.
  991  */
  992 static int
  993 pollscan(struct thread *td, struct pollfd *fds, u_int nfd)
  994 {
  995         struct filedesc *fdp = td->td_proc->p_fd;
  996         struct file *fp;
  997         int i, nready = 0;
  998 
  999         FILEDESC_LOCK(fdp);
 1000         for (i = 0; i < nfd; i++, fds++) {
 1001                 /* Descriptor number past the file table: invalid. */
 1002                 if (fds->fd >= fdp->fd_nfiles) {
 1003                         fds->revents = POLLNVAL;
 1004                         nready++;
 1005                         continue;
 1006                 }
 1007                 /* Negative descriptors are ignored and report nothing. */
 1008                 if (fds->fd < 0) {
 1009                         fds->revents = 0;
 1010                         continue;
 1011                 }
 1012                 fp = fdp->fd_ofiles[fds->fd];
 1013                 if (fp == NULL) {
 1014                         fds->revents = POLLNVAL;
 1015                         nready++;
 1016                         continue;
 1017                 }
 1018                 /* The backend also returns POLLHUP and POLLERR if appropriate. */
 1019                 fds->revents = fo_poll(fp, fds->events, td->td_ucred, td);
 1020                 if (fds->revents != 0)
 1021                         nready++;
 1022         }
 1023         FILEDESC_UNLOCK(fdp);
 1024         td->td_retval[0] = nready;
 1025         return (0);
 1026 }
 1027 
 1028 /*
 1029  * OpenBSD poll system call.
 1030  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
       *
       * The members mirror struct poll_args (same types, same order);
       * openbsd_poll() casts a pointer to this structure to
       * struct poll_args * and calls poll().
 1031  */
 1032 #ifndef _SYS_SYSPROTO_H_
 1033 struct openbsd_poll_args {
 1034         struct pollfd *fds;     /* pollfd array, as in struct poll_args */
 1035         u_int   nfds;           /* number of entries in fds */
 1036         int     timeout;        /* timeout, interpreted by poll() */
 1037 };
 1038 #endif
 1039 /*
 1040  * MPSAFE; thin alias that forwards its arguments to poll() unchanged.
 1041  */
 1042 int
 1043 openbsd_poll(struct thread *td, struct openbsd_poll_args *uap)
 1044 {
 1045         struct poll_args *pargs = (struct poll_args *)uap;
 1046 
 1047         return (poll(td, pargs));
 1048 }
 1049 
 1050 /*
 1051  * Detach the thread from every selinfo it is currently recorded on
 1052  * and leave its td_selq empty.
 1053  *
 1054  * Relies on the owner of each selinfo holding sellock before changing
 1055  * it, and on the selinfo unlinking itself from our list if it goes
 1056  * away.  The caller must hold sellock.
 1057  */
 1058 void
 1059 clear_selinfo_list(struct thread *td)
 1060 {
 1061         struct selinfo *entry;
 1062 
 1063         mtx_assert(&sellock, MA_OWNED);
 1064         TAILQ_FOREACH(entry, &td->td_selq, si_thrlist) {
 1065                 entry->si_thread = NULL;
 1066         }
 1067         TAILQ_INIT(&td->td_selq);
 1068 }
 1069 
 1070 /*
 1071  * Record a select request: thread `selector' wants to be woken for
 1072  * events on `sip'.
 1073  *
 1074  * With sellock held, one of three things happens:
 1075  *  - no thread is recorded on the selinfo: selector takes it over and
 1076  *    the selinfo is appended to selector's td_selq;
 1077  *  - a different thread is recorded: the selinfo is flagged SI_COLL;
 1078  *  - selector itself is already recorded: nothing changes, because
 1079  *    the selinfo was put on our list by an earlier call.
 1080  */
 1081 void
 1082 selrecord(struct thread *selector, struct selinfo *sip)
 1083 {
 1084         struct thread *owner;
 1085 
 1086         mtx_lock(&sellock);
 1087         owner = sip->si_thread;
 1088         if (owner == NULL) {
 1089                 /* Unowned: claim it and queue it on our list. */
 1090                 sip->si_thread = selector;
 1091                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1092         } else if (owner != selector) {
 1093                 /* Another thread is recorded: note the collision. */
 1094                 sip->si_flags |= SI_COLL;
 1095         }
 1096 
 1097         mtx_unlock(&sellock);
 1098 }
 1099 
 1100 /* Wake up a selecting thread; doselwakeup() is given -1 as the priority. */
 1101 void
 1102 selwakeup(struct selinfo *sip)
 1103 {
 1104 
 1105         doselwakeup(sip, -1);
 1106 }
 1107 
 1108 /*
 1109  * Wake up a selecting thread; pri is handed on to doselwakeup().
 1110  */
 1111 void
 1112 selwakeuppri(struct selinfo *sip, int pri)
 1113 {
 1114         doselwakeup(sip, pri);
 1115 }
 1116 
 1117 /*
 1118  * Do a wakeup when a selectable event occurs.
       *
       * Runs entirely under sellock.  If a collision was flagged on sip,
       * nselcoll is incremented, the flag is cleared and selwait is
       * broadcast with priority pri.  If a thread is recorded on sip, sip is
       * unlinked from that thread's td_selq, the thread's TDF_SELECT flag is
       * cleared and sleepq_remove() is called for it on selwait.
 1119  */
 1120 static void
 1121 doselwakeup(sip, pri)
 1122         struct selinfo *sip;
 1123         int pri;
 1124 {
 1125         struct thread *td;
 1126 
 1127         mtx_lock(&sellock);
 1128         td = sip->si_thread;
              /* A collision means several waiters: wake everyone on selwait. */
 1129         if ((sip->si_flags & SI_COLL) != 0) {
 1130                 nselcoll++;
 1131                 sip->si_flags &= ~SI_COLL;
 1132                 cv_broadcastpri(&selwait, pri);
 1133         }
              /* No thread recorded on this selinfo: nothing more to do. */
 1134         if (td == NULL) {
 1135                 mtx_unlock(&sellock);
 1136                 return;
 1137         }
 1138         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1139         sip->si_thread = NULL;
              /* poll() rescans when it finds TDF_SELECT cleared. */
 1140         mtx_lock_spin(&sched_lock);
 1141         td->td_flags &= ~TDF_SELECT;
 1142         mtx_unlock_spin(&sched_lock);
 1143         sleepq_remove(td, &selwait);
 1144         mtx_unlock(&sellock);
 1145 }
 1146 
 1147 static void selectinit(void *);
 1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1149 
 1150 /* ARGSUSED */
 1151 /* Create the selwait condition variable and the sellock mutex. */
 1152 static void
 1153 selectinit(void *dummy)
 1154 {
 1155         cv_init(&selwait, "select");
 1156         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1157 }

Cache object: 4887d6a62a5471d9ddbee5f746b9d5a8


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.