The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_compat.h"
   41 #include "opt_ktrace.h"
   42 
   43 #include <sys/param.h>
   44 #include <sys/systm.h>
   45 #include <sys/sysproto.h>
   46 #include <sys/filedesc.h>
   47 #include <sys/filio.h>
   48 #include <sys/fcntl.h>
   49 #include <sys/file.h>
   50 #include <sys/proc.h>
   51 #include <sys/signalvar.h>
   52 #include <sys/socketvar.h>
   53 #include <sys/uio.h>
   54 #include <sys/kernel.h>
   55 #include <sys/limits.h>
   56 #include <sys/malloc.h>
   57 #include <sys/poll.h>
   58 #include <sys/resourcevar.h>
   59 #include <sys/selinfo.h>
   60 #include <sys/sleepqueue.h>
   61 #include <sys/syscallsubr.h>
   62 #include <sys/sysctl.h>
   63 #include <sys/sysent.h>
   64 #include <sys/vnode.h>
   65 #include <sys/bio.h>
   66 #include <sys/buf.h>
   67 #include <sys/condvar.h>
   68 #ifdef KTRACE
   69 #include <sys/ktrace.h>
   70 #endif
   71 
   72 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   73 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   74 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   75 
   76 static int      pollscan(struct thread *, struct pollfd *, u_int);
   77 static int      selscan(struct thread *, fd_mask **, fd_mask **, int);
   78 static int      dofileread(struct thread *, int, struct file *, struct uio *,
   79                     off_t, int);
   80 static int      dofilewrite(struct thread *, int, struct file *, struct uio *,
   81                     off_t, int);
   82 static void     doselwakeup(struct selinfo *, int);
   83 
   84 #ifndef _SYS_SYSPROTO_H_
   85 struct read_args {
   86         int     fd;
   87         void    *buf;
   88         size_t  nbyte;
   89 };
   90 #endif
   91 int
   92 read(td, uap)
   93         struct thread *td;
   94         struct read_args *uap;
   95 {
   96         struct uio auio;
   97         struct iovec aiov;
   98         int error;
   99 
  100         if (uap->nbyte > INT_MAX)
  101                 return (EINVAL);
  102         aiov.iov_base = uap->buf;
  103         aiov.iov_len = uap->nbyte;
  104         auio.uio_iov = &aiov;
  105         auio.uio_iovcnt = 1;
  106         auio.uio_resid = uap->nbyte;
  107         auio.uio_segflg = UIO_USERSPACE;
  108         error = kern_readv(td, uap->fd, &auio);
  109         return(error);
  110 }
  111 
  112 /*
  113  * Positioned read system call
  114  */
  115 #ifndef _SYS_SYSPROTO_H_
  116 struct pread_args {
  117         int     fd;
  118         void    *buf;
  119         size_t  nbyte;
  120         int     pad;
  121         off_t   offset;
  122 };
  123 #endif
  124 int
  125 pread(td, uap)
  126         struct thread *td;
  127         struct pread_args *uap;
  128 {
  129         struct uio auio;
  130         struct iovec aiov;
  131         int error;
  132 
  133         if (uap->nbyte > INT_MAX)
  134                 return (EINVAL);
  135         aiov.iov_base = uap->buf;
  136         aiov.iov_len = uap->nbyte;
  137         auio.uio_iov = &aiov;
  138         auio.uio_iovcnt = 1;
  139         auio.uio_resid = uap->nbyte;
  140         auio.uio_segflg = UIO_USERSPACE;
  141         error = kern_preadv(td, uap->fd, &auio, uap->offset);
  142         return(error);
  143 }
  144 
  145 int
  146 freebsd6_pread(td, uap)
  147         struct thread *td;
  148         struct freebsd6_pread_args *uap;
  149 {
  150         struct pread_args oargs;
  151 
  152         oargs.fd = uap->fd;
  153         oargs.buf = uap->buf;
  154         oargs.nbyte = uap->nbyte;
  155         oargs.offset = uap->offset;
  156         return (pread(td, &oargs));
  157 }
  158 
  159 /*
  160  * Scatter read system call.
  161  */
  162 #ifndef _SYS_SYSPROTO_H_
  163 struct readv_args {
  164         int     fd;
  165         struct  iovec *iovp;
  166         u_int   iovcnt;
  167 };
  168 #endif
  169 int
  170 readv(struct thread *td, struct readv_args *uap)
  171 {
  172         struct uio *auio;
  173         int error;
  174 
  175         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  176         if (error)
  177                 return (error);
  178         error = kern_readv(td, uap->fd, auio);
  179         free(auio, M_IOV);
  180         return (error);
  181 }
  182 
  183 int
  184 kern_readv(struct thread *td, int fd, struct uio *auio)
  185 {
  186         struct file *fp;
  187         int error;
  188 
  189         error = fget_read(td, fd, &fp);
  190         if (error)
  191                 return (error);
  192         error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
  193         fdrop(fp, td);
  194         return (error);
  195 }
  196 
  197 /*
  198  * Scatter positioned read system call.
  199  */
  200 #ifndef _SYS_SYSPROTO_H_
  201 struct preadv_args {
  202         int     fd;
  203         struct  iovec *iovp;
  204         u_int   iovcnt;
  205         off_t   offset;
  206 };
  207 #endif
  208 int
  209 preadv(struct thread *td, struct preadv_args *uap)
  210 {
  211         struct uio *auio;
  212         int error;
  213 
  214         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  215         if (error)
  216                 return (error);
  217         error = kern_preadv(td, uap->fd, auio, uap->offset);
  218         free(auio, M_IOV);
  219         return (error);
  220 }
  221 
  222 int
  223 kern_preadv(td, fd, auio, offset)
  224         struct thread *td;
  225         int fd;
  226         struct uio *auio;
  227         off_t offset;
  228 {
  229         struct file *fp;
  230         int error;
  231 
  232         error = fget_read(td, fd, &fp);
  233         if (error)
  234                 return (error);
  235         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  236                 error = ESPIPE;
  237         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  238                 error = EINVAL;
  239         else
  240                 error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
  241         fdrop(fp, td);
  242         return (error);
  243 }
  244 
  245 /*
  246  * Common code for readv and preadv that reads data in
  247  * from a file using the passed in uio, offset, and flags.
  248  */
  249 static int
  250 dofileread(td, fd, fp, auio, offset, flags)
  251         struct thread *td;
  252         int fd;
  253         struct file *fp;
  254         struct uio *auio;
  255         off_t offset;
  256         int flags;
  257 {
  258         ssize_t cnt;
  259         int error;
  260 #ifdef KTRACE
  261         struct uio *ktruio = NULL;
  262 #endif
  263 
  264         /* Finish zero length reads right here */
  265         if (auio->uio_resid == 0) {
  266                 td->td_retval[0] = 0;
  267                 return(0);
  268         }
  269         auio->uio_rw = UIO_READ;
  270         auio->uio_offset = offset;
  271         auio->uio_td = td;
  272 #ifdef KTRACE
  273         if (KTRPOINT(td, KTR_GENIO)) 
  274                 ktruio = cloneuio(auio);
  275 #endif
  276         cnt = auio->uio_resid;
  277         if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
  278                 if (auio->uio_resid != cnt && (error == ERESTART ||
  279                     error == EINTR || error == EWOULDBLOCK))
  280                         error = 0;
  281         }
  282         cnt -= auio->uio_resid;
  283 #ifdef KTRACE
  284         if (ktruio != NULL) {
  285                 ktruio->uio_resid = cnt;
  286                 ktrgenio(fd, UIO_READ, ktruio, error);
  287         }
  288 #endif
  289         td->td_retval[0] = cnt;
  290         return (error);
  291 }
  292 
  293 #ifndef _SYS_SYSPROTO_H_
  294 struct write_args {
  295         int     fd;
  296         const void *buf;
  297         size_t  nbyte;
  298 };
  299 #endif
  300 int
  301 write(td, uap)
  302         struct thread *td;
  303         struct write_args *uap;
  304 {
  305         struct uio auio;
  306         struct iovec aiov;
  307         int error;
  308 
  309         if (uap->nbyte > INT_MAX)
  310                 return (EINVAL);
  311         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  312         aiov.iov_len = uap->nbyte;
  313         auio.uio_iov = &aiov;
  314         auio.uio_iovcnt = 1;
  315         auio.uio_resid = uap->nbyte;
  316         auio.uio_segflg = UIO_USERSPACE;
  317         error = kern_writev(td, uap->fd, &auio);
  318         return(error);
  319 }
  320 
  321 /*
  322  * Positioned write system call.
  323  */
  324 #ifndef _SYS_SYSPROTO_H_
  325 struct pwrite_args {
  326         int     fd;
  327         const void *buf;
  328         size_t  nbyte;
  329         int     pad;
  330         off_t   offset;
  331 };
  332 #endif
  333 int
  334 pwrite(td, uap)
  335         struct thread *td;
  336         struct pwrite_args *uap;
  337 {
  338         struct uio auio;
  339         struct iovec aiov;
  340         int error;
  341 
  342         if (uap->nbyte > INT_MAX)
  343                 return (EINVAL);
  344         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  345         aiov.iov_len = uap->nbyte;
  346         auio.uio_iov = &aiov;
  347         auio.uio_iovcnt = 1;
  348         auio.uio_resid = uap->nbyte;
  349         auio.uio_segflg = UIO_USERSPACE;
  350         error = kern_pwritev(td, uap->fd, &auio, uap->offset);
  351         return(error);
  352 }
  353 
  354 int
  355 freebsd6_pwrite(td, uap)
  356         struct thread *td;
  357         struct freebsd6_pwrite_args *uap;
  358 {
  359         struct pwrite_args oargs;
  360 
  361         oargs.fd = uap->fd;
  362         oargs.buf = uap->buf;
  363         oargs.nbyte = uap->nbyte;
  364         oargs.offset = uap->offset;
  365         return (pwrite(td, &oargs));
  366 }
  367 
  368 /*
  369  * Gather write system call.
  370  */
  371 #ifndef _SYS_SYSPROTO_H_
  372 struct writev_args {
  373         int     fd;
  374         struct  iovec *iovp;
  375         u_int   iovcnt;
  376 };
  377 #endif
  378 int
  379 writev(struct thread *td, struct writev_args *uap)
  380 {
  381         struct uio *auio;
  382         int error;
  383 
  384         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  385         if (error)
  386                 return (error);
  387         error = kern_writev(td, uap->fd, auio);
  388         free(auio, M_IOV);
  389         return (error);
  390 }
  391 
  392 int
  393 kern_writev(struct thread *td, int fd, struct uio *auio)
  394 {
  395         struct file *fp;
  396         int error;
  397 
  398         error = fget_write(td, fd, &fp);
  399         if (error)
  400                 return (error);
  401         error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
  402         fdrop(fp, td);
  403         return (error);
  404 }
  405 
  406 /*
  407  * Gather positioned write system call.
  408  */
  409 #ifndef _SYS_SYSPROTO_H_
  410 struct pwritev_args {
  411         int     fd;
  412         struct  iovec *iovp;
  413         u_int   iovcnt;
  414         off_t   offset;
  415 };
  416 #endif
  417 int
  418 pwritev(struct thread *td, struct pwritev_args *uap)
  419 {
  420         struct uio *auio;
  421         int error;
  422 
  423         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  424         if (error)
  425                 return (error);
  426         error = kern_pwritev(td, uap->fd, auio, uap->offset);
  427         free(auio, M_IOV);
  428         return (error);
  429 }
  430 
  431 int
  432 kern_pwritev(td, fd, auio, offset)
  433         struct thread *td;
  434         struct uio *auio;
  435         int fd;
  436         off_t offset;
  437 {
  438         struct file *fp;
  439         int error;
  440 
  441         error = fget_write(td, fd, &fp);
  442         if (error)
  443                 return (error);
  444         if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
  445                 error = ESPIPE;
  446         else if (offset < 0 && fp->f_vnode->v_type != VCHR)
  447                 error = EINVAL;
  448         else
  449                 error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
  450         fdrop(fp, td);
  451         return (error);
  452 }
  453 
  454 /*
  455  * Common code for writev and pwritev that writes data to
  456  * a file using the passed in uio, offset, and flags.
  457  */
  458 static int
  459 dofilewrite(td, fd, fp, auio, offset, flags)
  460         struct thread *td;
  461         int fd;
  462         struct file *fp;
  463         struct uio *auio;
  464         off_t offset;
  465         int flags;
  466 {
  467         ssize_t cnt;
  468         int error;
  469 #ifdef KTRACE
  470         struct uio *ktruio = NULL;
  471 #endif
  472 
  473         auio->uio_rw = UIO_WRITE;
  474         auio->uio_td = td;
  475         auio->uio_offset = offset;
  476 #ifdef KTRACE
  477         if (KTRPOINT(td, KTR_GENIO))
  478                 ktruio = cloneuio(auio);
  479 #endif
  480         cnt = auio->uio_resid;
  481         if (fp->f_type == DTYPE_VNODE)
  482                 bwillwrite();
  483         if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
  484                 if (auio->uio_resid != cnt && (error == ERESTART ||
  485                     error == EINTR || error == EWOULDBLOCK))
  486                         error = 0;
  487                 /* Socket layer is responsible for issuing SIGPIPE. */
  488                 if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
  489                         PROC_LOCK(td->td_proc);
  490                         psignal(td->td_proc, SIGPIPE);
  491                         PROC_UNLOCK(td->td_proc);
  492                 }
  493         }
  494         cnt -= auio->uio_resid;
  495 #ifdef KTRACE
  496         if (ktruio != NULL) {
  497                 ktruio->uio_resid = cnt;
  498                 ktrgenio(fd, UIO_WRITE, ktruio, error);
  499         }
  500 #endif
  501         td->td_retval[0] = cnt;
  502         return (error);
  503 }
  504 
  505 #ifndef _SYS_SYSPROTO_H_
  506 struct ioctl_args {
  507         int     fd;
  508         u_long  com;
  509         caddr_t data;
  510 };
  511 #endif
  512 /* ARGSUSED */
  513 int
  514 ioctl(struct thread *td, struct ioctl_args *uap)
  515 {
  516         u_long com;
  517         int arg, error;
  518         u_int size;
  519         caddr_t data;
  520 
  521         if (uap->com > 0xffffffff) {
  522                 printf(
  523                     "WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
  524                     td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
  525                 uap->com &= 0xffffffff;
  526         }
  527         com = uap->com;
  528 
  529         /*
  530          * Interpret high order word to find amount of data to be
  531          * copied to/from the user's address space.
  532          */
  533         size = IOCPARM_LEN(com);
  534         if ((size > IOCPARM_MAX) ||
  535             ((com & (IOC_VOID  | IOC_IN | IOC_OUT)) == 0) ||
  536 #if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
  537             ((com & IOC_OUT) && size == 0) ||
  538 #else
  539             ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
  540 #endif
  541             ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
  542                 return (ENOTTY);
  543 
  544         if (size > 0) {
  545                 if (!(com & IOC_VOID))
  546                         data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
  547                 else {
  548                         /* Integer argument. */
  549                         arg = (intptr_t)uap->data;
  550                         data = (void *)&arg;
  551                         size = 0;
  552                 }
  553         } else
  554                 data = (void *)&uap->data;
  555         if (com & IOC_IN) {
  556                 error = copyin(uap->data, data, (u_int)size);
  557                 if (error) {
  558                         if (size > 0)
  559                                 free(data, M_IOCTLOPS);
  560                         return (error);
  561                 }
  562         } else if (com & IOC_OUT) {
  563                 /*
  564                  * Zero the buffer so the user always
  565                  * gets back something deterministic.
  566                  */
  567                 bzero(data, size);
  568         }
  569 
  570         error = kern_ioctl(td, uap->fd, com, data);
  571 
  572         if (error == 0 && (com & IOC_OUT))
  573                 error = copyout(data, uap->data, (u_int)size);
  574 
  575         if (size > 0)
  576                 free(data, M_IOCTLOPS);
  577         return (error);
  578 }
  579 
  580 int
  581 kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
  582 {
  583         struct file *fp;
  584         struct filedesc *fdp;
  585         int error;
  586         int tmp;
  587 
  588         if ((error = fget(td, fd, &fp)) != 0)
  589                 return (error);
  590         if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
  591                 fdrop(fp, td);
  592                 return (EBADF);
  593         }
  594         fdp = td->td_proc->p_fd;
  595         switch (com) {
  596         case FIONCLEX:
  597                 FILEDESC_XLOCK(fdp);
  598                 fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
  599                 FILEDESC_XUNLOCK(fdp);
  600                 goto out;
  601         case FIOCLEX:
  602                 FILEDESC_XLOCK(fdp);
  603                 fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
  604                 FILEDESC_XUNLOCK(fdp);
  605                 goto out;
  606         case FIONBIO:
  607                 FILE_LOCK(fp);
  608                 if ((tmp = *(int *)data))
  609                         fp->f_flag |= FNONBLOCK;
  610                 else
  611                         fp->f_flag &= ~FNONBLOCK;
  612                 FILE_UNLOCK(fp);
  613                 data = (void *)&tmp;
  614                 break;
  615         case FIOASYNC:
  616                 FILE_LOCK(fp);
  617                 if ((tmp = *(int *)data))
  618                         fp->f_flag |= FASYNC;
  619                 else
  620                         fp->f_flag &= ~FASYNC;
  621                 FILE_UNLOCK(fp);
  622                 data = (void *)&tmp;
  623                 break;
  624         }
  625 
  626         error = fo_ioctl(fp, com, data, td->td_ucred, td);
  627 out:
  628         fdrop(fp, td);
  629         return (error);
  630 }
  631 
  632 /*
  633  * sellock and selwait are initialized in selectinit() via SYSINIT.
  634  */
  635 struct mtx      sellock;
  636 struct cv       selwait;
  637 u_int           nselcoll;       /* Select collisions since boot */
  638 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  639 
  640 #ifndef _SYS_SYSPROTO_H_
  641 struct select_args {
  642         int     nd;
  643         fd_set  *in, *ou, *ex;
  644         struct  timeval *tv;
  645 };
  646 #endif
  647 int
  648 select(td, uap)
  649         register struct thread *td;
  650         register struct select_args *uap;
  651 {
  652         struct timeval tv, *tvp;
  653         int error;
  654 
  655         if (uap->tv != NULL) {
  656                 error = copyin(uap->tv, &tv, sizeof(tv));
  657                 if (error)
  658                         return (error);
  659                 tvp = &tv;
  660         } else
  661                 tvp = NULL;
  662 
  663         return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp));
  664 }
  665 
  666 int
  667 kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
  668     fd_set *fd_ex, struct timeval *tvp)
  669 {
  670         struct filedesc *fdp;
  671         /*
  672          * The magic 2048 here is chosen to be just enough for FD_SETSIZE
  673          * infds with the new FD_SETSIZE of 1024, and more than enough for
  674          * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
  675          * of 256.
  676          */
  677         fd_mask s_selbits[howmany(2048, NFDBITS)];
  678         fd_mask *ibits[3], *obits[3], *selbits, *sbp;
  679         struct timeval atv, rtv, ttv;
  680         int error, timo;
  681         u_int ncoll, nbufbytes, ncpbytes, nfdbits;
  682 
  683         if (nd < 0)
  684                 return (EINVAL);
  685         fdp = td->td_proc->p_fd;
  686         
  687         FILEDESC_SLOCK(fdp);
  688         if (nd > td->td_proc->p_fd->fd_nfiles)
  689                 nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
  690         FILEDESC_SUNLOCK(fdp);
  691 
  692         /*
  693          * Allocate just enough bits for the non-null fd_sets.  Use the
  694          * preallocated auto buffer if possible.
  695          */
  696         nfdbits = roundup(nd, NFDBITS);
  697         ncpbytes = nfdbits / NBBY;
  698         nbufbytes = 0;
  699         if (fd_in != NULL)
  700                 nbufbytes += 2 * ncpbytes;
  701         if (fd_ou != NULL)
  702                 nbufbytes += 2 * ncpbytes;
  703         if (fd_ex != NULL)
  704                 nbufbytes += 2 * ncpbytes;
  705         if (nbufbytes <= sizeof s_selbits)
  706                 selbits = &s_selbits[0];
  707         else
  708                 selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
  709 
  710         /*
  711          * Assign pointers into the bit buffers and fetch the input bits.
  712          * Put the output buffers together so that they can be bzeroed
  713          * together.
  714          */
  715         sbp = selbits;
  716 #define getbits(name, x) \
  717         do {                                                            \
  718                 if (name == NULL)                                       \
  719                         ibits[x] = NULL;                                \
  720                 else {                                                  \
  721                         ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp;   \
  722                         obits[x] = sbp;                                 \
  723                         sbp += ncpbytes / sizeof *sbp;                  \
  724                         error = copyin(name, ibits[x], ncpbytes);       \
  725                         if (error != 0)                                 \
  726                                 goto done_nosellock;                    \
  727                 }                                                       \
  728         } while (0)
  729         getbits(fd_in, 0);
  730         getbits(fd_ou, 1);
  731         getbits(fd_ex, 2);
  732 #undef  getbits
  733         if (nbufbytes != 0)
  734                 bzero(selbits, nbufbytes / 2);
  735 
  736         if (tvp != NULL) {
  737                 atv = *tvp;
  738                 if (itimerfix(&atv)) {
  739                         error = EINVAL;
  740                         goto done_nosellock;
  741                 }
  742                 getmicrouptime(&rtv);
  743                 timevaladd(&atv, &rtv);
  744         } else {
  745                 atv.tv_sec = 0;
  746                 atv.tv_usec = 0;
  747         }
  748         timo = 0;
  749         TAILQ_INIT(&td->td_selq);
  750         mtx_lock(&sellock);
  751 retry:
  752         ncoll = nselcoll;
  753         thread_lock(td);
  754         td->td_flags |= TDF_SELECT;
  755         thread_unlock(td);
  756         mtx_unlock(&sellock);
  757 
  758         error = selscan(td, ibits, obits, nd);
  759         mtx_lock(&sellock);
  760         if (error || td->td_retval[0])
  761                 goto done;
  762         if (atv.tv_sec || atv.tv_usec) {
  763                 getmicrouptime(&rtv);
  764                 if (timevalcmp(&rtv, &atv, >=))
  765                         goto done;
  766                 ttv = atv;
  767                 timevalsub(&ttv, &rtv);
  768                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  769                     24 * 60 * 60 * hz : tvtohz(&ttv);
  770         }
  771 
  772         /*
  773          * An event of interest may occur while we do not hold
  774          * sellock, so check TDF_SELECT and the number of
  775          * collisions and rescan the file descriptors if
  776          * necessary.
  777          */
  778         thread_lock(td);
  779         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  780                 thread_unlock(td);
  781                 goto retry;
  782         }
  783         thread_unlock(td);
  784 
  785         if (timo > 0)
  786                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  787         else
  788                 error = cv_wait_sig(&selwait, &sellock);
  789         
  790         if (error == 0)
  791                 goto retry;
  792 
  793 done:
  794         clear_selinfo_list(td);
  795         thread_lock(td);
  796         td->td_flags &= ~TDF_SELECT;
  797         thread_unlock(td);
  798         mtx_unlock(&sellock);
  799 
  800 done_nosellock:
  801         /* select is not restarted after signals... */
  802         if (error == ERESTART)
  803                 error = EINTR;
  804         if (error == EWOULDBLOCK)
  805                 error = 0;
  806 #define putbits(name, x) \
  807         if (name && (error2 = copyout(obits[x], name, ncpbytes))) \
  808                 error = error2;
  809         if (error == 0) {
  810                 int error2;
  811 
  812                 putbits(fd_in, 0);
  813                 putbits(fd_ou, 1);
  814                 putbits(fd_ex, 2);
  815 #undef putbits
  816         }
  817         if (selbits != &s_selbits[0])
  818                 free(selbits, M_SELECT);
  819 
  820         return (error);
  821 }
  822 
  823 static int
  824 selscan(td, ibits, obits, nfd)
  825         struct thread *td;
  826         fd_mask **ibits, **obits;
  827         int nfd;
  828 {
  829         int msk, i, fd;
  830         fd_mask bits;
  831         struct file *fp;
  832         int n = 0;
  833         /* Note: backend also returns POLLHUP/POLLERR if appropriate. */
  834         static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
  835         struct filedesc *fdp = td->td_proc->p_fd;
  836 
  837         FILEDESC_SLOCK(fdp);
  838         for (msk = 0; msk < 3; msk++) {
  839                 if (ibits[msk] == NULL)
  840                         continue;
  841                 for (i = 0; i < nfd; i += NFDBITS) {
  842                         bits = ibits[msk][i/NFDBITS];
  843                         /* ffs(int mask) not portable, fd_mask is long */
  844                         for (fd = i; bits && fd < nfd; fd++, bits >>= 1) {
  845                                 if (!(bits & 1))
  846                                         continue;
  847                                 if ((fp = fget_locked(fdp, fd)) == NULL) {
  848                                         FILEDESC_SUNLOCK(fdp);
  849                                         return (EBADF);
  850                                 }
  851                                 if (fo_poll(fp, flag[msk], td->td_ucred,
  852                                     td)) {
  853                                         obits[msk][(fd)/NFDBITS] |=
  854                                             ((fd_mask)1 << ((fd) % NFDBITS));
  855                                         n++;
  856                                 }
  857                         }
  858                 }
  859         }
  860         FILEDESC_SUNLOCK(fdp);
  861         td->td_retval[0] = n;
  862         return (0);
  863 }
  864 
  865 #ifndef _SYS_SYSPROTO_H_
  866 struct poll_args {
  867         struct pollfd *fds;
  868         u_int   nfds;
  869         int     timeout;
  870 };
  871 #endif
  872 int
  873 poll(td, uap)
  874         struct thread *td;
  875         struct poll_args *uap;
  876 {
  877         struct pollfd *bits;
  878         struct pollfd smallbits[32];
  879         struct timeval atv, rtv, ttv;
  880         int error = 0, timo;
  881         u_int ncoll, nfds;
  882         size_t ni;
  883 
  884         nfds = uap->nfds;
  885 
  886         /*
  887          * This is kinda bogus.  We have fd limits, but that is not
  888          * really related to the size of the pollfd array.  Make sure
  889          * we let the process use at least FD_SETSIZE entries and at
  890          * least enough for the current limits.  We want to be reasonably
  891          * safe, but not overly restrictive.
  892          */
  893         PROC_LOCK(td->td_proc);
  894         if ((nfds > lim_cur(td->td_proc, RLIMIT_NOFILE)) &&
  895             (nfds > FD_SETSIZE)) {
  896                 PROC_UNLOCK(td->td_proc);
  897                 error = EINVAL;
  898                 goto done2;
  899         }
  900         PROC_UNLOCK(td->td_proc);
  901         ni = nfds * sizeof(struct pollfd);
  902         if (ni > sizeof(smallbits))
  903                 bits = malloc(ni, M_TEMP, M_WAITOK);
  904         else
  905                 bits = smallbits;
  906         error = copyin(uap->fds, bits, ni);
  907         if (error)
  908                 goto done_nosellock;
  909         if (uap->timeout != INFTIM) {
  910                 atv.tv_sec = uap->timeout / 1000;
  911                 atv.tv_usec = (uap->timeout % 1000) * 1000;
  912                 if (itimerfix(&atv)) {
  913                         error = EINVAL;
  914                         goto done_nosellock;
  915                 }
  916                 getmicrouptime(&rtv);
  917                 timevaladd(&atv, &rtv);
  918         } else {
  919                 atv.tv_sec = 0;
  920                 atv.tv_usec = 0;
  921         }
  922         timo = 0;
  923         TAILQ_INIT(&td->td_selq);
  924         mtx_lock(&sellock);
  925 retry:
  926         ncoll = nselcoll;
  927         thread_lock(td);
  928         td->td_flags |= TDF_SELECT;
  929         thread_unlock(td);
  930         mtx_unlock(&sellock);
  931 
  932         error = pollscan(td, bits, nfds);
  933         mtx_lock(&sellock);
  934         if (error || td->td_retval[0])
  935                 goto done;
  936         if (atv.tv_sec || atv.tv_usec) {
  937                 getmicrouptime(&rtv);
  938                 if (timevalcmp(&rtv, &atv, >=))
  939                         goto done;
  940                 ttv = atv;
  941                 timevalsub(&ttv, &rtv);
  942                 timo = ttv.tv_sec > 24 * 60 * 60 ?
  943                     24 * 60 * 60 * hz : tvtohz(&ttv);
  944         }
  945         /*
  946          * An event of interest may occur while we do not hold
  947          * sellock, so check TDF_SELECT and the number of collisions
  948          * and rescan the file descriptors if necessary.
  949          */
  950         thread_lock(td);
  951         if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
  952                 thread_unlock(td);
  953                 goto retry;
  954         }
  955         thread_unlock(td);
  956 
  957         if (timo > 0)
  958                 error = cv_timedwait_sig(&selwait, &sellock, timo);
  959         else
  960                 error = cv_wait_sig(&selwait, &sellock);
  961 
  962         if (error == 0)
  963                 goto retry;
  964 
  965 done:
  966         clear_selinfo_list(td);
  967         thread_lock(td);
  968         td->td_flags &= ~TDF_SELECT;
  969         thread_unlock(td);
  970         mtx_unlock(&sellock);
  971 
  972 done_nosellock:
  973         /* poll is not restarted after signals... */
  974         if (error == ERESTART)
  975                 error = EINTR;
  976         if (error == EWOULDBLOCK)
  977                 error = 0;
  978         if (error == 0) {
  979                 error = copyout(bits, uap->fds, ni);
  980                 if (error)
  981                         goto out;
  982         }
  983 out:
  984         if (ni > sizeof(smallbits))
  985                 free(bits, M_TEMP);
  986 done2:
  987         return (error);
  988 }
  989 
  990 static int
  991 pollscan(td, fds, nfd)
  992         struct thread *td;
  993         struct pollfd *fds;
  994         u_int nfd;
  995 {
  996         register struct filedesc *fdp = td->td_proc->p_fd;
  997         int i;
  998         struct file *fp;
  999         int n = 0;
 1000 
 1001         FILEDESC_SLOCK(fdp);
 1002         for (i = 0; i < nfd; i++, fds++) {
 1003                 if (fds->fd >= fdp->fd_nfiles) {
 1004                         fds->revents = POLLNVAL;
 1005                         n++;
 1006                 } else if (fds->fd < 0) {
 1007                         fds->revents = 0;
 1008                 } else {
 1009                         fp = fdp->fd_ofiles[fds->fd];
 1010                         if (fp == NULL) {
 1011                                 fds->revents = POLLNVAL;
 1012                                 n++;
 1013                         } else {
 1014                                 /*
 1015                                  * Note: backend also returns POLLHUP and
 1016                                  * POLLERR if appropriate.
 1017                                  */
 1018                                 fds->revents = fo_poll(fp, fds->events,
 1019                                     td->td_ucred, td);
 1020                                 if (fds->revents != 0)
 1021                                         n++;
 1022                         }
 1023                 }
 1024         }
 1025         FILEDESC_SUNLOCK(fdp);
 1026         td->td_retval[0] = n;
 1027         return (0);
 1028 }
 1029 
 1030 /*
 1031  * OpenBSD poll system call.
 1032  *
 1033  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1034  */
 1035 #ifndef _SYS_SYSPROTO_H_
 1036 struct openbsd_poll_args {
 1037         struct pollfd *fds;
 1038         u_int   nfds;
 1039         int     timeout;
 1040 };
 1041 #endif
 1042 int
 1043 openbsd_poll(td, uap)
 1044         register struct thread *td;
 1045         register struct openbsd_poll_args *uap;
 1046 {
 1047         return (poll(td, (struct poll_args *)uap));
 1048 }
 1049 
 1050 /*
 1051  * Remove the references to the thread from all of the objects we were
 1052  * polling.
 1053  *
 1054  * This code assumes that the underlying owner of the selinfo structure will
 1055  * hold sellock before it changes it, and that it will unlink itself from our
 1056  * list if it goes away.
 1057  */
 1058 void
 1059 clear_selinfo_list(td)
 1060         struct thread *td;
 1061 {
 1062         struct selinfo *si;
 1063 
 1064         mtx_assert(&sellock, MA_OWNED);
 1065         TAILQ_FOREACH(si, &td->td_selq, si_thrlist)
 1066                 si->si_thread = NULL;
 1067         TAILQ_INIT(&td->td_selq);
 1068 }
 1069 
 1070 /*
 1071  * Record a select request.
 1072  */
 1073 void
 1074 selrecord(selector, sip)
 1075         struct thread *selector;
 1076         struct selinfo *sip;
 1077 {
 1078 
 1079         mtx_lock(&sellock);
 1080         /*
 1081          * If the selinfo's thread pointer is NULL then take ownership of it.
 1082          *
 1083          * If the thread pointer is not NULL and it points to another
 1084          * thread, then we have a collision.
 1085          *
 1086          * If the thread pointer is not NULL and points back to us then leave
 1087          * it alone as we've already added pointed it at us and added it to
 1088          * our list.
 1089          */
 1090         if (sip->si_thread == NULL) {
 1091                 sip->si_thread = selector;
 1092                 TAILQ_INSERT_TAIL(&selector->td_selq, sip, si_thrlist);
 1093         } else if (sip->si_thread != selector) {
 1094                 sip->si_flags |= SI_COLL;
 1095         }
 1096 
 1097         mtx_unlock(&sellock);
 1098 }
 1099 
 1100 /* Wake up a selecting thread. */
 1101 void
 1102 selwakeup(sip)
 1103         struct selinfo *sip;
 1104 {
 1105         doselwakeup(sip, -1);
 1106 }
 1107 
 1108 /* Wake up a selecting thread, and set its priority. */
 1109 void
 1110 selwakeuppri(sip, pri)
 1111         struct selinfo *sip;
 1112         int pri;
 1113 {
 1114         doselwakeup(sip, pri);
 1115 }
 1116 
 1117 /*
 1118  * Do a wakeup when a selectable event occurs.
 1119  */
 1120 static void
 1121 doselwakeup(sip, pri)
 1122         struct selinfo *sip;
 1123         int pri;
 1124 {
 1125         struct thread *td;
 1126 
 1127         mtx_lock(&sellock);
 1128         td = sip->si_thread;
 1129         if ((sip->si_flags & SI_COLL) != 0) {
 1130                 nselcoll++;
 1131                 sip->si_flags &= ~SI_COLL;
 1132                 cv_broadcastpri(&selwait, pri);
 1133         }
 1134         if (td == NULL) {
 1135                 mtx_unlock(&sellock);
 1136                 return;
 1137         }
 1138         TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 1139         sip->si_thread = NULL;
 1140         thread_lock(td);
 1141         td->td_flags &= ~TDF_SELECT;
 1142         thread_unlock(td);
 1143         sleepq_remove(td, &selwait);
 1144         mtx_unlock(&sellock);
 1145 }
 1146 
 1147 static void selectinit(void *);
 1148 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL)
 1149 
 1150 /* ARGSUSED*/
 1151 static void
 1152 selectinit(dummy)
 1153         void *dummy;
 1154 {
 1155         cv_init(&selwait, "select");
 1156         mtx_init(&sellock, "sellck", NULL, MTX_DEF);
 1157 }

Cache object: 55b3daab76e5221405819bed80100915


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.