The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/sys_generic.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)sys_generic.c       8.5 (Berkeley) 1/21/94
   35  * $FreeBSD: src/sys/kern/sys_generic.c,v 1.55.2.10 2001/03/17 10:39:32 peter Exp $
   36  */
   37 
   38 #include "opt_ktrace.h"
   39 
   40 #include <sys/param.h>
   41 #include <sys/systm.h>
   42 #include <sys/sysproto.h>
   43 #include <sys/event.h>
   44 #include <sys/filedesc.h>
   45 #include <sys/filio.h>
   46 #include <sys/fcntl.h>
   47 #include <sys/file.h>
   48 #include <sys/proc.h>
   49 #include <sys/signalvar.h>
   50 #include <sys/socketvar.h>
   51 #include <sys/uio.h>
   52 #include <sys/kernel.h>
   53 #include <sys/kern_syscall.h>
   54 #include <sys/malloc.h>
   55 #include <sys/mapped_ioctl.h>
   56 #include <sys/poll.h>
   57 #include <sys/queue.h>
   58 #include <sys/resourcevar.h>
   59 #include <sys/socketops.h>
   60 #include <sys/sysctl.h>
   61 #include <sys/sysent.h>
   62 #include <sys/buf.h>
   63 #ifdef KTRACE
   64 #include <sys/ktrace.h>
   65 #endif
   66 #include <vm/vm.h>
   67 #include <vm/vm_page.h>
   68 
   69 #include <sys/file2.h>
   70 #include <sys/mplock2.h>
   71 #include <sys/spinlock2.h>
   72 
   73 #include <machine/limits.h>
   74 
   75 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
   76 static MALLOC_DEFINE(M_IOCTLMAP, "ioctlmap", "mapped ioctl handler buffer");
   77 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
   78 MALLOC_DEFINE(M_IOV, "iov", "large iov's");
   79 
   80 typedef struct kfd_set {
   81         fd_mask fds_bits[2];
   82 } kfd_set;
   83 
   84 enum select_copyin_states {
   85     COPYIN_READ, COPYIN_WRITE, COPYIN_EXCEPT, COPYIN_DONE };
   86 
   87 struct select_kevent_copyin_args {
   88         kfd_set         *read_set;
   89         kfd_set         *write_set;
   90         kfd_set         *except_set;
   91         int             active_set;     /* One of select_copyin_states */
   92         struct lwp      *lwp;           /* Pointer to our lwp */
   93         int             num_fds;        /* Number of file descriptors (syscall arg) */
   94         int             proc_fds;       /* Processed fd's (wraps) */
   95         int             error;          /* Returned to userland */
   96 };
   97 
   98 struct poll_kevent_copyin_args {
   99         struct lwp      *lwp;
  100         struct pollfd   *fds;
  101         int             nfds;
  102         int             pfds;
  103         int             error;
  104 };
  105 
  106 static struct lwkt_token mioctl_token = LWKT_TOKEN_INITIALIZER(mioctl_token);
  107 
  108 static int      doselect(int nd, fd_set *in, fd_set *ou, fd_set *ex,
  109                          struct timespec *ts, int *res);
  110 static int      dopoll(int nfds, struct pollfd *fds, struct timespec *ts,
  111                        int *res);
  112 static int      dofileread(int, struct file *, struct uio *, int, size_t *);
  113 static int      dofilewrite(int, struct file *, struct uio *, int, size_t *);
  114 
  115 /*
  116  * Read system call.
  117  *
  118  * MPSAFE
  119  */
  120 int
  121 sys_read(struct read_args *uap)
  122 {
  123         struct thread *td = curthread;
  124         struct uio auio;
  125         struct iovec aiov;
  126         int error;
  127 
  128         if ((ssize_t)uap->nbyte < 0)
  129                 error = EINVAL;
  130 
  131         aiov.iov_base = uap->buf;
  132         aiov.iov_len = uap->nbyte;
  133         auio.uio_iov = &aiov;
  134         auio.uio_iovcnt = 1;
  135         auio.uio_offset = -1;
  136         auio.uio_resid = uap->nbyte;
  137         auio.uio_rw = UIO_READ;
  138         auio.uio_segflg = UIO_USERSPACE;
  139         auio.uio_td = td;
  140 
  141         error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
  142         return(error);
  143 }
  144 
  145 /*
  146  * Positioned (Pread) read system call
  147  *
  148  * MPSAFE
  149  */
  150 int
  151 sys_extpread(struct extpread_args *uap)
  152 {
  153         struct thread *td = curthread;
  154         struct uio auio;
  155         struct iovec aiov;
  156         int error;
  157         int flags;
  158 
  159         if ((ssize_t)uap->nbyte < 0)
  160                 return(EINVAL);
  161 
  162         aiov.iov_base = uap->buf;
  163         aiov.iov_len = uap->nbyte;
  164         auio.uio_iov = &aiov;
  165         auio.uio_iovcnt = 1;
  166         auio.uio_offset = uap->offset;
  167         auio.uio_resid = uap->nbyte;
  168         auio.uio_rw = UIO_READ;
  169         auio.uio_segflg = UIO_USERSPACE;
  170         auio.uio_td = td;
  171 
  172         flags = uap->flags & O_FMASK;
  173         if (uap->offset != (off_t)-1)
  174                 flags |= O_FOFFSET;
  175 
  176         error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
  177         return(error);
  178 }
  179 
  180 /*
  181  * Scatter read system call.
  182  *
  183  * MPSAFE
  184  */
  185 int
  186 sys_readv(struct readv_args *uap)
  187 {
  188         struct thread *td = curthread;
  189         struct uio auio;
  190         struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
  191         int error;
  192 
  193         error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
  194                              &auio.uio_resid);
  195         if (error)
  196                 return (error);
  197         auio.uio_iov = iov;
  198         auio.uio_iovcnt = uap->iovcnt;
  199         auio.uio_offset = -1;
  200         auio.uio_rw = UIO_READ;
  201         auio.uio_segflg = UIO_USERSPACE;
  202         auio.uio_td = td;
  203 
  204         error = kern_preadv(uap->fd, &auio, 0, &uap->sysmsg_szresult);
  205 
  206         iovec_free(&iov, aiov);
  207         return (error);
  208 }
  209 
  210 
  211 /*
  212  * Scatter positioned read system call.
  213  *
  214  * MPSAFE
  215  */
  216 int
  217 sys_extpreadv(struct extpreadv_args *uap)
  218 {
  219         struct thread *td = curthread;
  220         struct uio auio;
  221         struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
  222         int error;
  223         int flags;
  224 
  225         error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
  226                              &auio.uio_resid);
  227         if (error)
  228                 return (error);
  229         auio.uio_iov = iov;
  230         auio.uio_iovcnt = uap->iovcnt;
  231         auio.uio_offset = uap->offset;
  232         auio.uio_rw = UIO_READ;
  233         auio.uio_segflg = UIO_USERSPACE;
  234         auio.uio_td = td;
  235 
  236         flags = uap->flags & O_FMASK;
  237         if (uap->offset != (off_t)-1)
  238                 flags |= O_FOFFSET;
  239 
  240         error = kern_preadv(uap->fd, &auio, flags, &uap->sysmsg_szresult);
  241 
  242         iovec_free(&iov, aiov);
  243         return(error);
  244 }
  245 
  246 /*
  247  * MPSAFE
  248  */
  249 int
  250 kern_preadv(int fd, struct uio *auio, int flags, size_t *res)
  251 {
  252         struct thread *td = curthread;
  253         struct proc *p = td->td_proc;
  254         struct file *fp;
  255         int error;
  256 
  257         KKASSERT(p);
  258 
  259         fp = holdfp(p->p_fd, fd, FREAD);
  260         if (fp == NULL)
  261                 return (EBADF);
  262         if (flags & O_FOFFSET && fp->f_type != DTYPE_VNODE) {
  263                 error = ESPIPE;
  264         } else {
  265                 error = dofileread(fd, fp, auio, flags, res);
  266         }
  267         fdrop(fp);
  268         return(error);
  269 }
  270 
  271 /*
  272  * Common code for readv and preadv that reads data in
  273  * from a file using the passed in uio, offset, and flags.
  274  *
  275  * MPALMOSTSAFE - ktrace needs help
  276  */
  277 static int
  278 dofileread(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
  279 {
  280         int error;
  281         size_t len;
  282 #ifdef KTRACE
  283         struct thread *td = curthread;
  284         struct iovec *ktriov = NULL;
  285         struct uio ktruio;
  286 #endif
  287 
  288 #ifdef KTRACE
  289         /*
  290          * if tracing, save a copy of iovec
  291          */
  292         if (KTRPOINT(td, KTR_GENIO))  {
  293                 int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
  294 
  295                 ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
  296                 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
  297                 ktruio = *auio;
  298         }
  299 #endif
  300         len = auio->uio_resid;
  301         error = fo_read(fp, auio, fp->f_cred, flags);
  302         if (error) {
  303                 if (auio->uio_resid != len && (error == ERESTART ||
  304                     error == EINTR || error == EWOULDBLOCK))
  305                         error = 0;
  306         }
  307 #ifdef KTRACE
  308         if (ktriov != NULL) {
  309                 if (error == 0) {
  310                         ktruio.uio_iov = ktriov;
  311                         ktruio.uio_resid = len - auio->uio_resid;
  312                         get_mplock();
  313                         ktrgenio(td->td_lwp, fd, UIO_READ, &ktruio, error);
  314                         rel_mplock();
  315                 }
  316                 kfree(ktriov, M_TEMP);
  317         }
  318 #endif
  319         if (error == 0)
  320                 *res = len - auio->uio_resid;
  321 
  322         return(error);
  323 }
  324 
  325 /*
  326  * Write system call
  327  *
  328  * MPSAFE
  329  */
  330 int
  331 sys_write(struct write_args *uap)
  332 {
  333         struct thread *td = curthread;
  334         struct uio auio;
  335         struct iovec aiov;
  336         int error;
  337 
  338         if ((ssize_t)uap->nbyte < 0)
  339                 error = EINVAL;
  340 
  341         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  342         aiov.iov_len = uap->nbyte;
  343         auio.uio_iov = &aiov;
  344         auio.uio_iovcnt = 1;
  345         auio.uio_offset = -1;
  346         auio.uio_resid = uap->nbyte;
  347         auio.uio_rw = UIO_WRITE;
  348         auio.uio_segflg = UIO_USERSPACE;
  349         auio.uio_td = td;
  350 
  351         error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);
  352 
  353         return(error);
  354 }
  355 
  356 /*
  357  * Pwrite system call
  358  *
  359  * MPSAFE
  360  */
  361 int
  362 sys_extpwrite(struct extpwrite_args *uap)
  363 {
  364         struct thread *td = curthread;
  365         struct uio auio;
  366         struct iovec aiov;
  367         int error;
  368         int flags;
  369 
  370         if ((ssize_t)uap->nbyte < 0)
  371                 error = EINVAL;
  372 
  373         aiov.iov_base = (void *)(uintptr_t)uap->buf;
  374         aiov.iov_len = uap->nbyte;
  375         auio.uio_iov = &aiov;
  376         auio.uio_iovcnt = 1;
  377         auio.uio_offset = uap->offset;
  378         auio.uio_resid = uap->nbyte;
  379         auio.uio_rw = UIO_WRITE;
  380         auio.uio_segflg = UIO_USERSPACE;
  381         auio.uio_td = td;
  382 
  383         flags = uap->flags & O_FMASK;
  384         if (uap->offset != (off_t)-1)
  385                 flags |= O_FOFFSET;
  386         error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
  387         return(error);
  388 }
  389 
  390 /*
  391  * MPSAFE
  392  */
  393 int
  394 sys_writev(struct writev_args *uap)
  395 {
  396         struct thread *td = curthread;
  397         struct uio auio;
  398         struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
  399         int error;
  400 
  401         error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
  402                              &auio.uio_resid);
  403         if (error)
  404                 return (error);
  405         auio.uio_iov = iov;
  406         auio.uio_iovcnt = uap->iovcnt;
  407         auio.uio_offset = -1;
  408         auio.uio_rw = UIO_WRITE;
  409         auio.uio_segflg = UIO_USERSPACE;
  410         auio.uio_td = td;
  411 
  412         error = kern_pwritev(uap->fd, &auio, 0, &uap->sysmsg_szresult);
  413 
  414         iovec_free(&iov, aiov);
  415         return (error);
  416 }
  417 
  418 
  419 /*
  420  * Gather positioned write system call
  421  *
  422  * MPSAFE
  423  */
  424 int
  425 sys_extpwritev(struct extpwritev_args *uap)
  426 {
  427         struct thread *td = curthread;
  428         struct uio auio;
  429         struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
  430         int error;
  431         int flags;
  432 
  433         error = iovec_copyin(uap->iovp, &iov, aiov, uap->iovcnt,
  434                              &auio.uio_resid);
  435         if (error)
  436                 return (error);
  437         auio.uio_iov = iov;
  438         auio.uio_iovcnt = uap->iovcnt;
  439         auio.uio_offset = uap->offset;
  440         auio.uio_rw = UIO_WRITE;
  441         auio.uio_segflg = UIO_USERSPACE;
  442         auio.uio_td = td;
  443 
  444         flags = uap->flags & O_FMASK;
  445         if (uap->offset != (off_t)-1)
  446                 flags |= O_FOFFSET;
  447 
  448         error = kern_pwritev(uap->fd, &auio, flags, &uap->sysmsg_szresult);
  449 
  450         iovec_free(&iov, aiov);
  451         return(error);
  452 }
  453 
  454 /*
  455  * MPSAFE
  456  */
  457 int
  458 kern_pwritev(int fd, struct uio *auio, int flags, size_t *res)
  459 {
  460         struct thread *td = curthread;
  461         struct proc *p = td->td_proc;
  462         struct file *fp;
  463         int error;
  464 
  465         KKASSERT(p);
  466 
  467         fp = holdfp(p->p_fd, fd, FWRITE);
  468         if (fp == NULL)
  469                 return (EBADF);
  470         else if ((flags & O_FOFFSET) && fp->f_type != DTYPE_VNODE) {
  471                 error = ESPIPE;
  472         } else {
  473                 error = dofilewrite(fd, fp, auio, flags, res);
  474         }
  475         
  476         fdrop(fp);
  477         return (error);
  478 }
  479 
  480 /*
  481  * Common code for writev and pwritev that writes data to
  482  * a file using the passed in uio, offset, and flags.
  483  *
  484  * MPALMOSTSAFE - ktrace needs help
  485  */
  486 static int
  487 dofilewrite(int fd, struct file *fp, struct uio *auio, int flags, size_t *res)
  488 {       
  489         struct thread *td = curthread;
  490         struct lwp *lp = td->td_lwp;
  491         int error;
  492         size_t len;
  493 #ifdef KTRACE
  494         struct iovec *ktriov = NULL;
  495         struct uio ktruio;
  496 #endif
  497 
  498 #ifdef KTRACE
  499         /*
  500          * if tracing, save a copy of iovec and uio
  501          */
  502         if (KTRPOINT(td, KTR_GENIO))  {
  503                 int iovlen = auio->uio_iovcnt * sizeof(struct iovec);
  504 
  505                 ktriov = kmalloc(iovlen, M_TEMP, M_WAITOK);
  506                 bcopy((caddr_t)auio->uio_iov, (caddr_t)ktriov, iovlen);
  507                 ktruio = *auio;
  508         }
  509 #endif
  510         len = auio->uio_resid;
  511         error = fo_write(fp, auio, fp->f_cred, flags);
  512         if (error) {
  513                 if (auio->uio_resid != len && (error == ERESTART ||
  514                     error == EINTR || error == EWOULDBLOCK))
  515                         error = 0;
  516                 /* Socket layer is responsible for issuing SIGPIPE. */
  517                 if (error == EPIPE && fp->f_type != DTYPE_SOCKET)
  518                         lwpsignal(lp->lwp_proc, lp, SIGPIPE);
  519         }
  520 #ifdef KTRACE
  521         if (ktriov != NULL) {
  522                 if (error == 0) {
  523                         ktruio.uio_iov = ktriov;
  524                         ktruio.uio_resid = len - auio->uio_resid;
  525                         get_mplock();
  526                         ktrgenio(lp, fd, UIO_WRITE, &ktruio, error);
  527                         rel_mplock();
  528                 }
  529                 kfree(ktriov, M_TEMP);
  530         }
  531 #endif
  532         if (error == 0)
  533                 *res = len - auio->uio_resid;
  534 
  535         return(error);
  536 }
  537 
  538 /*
  539  * Ioctl system call
  540  *
  541  * MPSAFE
  542  */
  543 int
  544 sys_ioctl(struct ioctl_args *uap)
  545 {
  546         int error;
  547 
  548         error = mapped_ioctl(uap->fd, uap->com, uap->data, NULL, &uap->sysmsg);
  549         return (error);
  550 }
  551 
  552 struct ioctl_map_entry {
  553         const char *subsys;
  554         struct ioctl_map_range *cmd_ranges;
  555         LIST_ENTRY(ioctl_map_entry) entries;
  556 };
  557 
  558 /*
  559  * The true heart of all ioctl syscall handlers (native, emulation).
  560  * If map != NULL, it will be searched for a matching entry for com,
  561  * and appropriate conversions/conversion functions will be utilized.
  562  *
  563  * MPSAFE
  564  */
  565 int
  566 mapped_ioctl(int fd, u_long com, caddr_t uspc_data, struct ioctl_map *map,
  567              struct sysmsg *msg)
  568 {
  569         struct thread *td = curthread;
  570         struct proc *p = td->td_proc;
  571         struct ucred *cred;
  572         struct file *fp;
  573         struct ioctl_map_range *iomc = NULL;
  574         int error;
  575         u_int size;
  576         u_long ocom = com;
  577         caddr_t data, memp;
  578         int tmp;
  579 #define STK_PARAMS      128
  580         union {
  581             char stkbuf[STK_PARAMS];
  582             long align;
  583         } ubuf;
  584 
  585         KKASSERT(p);
  586         cred = td->td_ucred;
  587 
  588         fp = holdfp(p->p_fd, fd, FREAD|FWRITE);
  589         if (fp == NULL)
  590                 return(EBADF);
  591 
  592         if (map != NULL) {      /* obey translation map */
  593                 u_long maskcmd;
  594                 struct ioctl_map_entry *e;
  595 
  596                 maskcmd = com & map->mask;
  597 
  598                 lwkt_gettoken(&mioctl_token);
  599                 LIST_FOREACH(e, &map->mapping, entries) {
  600                         for (iomc = e->cmd_ranges; iomc->start != 0 ||
  601                              iomc->maptocmd != 0 || iomc->wrapfunc != NULL ||
  602                              iomc->mapfunc != NULL;
  603                              iomc++) {
  604                                 if (maskcmd >= iomc->start &&
  605                                     maskcmd <= iomc->end)
  606                                         break;
  607                         }
  608 
  609                         /* Did we find a match? */
  610                         if (iomc->start != 0 || iomc->maptocmd != 0 ||
  611                             iomc->wrapfunc != NULL || iomc->mapfunc != NULL)
  612                                 break;
  613                 }
  614                 lwkt_reltoken(&mioctl_token);
  615 
  616                 if (iomc == NULL ||
  617                     (iomc->start == 0 && iomc->maptocmd == 0
  618                      && iomc->wrapfunc == NULL && iomc->mapfunc == NULL)) {
  619                         kprintf("%s: 'ioctl' fd=%d, cmd=0x%lx ('%c',%d) not implemented\n",
  620                                map->sys, fd, maskcmd,
  621                                (int)((maskcmd >> 8) & 0xff),
  622                                (int)(maskcmd & 0xff));
  623                         error = EINVAL;
  624                         goto done;
  625                 }
  626 
  627                 /*
  628                  * If it's a non-range one to one mapping, maptocmd should be
  629                  * correct. If it's a ranged one to one mapping, we pass the
  630                  * original value of com, and for a range mapped to a different
  631                  * range, we always need a mapping function to translate the
  632                  * ioctl to our native ioctl. Ex. 6500-65ff <-> 9500-95ff
  633                  */
  634                 if (iomc->start == iomc->end && iomc->maptocmd == iomc->maptoend) {
  635                         com = iomc->maptocmd;
  636                 } else if (iomc->start == iomc->maptocmd && iomc->end == iomc->maptoend) {
  637                         if (iomc->mapfunc != NULL)
  638                                 com = iomc->mapfunc(iomc->start, iomc->end,
  639                                                     iomc->start, iomc->end,
  640                                                     com, com);
  641                 } else {
  642                         if (iomc->mapfunc != NULL) {
  643                                 com = iomc->mapfunc(iomc->start, iomc->end,
  644                                                     iomc->maptocmd, iomc->maptoend,
  645                                                     com, ocom);
  646                         } else {
  647                                 kprintf("%s: Invalid mapping for fd=%d, cmd=%#lx ('%c',%d)\n",
  648                                        map->sys, fd, maskcmd,
  649                                        (int)((maskcmd >> 8) & 0xff),
  650                                        (int)(maskcmd & 0xff));
  651                                 error = EINVAL;
  652                                 goto done;
  653                         }
  654                 }
  655         }
  656 
  657         switch (com) {
  658         case FIONCLEX:
  659                 error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
  660                 goto done;
  661         case FIOCLEX:
  662                 error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
  663                 goto done;
  664         }
  665 
  666         /*
  667          * Interpret high order word to find amount of data to be
  668          * copied to/from the user's address space.
  669          */
  670         size = IOCPARM_LEN(com);
  671         if (size > IOCPARM_MAX) {
  672                 error = ENOTTY;
  673                 goto done;
  674         }
  675 
  676         if (size > sizeof (ubuf.stkbuf)) {
  677                 memp = kmalloc(size, M_IOCTLOPS, M_WAITOK);
  678                 data = memp;
  679         } else {
  680                 memp = NULL;
  681                 data = ubuf.stkbuf;
  682         }
  683         if ((com & IOC_IN) != 0) {
  684                 if (size != 0) {
  685                         error = copyin(uspc_data, data, (size_t)size);
  686                         if (error) {
  687                                 if (memp != NULL)
  688                                         kfree(memp, M_IOCTLOPS);
  689                                 goto done;
  690                         }
  691                 } else {
  692                         *(caddr_t *)data = uspc_data;
  693                 }
  694         } else if ((com & IOC_OUT) != 0 && size) {
  695                 /*
  696                  * Zero the buffer so the user always
  697                  * gets back something deterministic.
  698                  */
  699                 bzero(data, (size_t)size);
  700         } else if ((com & IOC_VOID) != 0) {
  701                 *(caddr_t *)data = uspc_data;
  702         }
  703 
  704         switch (com) {
  705         case FIONBIO:
  706                 if ((tmp = *(int *)data))
  707                         atomic_set_int(&fp->f_flag, FNONBLOCK);
  708                 else
  709                         atomic_clear_int(&fp->f_flag, FNONBLOCK);
  710                 error = 0;
  711                 break;
  712 
  713         case FIOASYNC:
  714                 if ((tmp = *(int *)data))
  715                         atomic_set_int(&fp->f_flag, FASYNC);
  716                 else
  717                         atomic_clear_int(&fp->f_flag, FASYNC);
  718                 error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred, msg);
  719                 break;
  720 
  721         default:
  722                 /*
  723                  *  If there is a override function,
  724                  *  call it instead of directly routing the call
  725                  */
  726                 if (map != NULL && iomc->wrapfunc != NULL)
  727                         error = iomc->wrapfunc(fp, com, ocom, data, cred);
  728                 else
  729                         error = fo_ioctl(fp, com, data, cred, msg);
  730                 /*
  731                  * Copy any data to user, size was
  732                  * already set and checked above.
  733                  */
  734                 if (error == 0 && (com & IOC_OUT) != 0 && size != 0)
  735                         error = copyout(data, uspc_data, (size_t)size);
  736                 break;
  737         }
  738         if (memp != NULL)
  739                 kfree(memp, M_IOCTLOPS);
  740 done:
  741         fdrop(fp);
  742         return(error);
  743 }
  744 
  745 /*
  746  * MPSAFE
  747  */
  748 int
  749 mapped_ioctl_register_handler(struct ioctl_map_handler *he)
  750 {
  751         struct ioctl_map_entry *ne;
  752 
  753         KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL &&
  754                  he->subsys != NULL && *he->subsys != '\0');
  755 
  756         ne = kmalloc(sizeof(struct ioctl_map_entry), M_IOCTLMAP,
  757                      M_WAITOK | M_ZERO);
  758 
  759         ne->subsys = he->subsys;
  760         ne->cmd_ranges = he->cmd_ranges;
  761 
  762         lwkt_gettoken(&mioctl_token);
  763         LIST_INSERT_HEAD(&he->map->mapping, ne, entries);
  764         lwkt_reltoken(&mioctl_token);
  765 
  766         return(0);
  767 }
  768 
  769 /*
  770  * MPSAFE
  771  */
  772 int
  773 mapped_ioctl_unregister_handler(struct ioctl_map_handler *he)
  774 {
  775         struct ioctl_map_entry *ne;
  776         int error = EINVAL;
  777 
  778         KKASSERT(he != NULL && he->map != NULL && he->cmd_ranges != NULL);
  779 
  780         lwkt_gettoken(&mioctl_token);
  781         LIST_FOREACH(ne, &he->map->mapping, entries) {
  782                 if (ne->cmd_ranges == he->cmd_ranges) {
  783                         LIST_REMOVE(ne, entries);
  784                         kfree(ne, M_IOCTLMAP);
  785                         error = 0;
  786                         break;
  787                 }
  788         }
  789         lwkt_reltoken(&mioctl_token);
  790         return(error);
  791 }
  792 
  793 static int      nselcoll;       /* Select collisions since boot */
  794 int     selwait;
  795 SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
  796 static int      nseldebug;
  797 SYSCTL_INT(_kern, OID_AUTO, nseldebug, CTLFLAG_RW, &nseldebug, 0, "");
  798 
  799 /*
  800  * Select system call.
  801  *
  802  * MPSAFE
  803  */
  804 int
  805 sys_select(struct select_args *uap)
  806 {
  807         struct timeval ktv;
  808         struct timespec *ktsp, kts;
  809         int error;
  810 
  811         /*
  812          * Get timeout if any.
  813          */
  814         if (uap->tv != NULL) {
  815                 error = copyin(uap->tv, &ktv, sizeof (ktv));
  816                 if (error)
  817                         return (error);
  818                 TIMEVAL_TO_TIMESPEC(&ktv, &kts);
  819                 ktsp = &kts;
  820         } else {
  821                 ktsp = NULL;
  822         }
  823 
  824         /*
  825          * Do real work.
  826          */
  827         error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
  828                          &uap->sysmsg_result);
  829 
  830         return (error);
  831 }
  832 
  833 
  834 /*
  835  * Pselect system call.
  836  */
  837 int
  838 sys_pselect(struct pselect_args *uap)
  839 {
  840         struct thread *td = curthread;
  841         struct lwp *lp = td->td_lwp;
  842         struct timespec *ktsp, kts;
  843         sigset_t sigmask;
  844         int error;
  845 
  846         /*
  847          * Get timeout if any.
  848          */
  849         if (uap->ts != NULL) {
  850                 error = copyin(uap->ts, &kts, sizeof (kts));
  851                 if (error)
  852                         return (error);
  853                 ktsp = &kts;
  854         } else {
  855                 ktsp = NULL;
  856         }
  857 
  858         /*
  859          * Install temporary signal mask if any provided.
  860          */
  861         if (uap->sigmask != NULL) {
  862                 error = copyin(uap->sigmask, &sigmask, sizeof(sigmask));
  863                 if (error)
  864                         return (error);
  865                 lwkt_gettoken(&lp->lwp_proc->p_token);
  866                 lp->lwp_oldsigmask = lp->lwp_sigmask;
  867                 SIG_CANTMASK(sigmask);
  868                 lp->lwp_sigmask = sigmask;
  869                 lwkt_reltoken(&lp->lwp_proc->p_token);
  870         }
  871 
  872         /*
  873          * Do real job.
  874          */
  875         error = doselect(uap->nd, uap->in, uap->ou, uap->ex, ktsp,
  876                          &uap->sysmsg_result);
  877 
  878         if (uap->sigmask != NULL) {
  879                 lwkt_gettoken(&lp->lwp_proc->p_token);
  880                 /* doselect() responsible for turning ERESTART into EINTR */
  881                 KKASSERT(error != ERESTART);
  882                 if (error == EINTR) {
  883                         /*
  884                          * We can't restore the previous signal mask now
  885                          * because it could block the signal that interrupted
  886                          * us.  So make a note to restore it after executing
  887                          * the handler.
  888                          */
  889                         lp->lwp_flags |= LWP_OLDMASK;
  890                 } else {
  891                         /*
  892                          * No handler to run. Restore previous mask immediately.
  893                          */
  894                         lp->lwp_sigmask = lp->lwp_oldsigmask;
  895                 }
  896                 lwkt_reltoken(&lp->lwp_proc->p_token);
  897         }
  898 
  899         return (error);
  900 }
  901 
  902 static int
  903 select_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
  904 {
  905         struct select_kevent_copyin_args *skap = NULL;
  906         struct kevent *kev;
  907         int fd;
  908         kfd_set *fdp = NULL;
  909         short filter = 0;
  910         u_int fflags = 0;
  911 
  912         skap = (struct select_kevent_copyin_args *)arg;
  913 
  914         if (*events == maxevents)
  915                 return (0);
  916 
  917         while (skap->active_set < COPYIN_DONE) {
  918                 switch (skap->active_set) {
  919                 case COPYIN_READ:
  920                         /*
  921                          * Register descriptors for the read filter
  922                          */
  923                         fdp = skap->read_set;
  924                         filter = EVFILT_READ;
  925                         fflags = NOTE_OLDAPI;
  926                         if (fdp)
  927                                 break;
  928                         ++skap->active_set;
  929                         skap->proc_fds = 0;
  930                         /* fall through */
  931                 case COPYIN_WRITE:
  932                         /*
  933                          * Register descriptors for the write filter
  934                          */
  935                         fdp = skap->write_set;
  936                         filter = EVFILT_WRITE;
  937                         fflags = NOTE_OLDAPI;
  938                         if (fdp)
  939                                 break;
  940                         ++skap->active_set;
  941                         skap->proc_fds = 0;
  942                         /* fall through */
  943                 case COPYIN_EXCEPT:
  944                         /*
  945                          * Register descriptors for the exception filter
  946                          */
  947                         fdp = skap->except_set;
  948                         filter = EVFILT_EXCEPT;
  949                         fflags = NOTE_OLDAPI | NOTE_OOB;
  950                         if (fdp)
  951                                 break;
  952                         ++skap->active_set;
  953                         skap->proc_fds = 0;
  954                         /* fall through */
  955                 case COPYIN_DONE:
  956                         /*
  957                          * Nothing left to register
  958                          */
  959                         return(0);
  960                         /* NOT REACHED */
  961                 }
  962 
  963                 while (skap->proc_fds < skap->num_fds) {
  964                         fd = skap->proc_fds;
  965                         if (FD_ISSET(fd, fdp)) {
  966                                 kev = &kevp[*events];
  967                                 EV_SET(kev, fd, filter,
  968                                        EV_ADD|EV_ENABLE,
  969                                        fflags, 0,
  970                                        (void *)(uintptr_t)
  971                                         skap->lwp->lwp_kqueue_serial);
  972                                 FD_CLR(fd, fdp);
  973                                 ++*events;
  974 
  975                                 if (nseldebug)
  976                                         kprintf("select fd %d filter %d serial %d\n",
  977                                                 fd, filter, skap->lwp->lwp_kqueue_serial);
  978                         }
  979                         ++skap->proc_fds;
  980                         if (*events == maxevents)
  981                                 return (0);
  982                 }
  983                 skap->active_set++;
  984                 skap->proc_fds = 0;
  985         }
  986 
  987         return (0);
  988 }
  989 
  990 static int
  991 select_copyout(void *arg, struct kevent *kevp, int count, int *res)
  992 {
  993         struct select_kevent_copyin_args *skap;
  994         struct kevent kev;
  995         int i = 0;
  996 
  997         skap = (struct select_kevent_copyin_args *)arg;
  998 
  999         for (i = 0; i < count; ++i) {
 1000                 /*
 1001                  * Filter out and delete spurious events
 1002                  */
 1003                 if ((u_int)(uintptr_t)kevp[i].udata !=
 1004                     skap->lwp->lwp_kqueue_serial) {
 1005                         kev = kevp[i];
 1006                         kev.flags = EV_DISABLE|EV_DELETE;
 1007                         kqueue_register(&skap->lwp->lwp_kqueue, &kev);
 1008                         if (nseldebug)
 1009                                 kprintf("select fd %ju mismatched serial %d\n",
 1010                                         (uintmax_t)kevp[i].ident,
 1011                                         skap->lwp->lwp_kqueue_serial);
 1012                         continue;
 1013                 }
 1014 
 1015                 /*
 1016                  * Handle errors
 1017                  */
 1018                 if (kevp[i].flags & EV_ERROR) {
 1019                         int error = kevp[i].data;
 1020 
 1021                         switch (error) {
 1022                         case EBADF:
 1023                                 /*
 1024                                  * A bad file descriptor is considered a
 1025                                  * fatal error for select, bail out.
 1026                                  */
 1027                                 skap->error = error;
 1028                                 *res = -1;
 1029                                 return error;
 1030 
 1031                         default:
 1032                                 /*
 1033                                  * Select silently swallows any unknown errors
 1034                                  * for descriptors in the read or write sets.
 1035                                  *
 1036                                  * ALWAYS filter out EOPNOTSUPP errors from
 1037                                  * filters (at least until all filters support
 1038                                  * EVFILT_EXCEPT)
 1039                                  *
 1040                                  * We also filter out ENODEV since dev_dkqfilter
 1041                                  * returns ENODEV if EOPNOTSUPP is returned in an
 1042                                  * inner call.
 1043                                  *
 1044                                  * XXX: fix this
 1045                                  */
 1046                                 if (kevp[i].filter != EVFILT_READ &&
 1047                                     kevp[i].filter != EVFILT_WRITE &&
 1048                                     error != EOPNOTSUPP &&
 1049                                     error != ENODEV) {
 1050                                         skap->error = error;
 1051                                         *res = -1;
 1052                                         return error;
 1053                                 }
 1054                                 break;
 1055                         }
 1056                         if (nseldebug)
 1057                                 kprintf("select fd %ju filter %d error %d\n",
 1058                                         (uintmax_t)kevp[i].ident,
 1059                                         kevp[i].filter, error);
 1060                         continue;
 1061                 }
 1062 
 1063                 switch (kevp[i].filter) {
 1064                 case EVFILT_READ:
 1065                         FD_SET(kevp[i].ident, skap->read_set);
 1066                         break;
 1067                 case EVFILT_WRITE:
 1068                         FD_SET(kevp[i].ident, skap->write_set);
 1069                         break;
 1070                 case EVFILT_EXCEPT:
 1071                         FD_SET(kevp[i].ident, skap->except_set);
 1072                         break;
 1073                 }
 1074 
 1075                 ++*res;
 1076         }
 1077 
 1078         return (0);
 1079 }
 1080 
 1081 /*
 1082  * Copy select bits in from userland.  Allocate kernel memory if the
 1083  * set is large.
 1084  */
 1085 static int
 1086 getbits(int bytes, fd_set *in_set, kfd_set **out_set, kfd_set *tmp_set)
 1087 {
 1088         int error;
 1089 
 1090         if (in_set) {
 1091                 if (bytes < sizeof(*tmp_set))
 1092                         *out_set = tmp_set;
 1093                 else
 1094                         *out_set = kmalloc(bytes, M_SELECT, M_WAITOK);
 1095                 error = copyin(in_set, *out_set, bytes);
 1096         } else {
 1097                 *out_set = NULL;
 1098                 error = 0;
 1099         }
 1100         return (error);
 1101 }
 1102 
 1103 /*
 1104  * Copy returned select bits back out to userland.
 1105  */
 1106 static int
 1107 putbits(int bytes, kfd_set *in_set, fd_set *out_set)
 1108 {
 1109         int error;
 1110 
 1111         if (in_set) {
 1112                 error = copyout(in_set, out_set, bytes);
 1113         } else {
 1114                 error = 0;
 1115         }
 1116         return (error);
 1117 }
 1118 
 1119 static int
 1120 dotimeout_only(struct timespec *ts)
 1121 {
 1122         return(nanosleep1(ts, NULL));
 1123 }
 1124 
 1125 /*
 1126  * Common code for sys_select() and sys_pselect().
 1127  *
 1128  * in, out and ex are userland pointers.  ts must point to validated
 1129  * kernel-side timeout value or NULL for infinite timeout.  res must
 1130  * point to syscall return value.
 1131  */
 1132 static int
 1133 doselect(int nd, fd_set *read, fd_set *write, fd_set *except,
 1134          struct timespec *ts, int *res)
 1135 {
 1136         struct proc *p = curproc;
 1137         struct select_kevent_copyin_args *kap, ka;
 1138         int bytes, error;
 1139         kfd_set read_tmp;
 1140         kfd_set write_tmp;
 1141         kfd_set except_tmp;
 1142 
 1143         *res = 0;
 1144         if (nd < 0)
 1145                 return (EINVAL);
 1146         if (nd == 0 && ts)
 1147                 return (dotimeout_only(ts));
 1148 
 1149         if (nd > p->p_fd->fd_nfiles)            /* limit kmalloc */
 1150                 nd = p->p_fd->fd_nfiles;
 1151 
 1152         kap = &ka;
 1153         kap->lwp = curthread->td_lwp;
 1154         kap->num_fds = nd;
 1155         kap->proc_fds = 0;
 1156         kap->error = 0;
 1157         kap->active_set = COPYIN_READ;
 1158 
 1159         /*
 1160          * Calculate bytes based on the number of __fd_mask[] array entries
 1161          * multiplied by the size of __fd_mask.
 1162          */
 1163         bytes = howmany(nd, __NFDBITS) * sizeof(__fd_mask);
 1164 
 1165         /* kap->read_set = NULL; not needed */
 1166         kap->write_set = NULL;
 1167         kap->except_set = NULL;
 1168 
 1169         error = getbits(bytes, read, &kap->read_set, &read_tmp);
 1170         if (error == 0)
 1171                 error = getbits(bytes, write, &kap->write_set, &write_tmp);
 1172         if (error == 0)
 1173                 error = getbits(bytes, except, &kap->except_set, &except_tmp);
 1174         if (error)
 1175                 goto done;
 1176 
 1177         /*
 1178          * NOTE: Make sure the max events passed to kern_kevent() is
 1179          *       effectively unlimited.  (nd * 3) accomplishes this.
 1180          *
 1181          *       (*res) continues to increment as returned events are
 1182          *       loaded in.
 1183          */
 1184         error = kern_kevent(&kap->lwp->lwp_kqueue, 0x7FFFFFFF, res, kap,
 1185                             select_copyin, select_copyout, ts);
 1186         if (error == 0)
 1187                 error = putbits(bytes, kap->read_set, read);
 1188         if (error == 0)
 1189                 error = putbits(bytes, kap->write_set, write);
 1190         if (error == 0)
 1191                 error = putbits(bytes, kap->except_set, except);
 1192 
 1193         /*
 1194          * An error from an individual event that should be passed
 1195          * back to userland (EBADF)
 1196          */
 1197         if (kap->error)
 1198                 error = kap->error;
 1199 
 1200         /*
 1201          * Clean up.
 1202          */
 1203 done:
 1204         if (kap->read_set && kap->read_set != &read_tmp)
 1205                 kfree(kap->read_set, M_SELECT);
 1206         if (kap->write_set && kap->write_set != &write_tmp)
 1207                 kfree(kap->write_set, M_SELECT);
 1208         if (kap->except_set && kap->except_set != &except_tmp)
 1209                 kfree(kap->except_set, M_SELECT);
 1210 
 1211         kap->lwp->lwp_kqueue_serial += kap->num_fds;
 1212 
 1213         return (error);
 1214 }
 1215 
 1216 /*
 1217  * Poll system call.
 1218  *
 1219  * MPSAFE
 1220  */
 1221 int
 1222 sys_poll(struct poll_args *uap)
 1223 {
 1224         struct timespec ts, *tsp;
 1225         int error;
 1226 
 1227         if (uap->timeout != INFTIM) {
 1228                 ts.tv_sec = uap->timeout / 1000;
 1229                 ts.tv_nsec = (uap->timeout % 1000) * 1000 * 1000;
 1230                 tsp = &ts;
 1231         } else {
 1232                 tsp = NULL;
 1233         }
 1234 
 1235         error = dopoll(uap->nfds, uap->fds, tsp, &uap->sysmsg_result);
 1236 
 1237         return (error);
 1238 }
 1239 
 1240 static int
 1241 poll_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
 1242 {
 1243         struct poll_kevent_copyin_args *pkap;
 1244         struct pollfd *pfd;
 1245         struct kevent *kev;
 1246         int kev_count;
 1247 
 1248         pkap = (struct poll_kevent_copyin_args *)arg;
 1249 
 1250         while (pkap->pfds < pkap->nfds) {
 1251                 pfd = &pkap->fds[pkap->pfds];
 1252 
 1253                 /* Clear return events */
 1254                 pfd->revents = 0;
 1255 
 1256                 /* Do not check if fd is equal to -1 */
 1257                 if (pfd->fd == -1) {
 1258                         ++pkap->pfds;
 1259                         continue;
 1260                 }
 1261 
 1262                 kev_count = 0;
 1263                 if (pfd->events & (POLLIN | POLLRDNORM))
 1264                         kev_count++;
 1265                 if (pfd->events & (POLLOUT | POLLWRNORM))
 1266                         kev_count++;
 1267                 if (pfd->events & (POLLPRI | POLLRDBAND))
 1268                         kev_count++;
 1269 
 1270                 if (*events + kev_count > maxevents)
 1271                         return (0);
 1272 
 1273                 /*
 1274                  * NOTE: A combined serial number and poll array index is
 1275                  * stored in kev->udata.
 1276                  */
 1277                 kev = &kevp[*events];
 1278                 if (pfd->events & (POLLIN | POLLRDNORM)) {
 1279                         EV_SET(kev++, pfd->fd, EVFILT_READ, EV_ADD|EV_ENABLE,
 1280                                NOTE_OLDAPI, 0, (void *)(uintptr_t)
 1281                                 (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
 1282                 }
 1283                 if (pfd->events & (POLLOUT | POLLWRNORM)) {
 1284                         EV_SET(kev++, pfd->fd, EVFILT_WRITE, EV_ADD|EV_ENABLE,
 1285                                NOTE_OLDAPI, 0, (void *)(uintptr_t)
 1286                                 (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
 1287                 }
 1288                 if (pfd->events & (POLLPRI | POLLRDBAND)) {
 1289                         EV_SET(kev++, pfd->fd, EVFILT_EXCEPT, EV_ADD|EV_ENABLE,
 1290                                NOTE_OLDAPI | NOTE_OOB, 0,
 1291                                (void *)(uintptr_t)
 1292                                 (pkap->lwp->lwp_kqueue_serial + pkap->pfds));
 1293                 }
 1294 
 1295                 if (nseldebug) {
 1296                         kprintf("poll index %d/%d fd %d events %08x serial %d\n",
 1297                                 pkap->pfds, pkap->nfds-1, pfd->fd, pfd->events,
 1298                                 pkap->lwp->lwp_kqueue_serial);
 1299                 }
 1300 
 1301                 ++pkap->pfds;
 1302                 (*events) += kev_count;
 1303         }
 1304 
 1305         return (0);
 1306 }
 1307 
 1308 static int
 1309 poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
 1310 {
 1311         struct poll_kevent_copyin_args *pkap;
 1312         struct pollfd *pfd;
 1313         struct kevent kev;
 1314         int count_res;
 1315         int i;
 1316         u_int pi;
 1317 
 1318         pkap = (struct poll_kevent_copyin_args *)arg;
 1319 
 1320         for (i = 0; i < count; ++i) {
 1321                 /*
 1322                  * Extract the poll array index and delete spurious events.
 1323                  * We can easily tell if the serial number is incorrect
 1324                  * by checking whether the extracted index is out of range.
 1325                  */
 1326                 pi = (u_int)(uintptr_t)kevp[i].udata -
 1327                      (u_int)pkap->lwp->lwp_kqueue_serial;
 1328 
 1329                 if (pi >= pkap->nfds) {
 1330                         kev = kevp[i];
 1331                         kev.flags = EV_DISABLE|EV_DELETE;
 1332                         kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
 1333                         if (nseldebug)
 1334                                 kprintf("poll index %d out of range against serial %d\n",
 1335                                         pi, pkap->lwp->lwp_kqueue_serial);
 1336                         continue;
 1337                 }
 1338                 pfd = &pkap->fds[pi];
 1339                 if (kevp[i].ident == pfd->fd) {
 1340                         /*
 1341                          * A single descriptor may generate an error against
 1342                          * more than one filter, make sure to set the
 1343                          * appropriate flags but do not increment (*res)
 1344                          * more than once.
 1345                          */
 1346                         count_res = (pfd->revents == 0);
 1347                         if (kevp[i].flags & EV_ERROR) {
 1348                                 switch(kevp[i].data) {
 1349                                 case EBADF:
 1350                                 case POLLNVAL:
 1351                                         /* Bad file descriptor */
 1352                                         if (count_res)
 1353                                                 ++*res;
 1354                                         pfd->revents |= POLLNVAL;
 1355                                         break;
 1356                                 default:
 1357                                         /*
 1358                                          * Poll silently swallows any unknown
 1359                                          * errors except in the case of POLLPRI
 1360                                          * (OOB/urgent data).
 1361                                          *
 1362                                          * ALWAYS filter out EOPNOTSUPP errors
 1363                                          * from filters, common applications
 1364                                          * set POLLPRI|POLLRDBAND and most
 1365                                          * filters do not support EVFILT_EXCEPT.
 1366                                          *
 1367                                          * We also filter out ENODEV since dev_dkqfilter
 1368                                          * returns ENODEV if EOPNOTSUPP is returned in an
 1369                                          * inner call.
 1370                                          *
 1371                                          * XXX: fix this
 1372                                          */
 1373                                         if (kevp[i].filter != EVFILT_READ &&
 1374                                             kevp[i].filter != EVFILT_WRITE &&
 1375                                             kevp[i].data != EOPNOTSUPP &&
 1376                                             kevp[i].data != ENODEV) {
 1377                                                 if (count_res == 0)
 1378                                                         ++*res;
 1379                                                 pfd->revents |= POLLERR;
 1380                                         }
 1381                                         break;
 1382                                 }
 1383                                 if (nseldebug) {
 1384                                         kprintf("poll index %d fd %d "
 1385                                                 "filter %d error %jd\n",
 1386                                                 pi, pfd->fd,
 1387                                                 kevp[i].filter,
 1388                                                 (intmax_t)kevp[i].data);
 1389                                 }
 1390                                 continue;
 1391                         }
 1392 
 1393                         switch (kevp[i].filter) {
 1394                         case EVFILT_READ:
 1395 #if 0
 1396                                 /*
 1397                                  * NODATA on the read side can indicate a
 1398                                  * half-closed situation and not necessarily
 1399                                  * a disconnect, so depend on the user
 1400                                  * issuing a read() and getting 0 bytes back.
 1401                                  */
 1402                                 if (kevp[i].flags & EV_NODATA)
 1403                                         pfd->revents |= POLLHUP;
 1404 #endif
 1405                                 if ((kevp[i].flags & EV_EOF) &&
 1406                                     kevp[i].fflags != 0)
 1407                                         pfd->revents |= POLLERR;
 1408                                 if (pfd->events & POLLIN)
 1409                                         pfd->revents |= POLLIN;
 1410                                 if (pfd->events & POLLRDNORM)
 1411                                         pfd->revents |= POLLRDNORM;
 1412                                 break;
 1413                         case EVFILT_WRITE:
 1414                                 /*
 1415                                  * As per the OpenGroup POLLHUP is mutually
 1416                                  * exclusive with the writability flags.  I
 1417                                  * consider this a bit broken but...
 1418                                  *
 1419                                  * In this case a disconnect is implied even
 1420                                  * for a half-closed (write side) situation.
 1421                                  */
 1422                                 if (kevp[i].flags & EV_EOF) {
 1423                                         pfd->revents |= POLLHUP;
 1424                                         if (kevp[i].fflags != 0)
 1425                                                 pfd->revents |= POLLERR;
 1426                                 } else {
 1427                                         if (pfd->events & POLLOUT)
 1428                                                 pfd->revents |= POLLOUT;
 1429                                         if (pfd->events & POLLWRNORM)
 1430                                                 pfd->revents |= POLLWRNORM;
 1431                                 }
 1432                                 break;
 1433                         case EVFILT_EXCEPT:
 1434                                 /*
 1435                                  * EV_NODATA should never be tagged for this
 1436                                  * filter.
 1437                                  */
 1438                                 if (pfd->events & POLLPRI)
 1439                                         pfd->revents |= POLLPRI;
 1440                                 if (pfd->events & POLLRDBAND)
 1441                                         pfd->revents |= POLLRDBAND;
 1442                                 break;
 1443                         }
 1444 
 1445                         if (nseldebug) {
 1446                                 kprintf("poll index %d/%d fd %d revents %08x\n",
 1447                                         pi, pkap->nfds, pfd->fd, pfd->revents);
 1448                         }
 1449 
 1450                         if (count_res && pfd->revents)
 1451                                 ++*res;
 1452                 } else {
 1453                         if (nseldebug) {
 1454                                 kprintf("poll index %d mismatch %ju/%d\n",
 1455                                         pi, (uintmax_t)kevp[i].ident, pfd->fd);
 1456                         }
 1457                 }
 1458         }
 1459 
 1460         return (0);
 1461 }
 1462 
 1463 static int
 1464 dopoll(int nfds, struct pollfd *fds, struct timespec *ts, int *res)
 1465 {
 1466         struct poll_kevent_copyin_args ka;
 1467         struct pollfd sfds[64];
 1468         int bytes;
 1469         int error;
 1470 
 1471         *res = 0;
 1472         if (nfds < 0)
 1473                 return (EINVAL);
 1474 
 1475         if (nfds == 0 && ts)
 1476                 return (dotimeout_only(ts));
 1477 
 1478         /*
 1479          * This is a bit arbitrary but we need to limit internal kmallocs.
 1480          */
 1481         if (nfds > maxfilesperproc * 2)
 1482                 nfds = maxfilesperproc * 2;
 1483         bytes = sizeof(struct pollfd) * nfds;
 1484 
 1485         ka.lwp = curthread->td_lwp;
 1486         ka.nfds = nfds;
 1487         ka.pfds = 0;
 1488         ka.error = 0;
 1489 
 1490         if (ka.nfds < 64)
 1491                 ka.fds = sfds;
 1492         else
 1493                 ka.fds = kmalloc(bytes, M_SELECT, M_WAITOK);
 1494 
 1495         error = copyin(fds, ka.fds, bytes);
 1496         if (error == 0)
 1497                 error = kern_kevent(&ka.lwp->lwp_kqueue, 0x7FFFFFFF, res, &ka,
 1498                                     poll_copyin, poll_copyout, ts);
 1499 
 1500         if (error == 0)
 1501                 error = copyout(ka.fds, fds, bytes);
 1502 
 1503         if (ka.fds != sfds)
 1504                 kfree(ka.fds, M_SELECT);
 1505 
 1506         ka.lwp->lwp_kqueue_serial += nfds;
 1507 
 1508         return (error);
 1509 }
 1510 
 1511 static int
 1512 socket_wait_copyin(void *arg, struct kevent *kevp, int maxevents, int *events)
 1513 {
 1514         return (0);
 1515 }
 1516 
 1517 static int
 1518 socket_wait_copyout(void *arg, struct kevent *kevp, int count, int *res)
 1519 {
 1520         ++*res;
 1521         return (0);
 1522 }
 1523 
 1524 extern  struct fileops socketops;
 1525 
 1526 /*
 1527  * NOTE: Callers of socket_wait() must already have a reference on the
 1528  *       socket.
 1529  */
 1530 int
 1531 socket_wait(struct socket *so, struct timespec *ts, int *res)
 1532 {
 1533         struct thread *td = curthread;
 1534         struct file *fp;
 1535         struct kqueue kq;
 1536         struct kevent kev;
 1537         int error, fd;
 1538 
 1539         if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
 1540                 return (error);
 1541 
 1542         fp->f_type = DTYPE_SOCKET;
 1543         fp->f_flag = FREAD | FWRITE;
 1544         fp->f_ops = &socketops;
 1545         fp->f_data = so;
 1546         fsetfd(td->td_lwp->lwp_proc->p_fd, fp, fd);
 1547 
 1548         kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
 1549         EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
 1550         if ((error = kqueue_register(&kq, &kev)) != 0) {
 1551                 fdrop(fp);
 1552                 return (error);
 1553         }
 1554 
 1555         error = kern_kevent(&kq, 1, res, NULL, socket_wait_copyin,
 1556                             socket_wait_copyout, ts);
 1557 
 1558         EV_SET(&kev, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
 1559         kqueue_register(&kq, &kev);
 1560         fp->f_ops = &badfileops;
 1561         fdrop(fp);
 1562 
 1563         return (error);
 1564 }
 1565 
 1566 /*
 1567  * OpenBSD poll system call.
 1568  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
 1569  *
 1570  * MPSAFE
 1571  */
 1572 int
 1573 sys_openbsd_poll(struct openbsd_poll_args *uap)
 1574 {
 1575         return (sys_poll((struct poll_args *)uap));
 1576 }
 1577 
 1578 /*ARGSUSED*/
 1579 int
 1580 seltrue(cdev_t dev, int events)
 1581 {
 1582         return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 1583 }

Cache object: ab58cac0a4a8f1ace2dd3870f30af6d0


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.