FreeBSD/Linux Kernel Cross Reference
sys/bsd/miscfs/specfs/spec_vnops.c


    1 /*
    2  * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
    5  * 
    6  * This file contains Original Code and/or Modifications of Original Code
    7  * as defined in and that are subject to the Apple Public Source License
    8  * Version 2.0 (the 'License'). You may not use this file except in
    9  * compliance with the License. The rights granted to you under the License
   10  * may not be used to create, or enable the creation or redistribution of,
   11  * unlawful or unlicensed copies of an Apple operating system, or to
   12  * circumvent, violate, or enable the circumvention or violation of, any
   13  * terms of an Apple operating system software license agreement.
   14  * 
   15  * Please obtain a copy of the License at
   16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
   17  * 
   18  * The Original Code and all software distributed under the License are
   19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   23  * Please see the License for the specific language governing rights and
   24  * limitations under the License.
   25  * 
   26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   27  */
   28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
   29 /*
   30  * Copyright (c) 1989, 1993, 1995
   31  *      The Regents of the University of California.  All rights reserved.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  * 3. All advertising materials mentioning features or use of this software
   42  *    must display the following acknowledgement:
   43  *      This product includes software developed by the University of
   44  *      California, Berkeley and its contributors.
   45  * 4. Neither the name of the University nor the names of its contributors
   46  *    may be used to endorse or promote products derived from this software
   47  *    without specific prior written permission.
   48  *
   49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   59  * SUCH DAMAGE.
   60  *
   61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
   62  */
   63 
   64 #include <sys/param.h>
   65 #include <sys/proc_internal.h>
   66 #include <sys/kauth.h>
   67 #include <sys/systm.h>
   68 #include <sys/kernel.h>
   69 #include <sys/conf.h>
   70 #include <sys/buf_internal.h>
   71 #include <sys/mount_internal.h>
   72 #include <sys/vnode_internal.h>
   73 #include <sys/file_internal.h>
   74 #include <sys/namei.h>
   75 #include <sys/stat.h>
   76 #include <sys/errno.h>
   77 #include <sys/ioctl.h>
   78 #include <sys/file.h>
   79 #include <sys/user.h>
   80 #include <sys/malloc.h>
   81 #include <sys/disk.h>
   82 #include <sys/uio_internal.h>
   83 #include <sys/resource.h>
   84 #include <miscfs/specfs/specdev.h>
   85 #include <vfs/vfs_support.h>
   86 #include <kern/assert.h>
   87 #include <kern/task.h>
   88 
   89 #include <sys/kdebug.h>
   90 
    91 /* XXX following prototypes should be in a header file somewhere */
   92 extern dev_t    chrtoblk(dev_t dev);
   93 extern int      iskmemdev(dev_t dev);
   94 extern int      bpfkqfilter(dev_t dev, struct knote *kn);
   95 extern int      ptsd_kqfilter(dev_t dev, struct knote *kn);
   96 
   97 struct vnode *speclisth[SPECHSZ];
   98 
   99 /* symbolic sleep message strings for devices */
  100 char    devopn[] = "devopn";
  101 char    devio[] = "devio";
  102 char    devwait[] = "devwait";
  103 char    devin[] = "devin";
  104 char    devout[] = "devout";
  105 char    devioc[] = "devioc";
  106 char    devcls[] = "devcls";
  107 
  108 #define VOPFUNC int (*)(void *)
  109 
  110 int (**spec_vnodeop_p)(void *);
  111 struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
  112         { &vnop_default_desc, (VOPFUNC)vn_default_error },
  113         { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
  114         { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
  115         { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
  116         { &vnop_open_desc, (VOPFUNC)spec_open },                        /* open */
  117         { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
  118         { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
  119         { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
  120         { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
  121         { &vnop_read_desc, (VOPFUNC)spec_read },                        /* read */
  122         { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
  123         { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
  124         { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
  125         { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
  126         { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
  127         { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
  128         { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
  129         { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
  130         { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
  131         { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
  132         { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
  133         { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
  134         { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
  135         { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
  136         { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
  137         { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
  138         { &vnop_strategy_desc, (VOPFUNC)spec_strategy },                /* strategy */
  139         { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },                /* pathconf */
  140         { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
  141         { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
  142         { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
  143         { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
  144         { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
  145         { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },                /* blktooff */
  146         { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },                /* offtoblk */
  147         { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },                /* blockmap */
  148         { (struct vnodeop_desc*)NULL, (int(*)())NULL }
  149 };
  150 struct vnodeopv_desc spec_vnodeop_opv_desc =
  151         { &spec_vnodeop_p, spec_vnodeop_entries };
  152 
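The table above pairs vnode-operation descriptors with handler functions; at registration time the VFS walks such tables to populate the spec_vnodeop_p vector that the VNOP_* macros dispatch through. A minimal userspace sketch of that descriptor-table pattern follows (plain C, not XNU code; every name here is a hypothetical stand-in):

#include <stdio.h>

typedef int (*opfunc_t)(void *);

struct op_desc   { int vdesc_offset; const char *vdesc_name; };
struct opv_entry { struct op_desc *opve_op; opfunc_t opve_impl; };

static struct op_desc open_desc  = { 0, "open"  };
static struct op_desc close_desc = { 1, "close" };

static int my_open(void *arg)  { (void)arg; puts("open handler");  return 0; }
static int my_close(void *arg) { (void)arg; puts("close handler"); return 0; }

/* Table in the same shape as spec_vnodeop_entries, NULL-terminated. */
static struct opv_entry entries[] = {
        { &open_desc,  my_open  },
        { &close_desc, my_close },
        { NULL, NULL }
};

int main(void)
{
        opfunc_t vector[2] = { NULL, NULL };

        /* Registration pass: index each handler by its descriptor's offset. */
        for (struct opv_entry *e = entries; e->opve_op != NULL; e++)
                vector[e->opve_op->vdesc_offset] = e->opve_impl;

        /* Dispatch, as a VNOP_OPEN()/VNOP_CLOSE() macro would. */
        vector[open_desc.vdesc_offset](NULL);
        vector[close_desc.vdesc_offset](NULL);
        return 0;
}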
  153 
  154 static void set_blocksize(vnode_t, dev_t);
  155 
  156 
  157 /*
  158  * Trivial lookup routine that always fails.
  159  */
  160 int
  161 spec_lookup(struct vnop_lookup_args *ap)
  162 {
  163 
  164         *ap->a_vpp = NULL;
  165         return (ENOTDIR);
  166 }
  167 
  168 static void
  169 set_blocksize(struct vnode *vp, dev_t dev)
  170 {
  171     int (*size)(dev_t);
  172     int rsize;
  173 
  174     if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
  175         rsize = (*size)(dev);
  176         if (rsize <= 0)        /* did size fail? */
  177             vp->v_specsize = DEV_BSIZE;
  178         else
  179             vp->v_specsize = rsize;
  180     }
  181     else
  182             vp->v_specsize = DEV_BSIZE;
  183 }
  184 
  185 void
  186 set_fsblocksize(struct vnode *vp)
  187 {
  188         
  189         if (vp->v_type == VBLK) {
  190                 dev_t dev = (dev_t)vp->v_rdev;
  191                 int maj = major(dev);
  192 
  193                 if ((u_int)maj >= (u_int)nblkdev)
  194                         return;
  195 
  196                 vnode_lock(vp);
  197                 set_blocksize(vp, dev);
  198                 vnode_unlock(vp);
  199         }
  200 
  201 }
  202 
  203 
  204 /*
  205  * Open a special file.
  206  */
  207 int
  208 spec_open(struct vnop_open_args *ap)
  209 {
  210         struct proc *p = vfs_context_proc(ap->a_context);
  211         kauth_cred_t cred = vfs_context_ucred(ap->a_context);
  212         struct vnode *vp = ap->a_vp;
  213         dev_t bdev, dev = (dev_t)vp->v_rdev;
  214         int maj = major(dev);
  215         int error;
  216 
  217         /*
  218          * Don't allow open if fs is mounted -nodev.
  219          */
  220         if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
  221                 return (ENXIO);
  222 
  223         switch (vp->v_type) {
  224 
  225         case VCHR:
  226                 if ((u_int)maj >= (u_int)nchrdev)
  227                         return (ENXIO);
  228                 if (cred != FSCRED && (ap->a_mode & FWRITE)) {
  229                         /*
  230                          * When running in very secure mode, do not allow
  231                          * opens for writing of any disk character devices.
  232                          */
  233                         if (securelevel >= 2 && isdisk(dev, VCHR))
  234                                 return (EPERM);
  235                         /*
  236                          * When running in secure mode, do not allow opens
  237                          * for writing of /dev/mem, /dev/kmem, or character
  238                          * devices whose corresponding block devices are
  239                          * currently mounted.
  240                          */
  241                         if (securelevel >= 1) {
  242                                 if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
  243                                         return (error);
  244                                 if (iskmemdev(dev))
  245                                         return (EPERM);
  246                         }
  247                 }
  248                 if (cdevsw[maj].d_type == D_TTY) {
  249                         vnode_lock(vp);
  250                         vp->v_flag |= VISTTY;
  251                         vnode_unlock(vp);
  252                 }
  253                 
  254                 devsw_lock(dev, S_IFCHR);
  255                 error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
  256 
  257                 if (error == 0) {
  258                         vp->v_specinfo->si_opencount++;
  259                 }
  260 
  261                 devsw_unlock(dev, S_IFCHR);
  262                 return (error);
  263 
  264         case VBLK:
  265                 if ((u_int)maj >= (u_int)nblkdev)
  266                         return (ENXIO);
  267                 /*
  268                  * When running in very secure mode, do not allow
  269                  * opens for writing of any disk block devices.
  270                  */
  271                 if (securelevel >= 2 && cred != FSCRED &&
  272                     (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
  273                         return (EPERM);
  274                 /*
  275                  * Do not allow opens of block devices that are
  276                  * currently mounted.
  277                  */
  278                 if ( (error = vfs_mountedon(vp)) )
  279                         return (error);
  280 
  281                 devsw_lock(dev, S_IFBLK);
  282                 error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
  283                 if (!error) {
  284                         vp->v_specinfo->si_opencount++;
  285                 }
  286                 devsw_unlock(dev, S_IFBLK);
  287 
  288                 if (!error) {
  289                     u_int64_t blkcnt;
  290                     u_int32_t blksize;
  291                         int setsize = 0;
  292                         u_int32_t size512 = 512;
  293 
  294 
   295                     if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
   296                                 /* Switch to 512 byte sectors (temporarily) */
   297 
   298                                 if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
   299                                         /* Get the number of 512 byte physical blocks. */
   300                                         if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
   301                                                 setsize = 1;
   302                                         }
   303                                 }
   304                                 /* If the size can't be set back, we can't recover */
   305                                 if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
   306                                         error = ENXIO;
   307                     }
  308 
  309 
  310                         vnode_lock(vp);
  311                     set_blocksize(vp, dev);
  312 
  313                     /*
  314                      * Cache the size in bytes of the block device for later
  315                      * use by spec_write().
  316                      */
  317                         if (setsize)
  318                                 vp->v_specdevsize = blkcnt * (u_int64_t)size512;
  319                         else
   320                                 vp->v_specdevsize = (u_int64_t)0;       /* Default: Can't get */
  321                         
  322                         vnode_unlock(vp);
  323 
  324                 }
  325                 return(error);
  326         default:
  327                 panic("spec_open type");
  328         }
  329         return (0);
  330 }
  331 
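The DKIOCGETBLOCKSIZE / DKIOCSETBLOCKSIZE / DKIOCGETBLOCKCOUNT dance in spec_open() computes the device's capacity in bytes for v_specdevsize. The two query ioctls can also be issued from userspace on macOS; a hedged sketch follows (assumes <sys/disk.h>, a hypothetical /dev/disk0 path, and read permission on the device node):

#include <sys/disk.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/dev/disk0";        /* hypothetical device node */
        uint32_t blksize = 0;
        uint64_t blkcnt = 0;

        int fd = open(path, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        if (ioctl(fd, DKIOCGETBLOCKSIZE, &blksize) == -1 ||
            ioctl(fd, DKIOCGETBLOCKCOUNT, &blkcnt) == -1) {
                perror("ioctl");
                close(fd);
                return 1;
        }
        /* The same product spec_open() caches in v_specdevsize. */
        printf("%u-byte sectors x %llu = %llu bytes\n",
               blksize, (unsigned long long)blkcnt,
               (unsigned long long)blkcnt * blksize);
        close(fd);
        return 0;
}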
  332 /*
  333  * Vnode op for read
  334  */
  335 int
  336 spec_read(struct vnop_read_args *ap)
  337 {
  338         struct vnode *vp = ap->a_vp;
  339         struct uio *uio = ap->a_uio;
  340         struct buf *bp;
  341         daddr64_t bn, nextbn;
  342         long bsize, bscale;
  343         int devBlockSize=0;
  344         int n, on;
  345         int error = 0;
  346         dev_t dev;
  347 
  348 #if DIAGNOSTIC
  349         if (uio->uio_rw != UIO_READ)
  350                 panic("spec_read mode");
  351         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
  352                 panic("spec_read proc");
  353 #endif
  354         if (uio_resid(uio) == 0)
  355                 return (0);
  356 
  357         switch (vp->v_type) {
  358 
  359         case VCHR:
  360                 error = (*cdevsw[major(vp->v_rdev)].d_read)
  361                         (vp->v_rdev, uio, ap->a_ioflag);
  362                 return (error);
  363 
  364         case VBLK:
  365                 if (uio->uio_offset < 0)
  366                         return (EINVAL);
  367 
  368                 dev = vp->v_rdev;
  369 
  370                 devBlockSize = vp->v_specsize;
  371 
  372                 if (devBlockSize > PAGE_SIZE) 
  373                         return (EINVAL);
  374 
  375                 bscale = PAGE_SIZE / devBlockSize;
  376                 bsize = bscale * devBlockSize;
  377 
  378                 do {
  379                         on = uio->uio_offset % bsize;
  380 
  381                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
  382                         
  383                         if (vp->v_speclastr + bscale == bn) {
  384                                 nextbn = bn + bscale;
  385                                 error = buf_breadn(vp, bn, (int)bsize, &nextbn,
  386                                                (int *)&bsize, 1, NOCRED, &bp);
  387                         } else
  388                                 error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);
  389 
  390                         vnode_lock(vp);
  391                         vp->v_speclastr = bn;
  392                         vnode_unlock(vp);
  393 
  394                         n = bsize - buf_resid(bp);
  395                         if ((on > n) || error) {
  396                                 if (!error)
  397                                         error = EINVAL;
  398                                 buf_brelse(bp);
  399                                 return (error);
  400                         }
  401                         n = min((unsigned)(n  - on), uio_resid(uio));
  402 
  403                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
  404                         if (n + on == bsize)
  405                                 buf_markaged(bp);
  406                         buf_brelse(bp);
  407                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
  408                 return (error);
  409 
  410         default:
  411                 panic("spec_read type");
  412         }
  413         /* NOTREACHED */
  414 
  415         return (0);
  416 }
  417 
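To make the VBLK arithmetic in spec_read() concrete: with 512-byte sectors and 4 KB pages, bscale is 8 and bsize is 4096, so each uio_offset decomposes into a page-aligned block number bn and an intra-buffer offset on. A small self-contained check of that decomposition (plain userspace C, no kernel dependencies):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const int64_t devBlockSize = 512, page_size = 4096;
        const int64_t bscale = page_size / devBlockSize;    /* 8 */
        const int64_t bsize = bscale * devBlockSize;        /* 4096 */
        int64_t offsets[] = { 0, 513, 4096, 6000 };

        for (int i = 0; i < 4; i++) {
                int64_t off = offsets[i];
                int64_t on = off % bsize;                     /* offset in buffer */
                int64_t bn = (off / devBlockSize) & ~(bscale - 1);
                printf("offset %5lld -> bn %3lld, on %4lld\n",
                       (long long)off, (long long)bn, (long long)on);
        }
        return 0;
}

For example, offset 6000 lands in device block 11, which rounds down to the page-aligned block 8 with on = 1904.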
  418 /*
  419  * Vnode op for write
  420  */
  421 int
  422 spec_write(struct vnop_write_args *ap)
  423 {
  424         struct vnode *vp = ap->a_vp;
  425         struct uio *uio = ap->a_uio;
  426         struct buf *bp;
  427         daddr64_t bn;
  428         int bsize, blkmask, bscale;
  429         int io_sync;
  430         int devBlockSize=0;
  431         int n, on;
  432         int error = 0;
  433         dev_t dev;
  434 
  435 #if DIAGNOSTIC
  436         if (uio->uio_rw != UIO_WRITE)
  437                 panic("spec_write mode");
  438         if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
  439                 panic("spec_write proc");
  440 #endif
  441 
  442         switch (vp->v_type) {
  443 
  444         case VCHR:
  445                 error = (*cdevsw[major(vp->v_rdev)].d_write)
  446                         (vp->v_rdev, uio, ap->a_ioflag);
  447                 return (error);
  448 
  449         case VBLK:
  450                 if (uio_resid(uio) == 0)
  451                         return (0);
  452                 if (uio->uio_offset < 0)
  453                         return (EINVAL);
  454 
  455                 io_sync = (ap->a_ioflag & IO_SYNC);
  456 
  457                 dev = (vp->v_rdev);
  458 
  459                 devBlockSize = vp->v_specsize;
  460                 if (devBlockSize > PAGE_SIZE)
  461                         return(EINVAL);
  462 
  463                 bscale = PAGE_SIZE / devBlockSize;
  464                 blkmask = bscale - 1;
  465                 bsize = bscale * devBlockSize;
  466                 
  467 
  468                 do {
  469                         bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
  470                         on = uio->uio_offset % bsize;
  471 
  472                         n = min((unsigned)(bsize - on), uio_resid(uio));
  473 
   474                         /*
   475                          * Use buf_getblk() as an optimization IFF:
   476                          *
   477                          * 1)   We are writing exactly a block on a block
   478                          *      aligned boundary (no prior read needed)
   479                          * 2)   We know the size of the device from spec_open
   480                          * 3)   The write doesn't span the end of the device
   481                          *
   482                          * Otherwise, we fall back on buf_bread().
   483                          */
   484                         if (n == bsize &&
   485                             vp->v_specdevsize != (u_int64_t)0 &&
   486                             (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
   487                             /* reduce the size of the write to what is there */
   488                             n = (int)(vp->v_specdevsize - uio->uio_offset);
   489                         }
  490 
  491                         if (n == bsize)
  492                                 bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
  493                         else
  494                                 error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);
  495 
  496                         /* Translate downstream error for upstream, if needed */
  497                         if (!error)
  498                                 error = (int)buf_error(bp);
  499                         if (error) {
  500                                 buf_brelse(bp);
  501                                 return (error);
  502                         }
  503                         n = min(n, bsize - buf_resid(bp));
  504 
  505                         error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
  506                         if (error) {
  507                                 buf_brelse(bp);
  508                                 return (error);
  509                         }
  510                         buf_markaged(bp);
  511 
  512                         if (io_sync) 
  513                                 error = buf_bwrite(bp);
  514                         else {
  515                                 if ((n + on) == bsize)
  516                                         error = buf_bawrite(bp);
  517                                 else
  518                                         error = buf_bdwrite(bp);
  519                         }
  520                 } while (error == 0 && uio_resid(uio) > 0 && n != 0);
  521                 return (error);
  522 
  523         default:
  524                 panic("spec_write type");
  525         }
  526         /* NOTREACHED */
  527 
  528         return (0);
  529 }
  530 
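The heart of spec_write() is the choice between buf_getblk() (full, aligned block: nothing needs reading before the overwrite) and buf_bread() (partial block: read-modify-write). The toy userspace model below imitates that decision against an in-memory "device"; the buffer-cache calls are simulated with memcpy and every name is illustrative:

#include <stdio.h>
#include <string.h>

#define BSIZE 16

static char device[2 * BSIZE + 1];      /* two-block toy device, NUL-terminated */

static void write_span(long off, const char *data, int n)
{
        long bn = off / BSIZE;          /* block number, as in spec_write() */
        int on = (int)(off % BSIZE);    /* offset within the block buffer */
        char block[BSIZE];

        if (on == 0 && n == BSIZE) {
                /* buf_getblk() path: whole block replaced, skip the read */
                memcpy(block, data, BSIZE);
                puts("full aligned block: no read needed");
        } else {
                /* buf_bread() path: read-modify-write of a partial block */
                memcpy(block, device + bn * BSIZE, BSIZE);
                memcpy(block + on, data, n);
                puts("partial block: read-modify-write");
        }
        memcpy(device + bn * BSIZE, block, BSIZE);
}

int main(void)
{
        memset(device, '.', 2 * BSIZE);
        write_span(0, "0123456789abcdef", BSIZE);   /* aligned, full block */
        write_span(20, "XY", 2);                    /* unaligned, partial */
        printf("device: %s\n", device);
        return 0;
}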
  531 /*
  532  * Device ioctl operation.
  533  */
  534 int
  535 spec_ioctl(struct vnop_ioctl_args *ap)
  536 {
  537         proc_t p = vfs_context_proc(ap->a_context);
  538         dev_t dev = ap->a_vp->v_rdev;
  539         int     retval = 0;
  540 
  541         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
  542                               (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);
  543 
  544         switch (ap->a_vp->v_type) {
  545 
  546         case VCHR:
  547                 retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
  548                                                        ap->a_fflag, p);
  549                 break;
  550 
  551         case VBLK:
  552                 retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
  553                                                        ap->a_fflag, p);
  554                 break;
  555 
  556         default:
  557                 panic("spec_ioctl");
  558                 /* NOTREACHED */
  559         }
  560         KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
  561                               (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);
  562 
  563         return (retval);
  564 }
  565 
  566 int
  567 spec_select(struct vnop_select_args *ap)
  568 {
  569         proc_t p = vfs_context_proc(ap->a_context);
  570         dev_t dev;
  571 
  572         switch (ap->a_vp->v_type) {
  573 
  574         default:
  575                 return (1);             /* XXX */
  576 
  577         case VCHR:
  578                 dev = ap->a_vp->v_rdev;
  579                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
  580         }
  581 }
  582 
  583 static int filt_specattach(struct knote *kn);
  584 
  585 int
  586 spec_kqfilter(vnode_t vp, struct knote *kn)
  587 {
  588         dev_t dev;
  589         int err = EINVAL;
  590 
  591         /*
  592          * For a few special kinds of devices, we can attach knotes.
  593          * Each filter function must check whether the dev type matches it.
  594          */
  595         dev = vnode_specrdev(vp);
  596 
  597         if (vnode_istty(vp)) {
  598                 /* We can hook into TTYs... */
  599                 err = filt_specattach(kn);
  600         } else {
  601                 /* Try a bpf device, as defined in bsd/net/bpf.c */
  602                 err = bpfkqfilter(dev, kn);
  603         }
  604 
  605         return err;
  606 }
  607 
  608 /*
  609  * Synch buffers associated with a block device
  610  */
  611 int
  612 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
  613 {
  614         if (vp->v_type == VCHR)
  615                 return (0);
  616         /*
  617          * Flush all dirty buffers associated with a block device.
  618          */
  619         buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");
  620 
  621         return (0);
  622 }
  623 
  624 int
  625 spec_fsync(struct vnop_fsync_args *ap)
  626 {
  627         return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
  628 }
  629 
  630 /*
  631  * Just call the device strategy routine
  632  */
  633 extern int hard_throttle_on_root;
  634 void IOSleep(int);
  635 
   636 // a low priority process may wait for at most LOWPRI_MAX_WAITING_MSECS milliseconds
  637 #define LOWPRI_INITIAL_WINDOW_MSECS 100
  638 #define LOWPRI_WINDOW_MSECS_INC 50
  639 #define LOWPRI_MAX_WINDOW_MSECS 200
  640 #define LOWPRI_MAX_WAITING_MSECS 200
  641 
  642 #if CONFIG_EMBEDDED
  643 #define LOWPRI_SLEEP_INTERVAL 5
  644 #else
  645 #define LOWPRI_SLEEP_INTERVAL 2
  646 #endif
  647 
  648 struct _throttle_io_info_t {
  649         struct timeval  last_normal_IO_timestamp;
  650         struct timeval  last_IO_timestamp;
  651         SInt32 numthreads_throttling;
  652         SInt32 refcnt;
  653         SInt32 alloc;
  654 };
  655 
  656 struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];
  657 int     lowpri_IO_initial_window_msecs  = LOWPRI_INITIAL_WINDOW_MSECS;
  658 int     lowpri_IO_window_msecs_inc  = LOWPRI_WINDOW_MSECS_INC;
  659 int     lowpri_max_window_msecs  = LOWPRI_MAX_WINDOW_MSECS;
  660 int     lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
  661 
  662 #if 0 
   663 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)  \
   664         do {                                                    \
   665                 if ((debug_info)->alloc)                        \
   666                         printf("%s: "format, __FUNCTION__, ## args); \
   667         } while(0)
  668 
  669 #else 
  670 #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
  671 #endif
  672 
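DEBUG_ALLOC_THROTTLE_INFO above uses the GNU named-variadic-macro form (args... plus ## to swallow the trailing comma when no extra arguments are passed), gating the printf on the structure's alloc flag. A standalone compile-and-run sketch of the same pattern (GCC/Clang extension, which matches how XNU is built; names here are illustrative):

#include <stdio.h>

struct info { int alloc; };

#define DEBUG_INFO(format, debug_info, args...)                       \
        do {                                                          \
                if ((debug_info)->alloc)                              \
                        printf("%s: " format, __FUNCTION__, ## args); \
        } while (0)

int main(void)
{
        struct info heap_info   = { 1 };
        struct info static_info = { 0 };

        DEBUG_INFO("refcnt = %d info = %p\n", &heap_info, 3, (void *)&heap_info);
        DEBUG_INFO("never printed\n", &static_info);  /* alloc == 0: gated off */
        return 0;
}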
  673 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  674 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
  675 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  676 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  677 
  678 /*
  679  * throttled I/O helper function
  680  * convert the index of the lowest set bit to a device index
  681  */
  682 int
  683 num_trailing_0(uint64_t n)
  684 {
   685         /*
   686          * Since in most cases the number of trailing 0s is very small,
   687          * we simply count sequentially from the lowest bit.
   688          */
  689         if (n == 0)
  690                 return sizeof(n) * 8;
  691         int count = 0;
  692         while (!ISSET(n, 1)) {
  693                 n >>= 1;
  694                 ++count;
  695         }
  696         return count;
  697 }
  698 
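num_trailing_0() is a sequential count-trailing-zeros; throttle_info_ref_by_mask() below uses it to turn the lowest set bit of a throttle mask into a device index. A quick userspace cross-check against the GCC/Clang intrinsic __builtin_ctzll (an assumption about the toolchain, not something this file uses):

#include <stdio.h>
#include <stdint.h>

static int num_trailing_0(uint64_t n)
{
        if (n == 0)
                return sizeof(n) * 8;
        int count = 0;
        while (!(n & 1)) {
                n >>= 1;
                ++count;
        }
        return count;
}

int main(void)
{
        uint64_t masks[] = { 0x1, 0x8, 0x40, 1ULL << 63 };
        for (int i = 0; i < 4; i++)
                printf("mask %#llx -> dev_index %d (builtin %d)\n",
                       (unsigned long long)masks[i],
                       num_trailing_0(masks[i]),
                       __builtin_ctzll(masks[i]));
        return 0;
}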
  699 /*
  700  * Release the reference and if the item was allocated and this is the last
  701  * reference then free it.
  702  *
  703  * This routine always returns the old value.
  704  */
  705 static int
  706 throttle_info_rel(struct _throttle_io_info_t *info)
  707 {
  708         SInt32 oldValue = OSDecrementAtomic(&info->refcnt);
  709 
  710         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n", 
  711                 info, (int)(oldValue -1), info );
  712 
  713         /* The reference count just went negative, very bad */
  714         if (oldValue == 0)
  715                 panic("throttle info ref cnt went negative!");
  716 
  717         /* 
  718          * Once reference count is zero, no one else should be able to take a 
  719          * reference 
  720          */
  721         if ((info->refcnt == 0) && (info->alloc)) {
  722                 DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info, info );
  723                 FREE(info, M_TEMP); 
  724         }
  725         return oldValue;
  726 }
  727 
  728 /*
  729  * Just take a reference on the throttle info structure.
  730  *
  731  * This routine always returns the old value.
  732  */
  733 static SInt32
  734 throttle_info_ref(struct _throttle_io_info_t *info)
  735 {
  736         SInt32 oldValue = OSIncrementAtomic(&info->refcnt);
  737 
   738         DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
   739                 info, (int)(oldValue + 1), info );
  740         /* Allocated items should never have a reference of zero */
  741         if (info->alloc && (oldValue == 0))
  742                 panic("Taking a reference without calling create throttle info!\n");
  743 
  744         return oldValue;
  745 }
  746 
  747 /*
  748  * KPI routine
  749  *
  750  * Create and take a reference on a throttle info structure and return a
  751  * pointer for the file system to use when calling throttle_info_update.
  752  * Calling file system must have a matching release for every create.
  753  */
  754 void *
  755 throttle_info_create(void)
  756 {
  757         struct _throttle_io_info_t *info; 
  758 
  759         MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
  760         /* Should never happen but just in case */
  761         if (info == NULL)
  762                 return NULL;
  763         /* Mark that this one was allocated and needs to be freed */
  764         DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info );
  765         info->alloc = TRUE;
  766         /* Take a reference */
  767         OSIncrementAtomic(&info->refcnt);
  768         return info;
  769 }
  770 
  771 /*
  772  * KPI routine
  773  *
   774  * Release the throttle info pointer if all the references are gone. Should be
   775  * called to release the reference taken by throttle_info_create.
   776  */
  777 void
  778 throttle_info_release(void *throttle_info)
  779 {
   780         DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
  781                 (struct _throttle_io_info_t *)throttle_info,
  782                 (struct _throttle_io_info_t *)throttle_info);
  783         if (throttle_info) /* Just to be careful */
  784                 throttle_info_rel(throttle_info);
  785 }
  786 
  787 /*
  788  * KPI routine
  789  *
   790  * File systems that create an info structure need to call this routine in
   791  * their mount routine (used by cluster code). File systems that call this in
   792  * their mount routines must call throttle_info_mount_rel in their unmount
   793  * routines.
   794  */
  795 void 
  796 throttle_info_mount_ref(mount_t mp, void *throttle_info)
  797 {
  798         if ((throttle_info == NULL) || (mp == NULL))
  799                 return;
  800         throttle_info_ref(throttle_info);
   801         /* If the mount already holds a reference, release it before adding the new one */
  802         if (mp->mnt_throttle_info)
  803                 throttle_info_rel(mp->mnt_throttle_info);
  804         mp->mnt_throttle_info = throttle_info;
  805 }
  806 
  807 /*
  808  * Private KPI routine
  809  *
  810  * return a handle for accessing throttle_info given a throttle_mask.  The
  811  * handle must be released by throttle_info_rel_by_mask
  812  */
  813 int
  814 throttle_info_ref_by_mask(uint64_t throttle_mask,
  815                                                   throttle_info_handle_t *throttle_info_handle)
  816 {
  817         int dev_index;
  818         struct _throttle_io_info_t *info;
  819 
  820         if (throttle_info_handle == NULL)
  821                 return EINVAL;
  822         
  823         dev_index = num_trailing_0(throttle_mask);
  824         info = &_throttle_io_info[dev_index];
  825         throttle_info_ref(info);
  826         *(struct _throttle_io_info_t**)throttle_info_handle = info;
  827         return 0;
  828 }
  829 
  830 /*
  831  * Private KPI routine
  832  *
  833  * release the handle obtained by throttle_info_ref_by_mask
  834  */
  835 void
  836 throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
  837 {
  838         /* for now the handle is just a pointer to _throttle_io_info_t */
  839         throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
  840 }
  841 
  842 /*
  843  * KPI routine
  844  *
   845  * File systems that call throttle_info_mount_ref must call this routine in
   846  * their unmount routine.
   847  */
  848 void
  849 throttle_info_mount_rel(mount_t mp)
  850 {
  851         if (mp->mnt_throttle_info)
  852                 throttle_info_rel(mp->mnt_throttle_info);
  853         mp->mnt_throttle_info = NULL;
  854 }
  855 
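Putting the KPI comments above together, the intended lifecycle is: throttle_info_create() at mount takes the creator's reference, throttle_info_mount_ref() takes (and swaps in) the mount's reference, and throttle_info_mount_rel() plus throttle_info_release() drop them at unmount, freeing on the last release. A userspace model of that refcount choreography (hypothetical stand-in names, single-threaded, no atomics):

#include <stdio.h>
#include <stdlib.h>

struct tinfo      { int refcnt; };
struct mount_stub { struct tinfo *mnt_throttle_info; };

static struct tinfo *info_create(void)
{
        struct tinfo *i = calloc(1, sizeof(*i));
        i->refcnt = 1;                          /* creator's reference */
        return i;
}

static void info_rel(struct tinfo *i)
{
        if (--i->refcnt == 0) { puts("freeing info"); free(i); }
}

static void info_mount_ref(struct mount_stub *mp, struct tinfo *i)
{
        i->refcnt++;
        if (mp->mnt_throttle_info)              /* drop any prior reference */
                info_rel(mp->mnt_throttle_info);
        mp->mnt_throttle_info = i;
}

static void info_mount_rel(struct mount_stub *mp)
{
        if (mp->mnt_throttle_info)
                info_rel(mp->mnt_throttle_info);
        mp->mnt_throttle_info = NULL;
}

int main(void)
{
        struct mount_stub mp = { NULL };
        struct tinfo *i = info_create();        /* fs mount path: refcnt == 1 */
        info_mount_ref(&mp, i);                 /* refcnt == 2 */
        info_mount_rel(&mp);                    /* unmount: refcnt == 1 */
        info_rel(i);                            /* matching release: freed */
        return 0;
}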
  856 void
  857 throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
  858 {
  859         struct _throttle_io_info_t *info;
  860 
  861         if (mp == NULL)
  862             info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
  863         else if (mp->mnt_throttle_info == NULL)
  864             info = &_throttle_io_info[mp->mnt_devbsdunit];
  865         else
  866             info = mp->mnt_throttle_info;
  867 
  868         *tv = info->last_IO_timestamp;
  869 }
  870 
  871 void
  872 update_last_io_time(mount_t mp)
  873 {
  874         struct _throttle_io_info_t *info;
  875                 
  876         if (mp == NULL)
  877             info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
  878         else if (mp->mnt_throttle_info == NULL)
  879             info = &_throttle_io_info[mp->mnt_devbsdunit];
  880         else
  881             info = mp->mnt_throttle_info;
  882 
  883         microuptime(&info->last_IO_timestamp);
  884 }
  885 
  886 
  887 #if CONFIG_EMBEDDED
  888 
  889 int throttle_get_io_policy(struct uthread **ut)
  890 {
  891         int policy = IOPOL_DEFAULT;
  892         proc_t p = current_proc();
  893 
  894         *ut = get_bsdthread_info(current_thread());
  895                 
  896         if (p != NULL)
  897                 policy = p->p_iopol_disk;
  898 
  899         if (*ut != NULL) {
  900                 // the I/O policy of the thread overrides that of the process
  901                 // unless the I/O policy of the thread is default
  902                 if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT)
  903                         policy = (*ut)->uu_iopol_disk;
  904         }
  905         return policy;
  906 }
  907 #else
  908 
  909 int throttle_get_io_policy(__unused struct uthread **ut)
  910 {
  911         *ut = get_bsdthread_info(current_thread());
  912 
  913         return (proc_get_task_selfdiskacc());
  914 }
  915 #endif
  916 
  917 
  918 static int
  919 throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info)
  920 {
  921         struct _throttle_io_info_t *info = throttle_info;
  922         struct timeval elapsed;
  923         int elapsed_msecs;
  924         int policy;
  925         struct uthread  *ut;
  926 
  927         policy = throttle_get_io_policy(&ut);
  928 
  929         if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE)
  930                 return (0);
  931 
  932         microuptime(&elapsed);
  933         timevalsub(&elapsed, &info->last_normal_IO_timestamp);
  934         elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
  935 
  936         if (lowpri_window_msecs == -1) // use the max waiting time
  937                 lowpri_window_msecs = lowpri_max_waiting_msecs;
  938 
  939         return elapsed_msecs < lowpri_window_msecs;
  940 }
  941 
  942 /* 
  943  * If we have a mount point and it has a throttle info pointer then
  944  * use it to do the check, otherwise use the device unit number to find
  945  * the correct throttle info array element.
  946  */
  947 int
  948 throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp)
  949 {
  950         void *info;
  951 
   952         /* With no mount point, fall back to the catch-all device slot */
  953         if (mp == NULL)
  954             info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
  955         else if (mp->mnt_throttle_info == NULL)
  956             info = &_throttle_io_info[mp->mnt_devbsdunit];
  957         else
  958             info = mp->mnt_throttle_info;
  959         return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info);
  960 }
  961 
  962 uint32_t
  963 throttle_lowpri_io(int sleep_amount)
  964 {
  965         int sleep_cnt = 0;
  966         int numthreads_throttling;
  967         int max_try_num;
  968         struct uthread *ut;
  969         struct _throttle_io_info_t *info;
  970         int max_waiting_msecs;
  971 
  972         ut = get_bsdthread_info(current_thread());
  973 
  974         if ((ut->uu_lowpri_window == 0) || (ut->uu_throttle_info == NULL))
  975                 goto done;
  976 
  977         info = ut->uu_throttle_info;
  978 
  979         if (sleep_amount != 0) {
  980 #if CONFIG_EMBEDDED
  981                 max_waiting_msecs = lowpri_max_waiting_msecs;
  982 #else 
  983                 if (ut->uu_throttle_isssd == TRUE)
  984                         max_waiting_msecs = lowpri_max_waiting_msecs / 100;
  985                 else
  986                         max_waiting_msecs = lowpri_max_waiting_msecs;
  987 #endif
  988                 if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL)
  989                         max_waiting_msecs = LOWPRI_SLEEP_INTERVAL;
  990 
  991                 numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1;
  992                 max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling);
  993 
  994                 for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) {
  995                         if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) {
  996                                 if (sleep_cnt == 0) {
  997                                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
  998                                                               ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0);
  999                                 }
 1000                                 IOSleep(LOWPRI_SLEEP_INTERVAL);
 1001                                 DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info );
 1002                         } else {
 1003                                 break;
 1004                         }
 1005                 }
 1006                 if (sleep_cnt) {
 1007                         KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
 1008                                               ut->uu_lowpri_window, sleep_cnt, 0, 0, 0);
 1009                 }
 1010         }
 1011         SInt32 oldValue;
 1012         oldValue = OSDecrementAtomic(&info->numthreads_throttling);
 1013 
 1014         if (oldValue <= 0) {
 1015                 panic("%s: numthreads negative", __func__);
 1016         }
 1017 done:
 1018         ut->uu_lowpri_window = 0;
 1019         if (ut->uu_throttle_info)
 1020                 throttle_info_rel(ut->uu_throttle_info);
 1021         ut->uu_throttle_info = NULL;
 1022         ut->uu_throttle_bc = FALSE;
 1023 
 1024         return (sleep_cnt * LOWPRI_SLEEP_INTERVAL);
 1025 }
 1026 
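For a feel of the sleep budget computed above: with the non-embedded defaults in this file (LOWPRI_MAX_WAITING_MSECS 200, LOWPRI_SLEEP_INTERVAL 2) and sleep_amount of 1, max_try_num scales linearly with the number of throttled threads. A worked calculation in plain C:

#include <stdio.h>

#define LOWPRI_MAX_WAITING_MSECS 200
#define LOWPRI_SLEEP_INTERVAL 2
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
        int sleep_amount = 1;
        for (int threads = 1; threads <= 4; threads *= 2) {
                /* same formula as throttle_lowpri_io() */
                int numthreads = threads + MIN(10, MAX(1, sleep_amount)) - 1;
                int max_try_num = LOWPRI_MAX_WAITING_MSECS /
                    LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads);
                printf("%d thread(s): up to %d sleeps of %d ms (worst case %d ms)\n",
                       threads, max_try_num, LOWPRI_SLEEP_INTERVAL,
                       max_try_num * LOWPRI_SLEEP_INTERVAL);
        }
        return 0;
}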
 1027 /*
 1028  * KPI routine
 1029  *
  1030  * Set a kernel thread's I/O policy.  policy can be:
  1031  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
  1032  *
  1033  * These policies are explained in the setiopolicy_np man page.
 1034  */
 1035 void throttle_set_thread_io_policy(int policy)
 1036 {
 1037 #if !CONFIG_EMBEDDED
 1038         proc_apply_thread_selfdiskacc(policy);
 1039 #else /* !CONFIG_EMBEDDED */
 1040         struct uthread *ut;
 1041         ut = get_bsdthread_info(current_thread());
 1042         ut->uu_iopol_disk = policy;
 1043 #endif /* !CONFIG_EMBEDDED */
 1044 }
 1045 
 1046 
 1047 static
 1048 void throttle_info_reset_window(struct uthread *ut)
 1049 {
 1050         struct _throttle_io_info_t *info;
 1051 
 1052         info = ut->uu_throttle_info;
 1053 
 1054         OSDecrementAtomic(&info->numthreads_throttling);
 1055         throttle_info_rel(info);
 1056         ut->uu_throttle_info = NULL;
 1057         ut->uu_lowpri_window = 0;
 1058 }
 1059 
 1060 static
 1061 void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle)
 1062 {
 1063         SInt32 oldValue;
 1064 
 1065         ut->uu_throttle_info = info;
 1066         throttle_info_ref(info);
 1067         DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
 1068 
 1069         oldValue = OSIncrementAtomic(&info->numthreads_throttling);
 1070         if (oldValue < 0) {
 1071                 panic("%s: numthreads negative", __func__);
 1072         }
 1073         ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
 1074         ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue;
 1075         ut->uu_throttle_isssd = isssd;
 1076         ut->uu_throttle_bc = BC_throttle;
 1077 }
 1078 
 1079 
 1080 static
 1081 void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd)
 1082 {
 1083         struct _throttle_io_info_t *info = throttle_info;
 1084         struct uthread  *ut;
 1085         int policy;
 1086         int is_throttleable_io = 0;
 1087         int is_passive_io = 0;
 1088 
 1089         if (!lowpri_IO_initial_window_msecs || (info == NULL))
 1090                 return;
 1091         policy = throttle_get_io_policy(&ut);
 1092 
 1093         switch (policy) {
 1094         case IOPOL_DEFAULT:
 1095         case IOPOL_NORMAL:
 1096                 break;
 1097         case IOPOL_THROTTLE:
 1098                 is_throttleable_io = 1;
 1099                 break;
 1100         case IOPOL_PASSIVE:
 1101                 is_passive_io = 1;
 1102                 break;
 1103         default:
  1104                 printf("unknown I/O policy %d\n", policy);
 1105                 break;
 1106         }
 1107 
 1108         if (!is_throttleable_io && ISSET(flags, B_PASSIVE))
 1109                 is_passive_io |= 1;
 1110 
 1111         if (!is_throttleable_io) {
 1112                 if (!is_passive_io){
 1113                         microuptime(&info->last_normal_IO_timestamp);
 1114                 }
 1115         } else if (ut) {
 1116                 /*
 1117                  * I'd really like to do the IOSleep here, but
 1118                  * we may be holding all kinds of filesystem related locks
 1119                  * and the pages for this I/O marked 'busy'...
 1120                  * we don't want to cause a normal task to block on
 1121                  * one of these locks while we're throttling a task marked
 1122                  * for low priority I/O... we'll mark the uthread and
 1123                  * do the delay just before we return from the system
 1124                  * call that triggered this I/O or from vnode_pagein
 1125                  */
 1126                 if (ut->uu_lowpri_window == 0)
 1127                         throttle_info_set_initial_window(ut, info, isssd, FALSE);
 1128                 else {
 1129                         /* The thread sends I/Os to different devices within the same system call */
 1130                         if (ut->uu_throttle_info != info) {
 1131                                 struct _throttle_io_info_t *old_info = ut->uu_throttle_info;
 1132 
 1133                                 // keep track of the numthreads in the right device
 1134                                 OSDecrementAtomic(&old_info->numthreads_throttling);
 1135                                 OSIncrementAtomic(&info->numthreads_throttling);
 1136 
 1137                                 DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info );
 1138                                 DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info );
 1139                                 /* This thread no longer needs a reference on that throttle info */
 1140                                 throttle_info_rel(ut->uu_throttle_info);
 1141                                 ut->uu_throttle_info = info;
 1142                                 /* Need to take a reference on this throttle info */
 1143                                 throttle_info_ref(ut->uu_throttle_info);
 1144                         }
 1145                         int numthreads = MAX(1, info->numthreads_throttling);
 1146                         ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads;
 1147                         if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads)
 1148                                 ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads;
 1149 
 1150                         if (isssd == FALSE) {
 1151                                 /*
 1152                                  * we're here because we've actually issued I/Os to different devices...
  1153                                  * if at least one of them was a non SSD, then throttle the thread
 1154                                  * using the policy for non SSDs
 1155                                  */
 1156                                 ut->uu_throttle_isssd = FALSE;
 1157                         }
 1158                 }
 1159         }
 1160 }
 1161 
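The window arithmetic in throttle_info_update_internal() grows uu_lowpri_window by lowpri_IO_window_msecs_inc per throttled I/O, scaled by the thread count and capped at lowpri_max_window_msecs * numthreads. Plugging in this file's defaults (100/50/200 ms) for two throttled threads:

#include <stdio.h>

int main(void)
{
        int initial = 100, inc = 50, max = 200;        /* file defaults, ms */
        int numthreads = 2;
        /* initial window, as throttle_info_set_initial_window() computes it */
        int window = initial + inc * (numthreads - 1);

        for (int io = 0; io < 5; io++) {
                window += inc * numthreads;            /* growth per I/O */
                if (window > max * numthreads)         /* cap */
                        window = max * numthreads;
                printf("after I/O %d: window = %d ms\n", io + 1, window);
        }
        return 0;
}

The window converges to 400 ms after three I/Os in this configuration.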
 1162 /*
 1163  * KPI routine
 1164  *
  1165  * this is usually called before every I/O, used for throttled I/O
  1166  * bookkeeping.  This routine has low overhead and does not sleep.
 1167  */
 1168 void throttle_info_update(void *throttle_info, int flags)
 1169 {
 1170         throttle_info_update_internal(throttle_info, flags, FALSE);
 1171 }
 1172 
 1173 /*
 1174  * KPI routine
 1175  *
  1176  * this is usually called before every I/O, used for throttled I/O
  1177  * bookkeeping.  This routine has low overhead and does not sleep.
 1178  */
 1179 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
 1180 {
 1181         void *throttle_info = throttle_info_handle;
 1182         /* for now we only use the lowest bit of the throttle mask, so the
 1183          * handle is the same as the throttle_info.  Later if we store a
 1184          * set of throttle infos in the handle, we will want to loop through
 1185          * them and call throttle_info_update in a loop
 1186          */
 1187         throttle_info_update(throttle_info, flags);
 1188 }
 1189 
 1190 extern int ignore_is_ssd;
 1191 
 1192 int
 1193 spec_strategy(struct vnop_strategy_args *ap)
 1194 {
 1195         buf_t   bp;
 1196         int     bflags;
 1197         int     policy;
 1198         dev_t   bdev;
 1199         uthread_t ut;
 1200         mount_t mp;
 1201         int strategy_ret;
 1202         struct _throttle_io_info_t *throttle_info;
 1203         boolean_t isssd = FALSE;
 1204 
 1205         bp = ap->a_bp;
 1206         bdev = buf_device(bp);
 1207         mp = buf_vnode(bp)->v_mount;
 1208 
 1209         policy = throttle_get_io_policy(&ut);
 1210 
 1211         if (policy == IOPOL_THROTTLE) {
 1212                 bp->b_flags |= B_THROTTLED_IO;
 1213                 bp->b_flags &= ~B_PASSIVE;
 1214         } else if (policy == IOPOL_PASSIVE)
 1215                 bp->b_flags |= B_PASSIVE;
 1216 
 1217         bflags = bp->b_flags;
 1218 
 1219         if (kdebug_enable) {
 1220                 int    code = 0;
 1221 
 1222                 if (bflags & B_READ)
 1223                         code |= DKIO_READ;
 1224                 if (bflags & B_ASYNC)
 1225                         code |= DKIO_ASYNC;
 1226 
 1227                 if (bflags & B_META)
 1228                         code |= DKIO_META;
 1229                 else if (bflags & B_PAGEIO)
 1230                         code |= DKIO_PAGING;
 1231 
 1232                 if (bflags & B_THROTTLED_IO)
 1233                         code |= DKIO_THROTTLE;
 1234                 else if (bflags & B_PASSIVE)
 1235                         code |= DKIO_PASSIVE;
 1236 
 1237                 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
 1238                                       bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
 1239         }
 1240         if (((bflags & (B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
 1241             mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
 1242                 hard_throttle_on_root = 1;
 1243 
 1244         if (mp != NULL) {
 1245                 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
 1246                         isssd = TRUE;
 1247                 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
 1248         } else
 1249                 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
 1250 
 1251         throttle_info_update_internal(throttle_info, bflags, isssd);
 1252 
 1253         if ((bflags & B_READ) == 0) {
 1254                 microuptime(&throttle_info->last_IO_timestamp);
 1255                 if (mp) {
 1256                         INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
 1257                 }
 1258         } else if (mp) {
 1259                 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
 1260         }
 1261         /*
 1262          * The BootCache may give us special information about
 1263          * the IO, so it returns special values that we check
 1264          * for here.
 1265          *
 1266          * IO_SATISFIED_BY_CACHE
 1267          * The read has been satisfied by the boot cache. Don't
 1268          * throttle the thread unnecessarily.
 1269          *
 1270          * IO_SHOULD_BE_THROTTLED
 1271          * The boot cache is playing back a playlist and this IO
 1272          * cut through. Throttle it so we're not cutting through
 1273          * the boot cache too often.
 1274          *
 1275          * Note that typical strategy routines are defined with
 1276          * a void return so we'll get garbage here. In the 
 1277          * unlikely case the garbage matches our special return
 1278          * value, it's not a big deal since we're only adjusting
 1279          * the throttling delay.
 1280          */
 1281 #define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
 1282 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
 1283         typedef int strategy_fcn_ret_t(struct buf *bp);
 1284         
 1285         strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
 1286         
 1287         if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) {
 1288                 /*
 1289                  * If this was a throttled IO satisfied by the boot cache,
 1290                  * don't delay the thread.
 1291                  */
 1292                 throttle_info_reset_window(ut);
 1293 
 1294         } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) {
 1295                 /*
 1296                  * If the boot cache indicates this IO should be throttled,
 1297                  * delay the thread.
 1298                  */
 1299                 throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE);
 1300         }
 1301         return (0);
 1302 }
 1303 
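The sentinel-return trick in spec_strategy() calls a possibly-void strategy routine through an int-returning pointer and only reacts to two magic values, so stray garbage is harmless. The demo below reproduces the pattern in portable C; to stay well-defined it uses a routine that really returns int (the kernel's cast of a void function is the part a userspace program shouldn't copy):

#include <stdio.h>

#define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)

typedef int ret_fcn_t(int);

/* A "strategy routine" that reports a boot-cache hit. */
static int cached_strategy(int blkno)
{
        (void)blkno;
        return IO_SATISFIED_BY_CACHE;
}

int main(void)
{
        ret_fcn_t *strategy = cached_strategy;
        int ret = strategy(42);

        if (ret == IO_SATISFIED_BY_CACHE)
                puts("cache hit: reset the throttle window");
        else if (ret == IO_SHOULD_BE_THROTTLED)
                puts("cut through the cache: start a throttle window");
        else
                puts("ordinary return: leave throttling alone");
        return 0;
}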
 1304 
  1305 /*
  1306  * Block mapping is not supported for special files.
  1307  */
 1308 int
 1309 spec_blockmap(__unused struct vnop_blockmap_args *ap)
 1310 {
 1311         return (ENOTSUP);
 1312 }
 1313 
 1314 
 1315 /*
 1316  * Device close routine
 1317  */
 1318 int
 1319 spec_close(struct vnop_close_args *ap)
 1320 {
 1321         struct vnode *vp = ap->a_vp;
 1322         dev_t dev = vp->v_rdev;
 1323         int error = 0;
 1324         int flags = ap->a_fflag;
 1325         struct proc *p = vfs_context_proc(ap->a_context);
 1326         struct session *sessp;
 1327         int do_rele = 0;
 1328 
 1329         switch (vp->v_type) {
 1330 
 1331         case VCHR:
  1332                 /*
  1333                  * Hack: a tty device that is a controlling terminal
  1334                  * has a reference from the session structure.
  1335                  * We cannot easily tell that a character device is
  1336                  * a controlling terminal, unless it is the closing
  1337                  * process' controlling terminal.  In that case, if
  1338                  * the reference count is 1 (this is the very last
  1339                  * close), drop the session's reference to the vnode.
  1340                  */
 1341                 sessp = proc_session(p);
 1342                 if (sessp != SESSION_NULL) {
 1343                         if ((vcount(vp) == 1) && 
 1344                                 (vp == sessp->s_ttyvp)) {
 1345 
 1346                                 session_lock(sessp);
 1347                                 if (vp == sessp->s_ttyvp) {
 1348                                         sessp->s_ttyvp = NULL;
 1349                                         sessp->s_ttyvid = 0;
 1350                                         sessp->s_ttyp = TTY_NULL;
 1351                                         sessp->s_ttypgrpid = NO_PID;
 1352                                         do_rele = 1;
 1353                                 } 
 1354                                 session_unlock(sessp);
 1355 
 1356                                 if (do_rele) {
 1357                                         vnode_rele(vp);
 1358                                 }
 1359                         }
 1360                         session_rele(sessp);
 1361                 }
 1362 
 1363                 devsw_lock(dev, S_IFCHR);
 1364 
 1365                 vp->v_specinfo->si_opencount--;
 1366 
 1367                 if (vp->v_specinfo->si_opencount < 0) {
 1368                         panic("Negative open count?");
 1369                 }
 1370                 /*
 1371                  * close on last reference or on vnode revoke call
 1372                  */
 1373                 if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
 1374                         devsw_unlock(dev, S_IFCHR);
 1375                         return (0);
 1376                 }       
 1377                 
 1378                 error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
 1379 
 1380                 devsw_unlock(dev, S_IFCHR);
 1381                 break;
 1382 
 1383         case VBLK:
 1384                 /*
 1385                  * If there is more than one outstanding open, don't
 1386                  * send the close to the device.
 1387                  */
 1388                 devsw_lock(dev, S_IFBLK);
 1389                 if (vcount(vp) > 1) {
 1390                         vp->v_specinfo->si_opencount--;
 1391                         devsw_unlock(dev, S_IFBLK);
 1392                         return (0);
 1393                 }
 1394                 devsw_unlock(dev, S_IFBLK);
 1395 
 1396                 /*
 1397                  * On last close of a block device (that isn't mounted)
 1398                  * we must invalidate any in core blocks, so that
 1399                  * we can, for instance, change floppy disks.
 1400                  */
 1401                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
 1402                         return (error);
 1403 
 1404                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 1405                 if (error)
 1406                         return (error);
 1407 
 1408                 devsw_lock(dev, S_IFBLK);
 1409 
 1410                 vp->v_specinfo->si_opencount--;
 1411                 
 1412                 if (vp->v_specinfo->si_opencount < 0) {
 1413                         panic("Negative open count?");
 1414                 }
 1415 
 1416                 if (vcount(vp) > 0) {
 1417                         devsw_unlock(dev, S_IFBLK);
 1418                         return (0);
 1419                 }
 1420 
 1421                 error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
 1422 
 1423                 devsw_unlock(dev, S_IFBLK);
 1424                 break;
 1425 
 1426         default:
 1427                 panic("spec_close: not special");
 1428                 return (EBADF);
 1429         }
 1430 
 1431         return error;
 1432 }
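/*
 * Illustrative sketch, not part of the original source: the d_close
 * call that spec_close() issues above dispatches through the driver's
 * cdevsw slot.  A minimal character-device close (example_* names are
 * hypothetical) might look like this.
 */
#if 0   /* sketch only, never compiled */
static int
example_cdev_close(dev_t dev, int flags, int devtype, struct proc *p)
{
        /* Per-unit state keyed on the minor number (hypothetical). */
        struct example_softc *sc = example_softc_for(minor(dev));

        if (sc == NULL)
                return (ENXIO);
        sc->sc_open = 0;        /* spec_close() guarantees last close */
        return (0);
}
#endif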
 1433 
 1434 /*
 1435  * Return POSIX pathconf information applicable to special devices.
 1436  */
 1437 int
 1438 spec_pathconf(struct vnop_pathconf_args *ap)
 1439 {
 1440 
 1441         switch (ap->a_name) {
 1442         case _PC_LINK_MAX:
 1443                 *ap->a_retval = LINK_MAX;
 1444                 return (0);
 1445         case _PC_MAX_CANON:
 1446                 *ap->a_retval = MAX_CANON;
 1447                 return (0);
 1448         case _PC_MAX_INPUT:
 1449                 *ap->a_retval = MAX_INPUT;
 1450                 return (0);
 1451         case _PC_PIPE_BUF:
 1452                 *ap->a_retval = PIPE_BUF;
 1453                 return (0);
 1454         case _PC_CHOWN_RESTRICTED:
 1455                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
 1456                 return (0);
 1457         case _PC_VDISABLE:
 1458                 *ap->a_retval = _POSIX_VDISABLE;
 1459                 return (0);
 1460         default:
 1461                 return (EINVAL);
 1462         }
 1463         /* NOTREACHED */
 1464 }
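/*
 * Illustrative userland usage, not part of this file: the limits
 * returned by spec_pathconf() above surface through pathconf(2), for
 * example when querying a terminal's input-queue limit.
 */
#if 0   /* sketch only, never compiled */
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        long max_input = pathconf("/dev/tty", _PC_MAX_INPUT);

        if (max_input == -1)
                perror("pathconf");
        else
                printf("MAX_INPUT: %ld\n", max_input);
        return 0;
}
#endif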
 1465 
 1466 /*
 1467  * Special device "failed operation" stub: always returns EBADF.
 1468  */
 1469 int
 1470 spec_ebadf(__unused void *dummy)
 1471 {
 1472 
 1473         return (EBADF);
 1474 }
 1475 
 1476 /* Blktooff derives file offset from logical block number */
 1477 int
 1478 spec_blktooff(struct vnop_blktooff_args *ap)
 1479 {
 1480         struct vnode *vp = ap->a_vp;
 1481 
 1482         switch (vp->v_type) {
 1483         case VCHR:
 1484                 *ap->a_offset = (off_t)-1; /* failure */
 1485                 return (ENOTSUP);
 1486 
 1487         case VBLK:
 1488                 printf("spec_blktooff: not implemented for VBLK\n");
 1489                 *ap->a_offset = (off_t)-1; /* failure */
 1490                 return (ENOTSUP);
 1491 
 1492         default:
 1493                 panic("spec_blktooff type");
 1494         }
 1495         /* NOTREACHED */
 1496 
 1497         return (0);
 1498 }
 1499 
 1500 /* Offtoblk derives logical block number from file offset */
 1501 int
 1502 spec_offtoblk(struct vnop_offtoblk_args *ap)
 1503 {
 1504         struct vnode *vp = ap->a_vp;
 1505 
 1506         switch (vp->v_type) {
 1507         case VCHR:
 1508                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
 1509                 return (ENOTSUP);
 1510 
 1511         case VBLK:
 1512                 printf("spec_offtoblk: not implemented for VBLK\n");
 1513                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
 1514                 return (ENOTSUP);
 1515 
 1516         default:
 1517                 panic("spec_offtoblk type");
 1518         }
 1519         /* NOTREACHED */
 1520 
 1521         return (0);
 1522 }
 1523 
 1524 static void filt_specdetach(struct knote *kn);
 1525 static int filt_spec(struct knote *kn, long hint);
 1526 static unsigned filt_specpeek(struct knote *kn);
 1527 
 1528 struct filterops spec_filtops = {
 1529         .f_isfd         = 1,
 1530         .f_attach       = filt_specattach,
 1531         .f_detach       = filt_specdetach,
 1532         .f_event        = filt_spec,
 1533         .f_peek         = filt_specpeek
 1534 };
 1535 
 1536 static int
 1537 filter_to_seltype(int16_t filter)
 1538 {
 1539         switch (filter) {
 1540         case EVFILT_READ: 
 1541                 return FREAD;
 1542         case EVFILT_WRITE:
 1543                 return FWRITE;
 1544 
 1545         default:
 1546                 panic("filter_to_seltype(): invalid filter %d\n", filter);
 1547                 return 0;
 1548         }
 1549 }
 1550 
 1551 static int 
 1552 filt_specattach(struct knote *kn)
 1553 {
 1554         vnode_t vp;
 1555         dev_t dev;
 1556 
 1557         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
 1558 
 1559         assert(vnode_ischr(vp));
 1560 
 1561         dev = vnode_specrdev(vp);
 1562 
 1563         if (major(dev) >= nchrdev) {
 1564                 return ENXIO;
 1565         }
 1566 
 1567         if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
 1568                 return EINVAL;
 1569         }
 1570 
 1571         /* Resulting wql is safe to unlink even if it has never been linked */
 1572         kn->kn_hook = wait_queue_link_allocate();
 1573         if (kn->kn_hook == NULL) {
 1574                 return EAGAIN;
 1575         }
 1576 
 1577         kn->kn_fop = &spec_filtops;
 1578         kn->kn_hookid = vnode_vid(vp);
 1579 
 1580         knote_markstayqueued(kn);
 1581 
 1582         return 0;
 1583 }
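/*
 * Illustrative userland usage, not part of this file: registering an
 * EVFILT_READ knote on a character device is what lands in
 * filt_specattach() above.  The device path is arbitrary.
 */
#if 0   /* sketch only, never compiled */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>

int
watch_device(const char *path)
{
        struct kevent ev;
        int fd = open(path, O_RDONLY);
        int kq = kqueue();

        if (fd < 0 || kq < 0)
                return (-1);
        EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
                return (-1);
        /* Block until ready; kn_data carries the select residual. */
        return (kevent(kq, NULL, 0, &ev, 1, NULL));
}
#endif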
 1584 
 1585 static void 
 1586 filt_specdetach(struct knote *kn)
 1587 {
 1588         kern_return_t ret;
 1589 
 1590         /* 
 1591          * Given wait queue link and wait queue set, unlink.  This is subtle.
 1592          * If the device has been revoked from under us, selclearthread() will
 1593          * have removed our link from the kqueue's wait queue set, which 
 1594          * wait_queue_set_unlink_one() will detect and handle.
 1595          */
 1596         ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
 1597         if (ret != KERN_SUCCESS) {
 1598                 panic("filt_specdetach(): failed to unlink wait queue link.");
 1599         }
 1600 
 1601         (void)wait_queue_link_free(kn->kn_hook);
 1602         kn->kn_hook = NULL;
 1603         kn->kn_status &= ~KN_STAYQUEUED;
 1604 }
 1605 
 1606 static int 
 1607 filt_spec(struct knote *kn, long hint)
 1608 {
 1609         vnode_t vp;
 1610         uthread_t uth;
 1611         wait_queue_set_t old_wqs;
 1612         vfs_context_t ctx;
 1613         int selres;
 1614         int error;
 1615         int use_offset;
 1616         dev_t dev;
 1617         uint64_t flags;
 1618 
 1619         assert(kn->kn_hook != NULL);
 1620 
 1621         if (hint != 0) {
 1622                 panic("filt_spec(): nonzero hint?");
 1623         }
 1624 
 1625         uth = get_bsdthread_info(current_thread());
 1626         ctx = vfs_context_current();
 1627         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 1628 
 1629         error = vnode_getwithvid(vp, kn->kn_hookid);
 1630         if (error != 0) {
 1631                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 1632                 return 1;
 1633         }
 1634         
 1635         dev = vnode_specrdev(vp);
 1636         flags = cdevsw_flags[major(dev)];
 1637         use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
 1638         assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
 1639 
 1640         /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
 1641         old_wqs = uth->uu_wqset;
 1642         uth->uu_wqset = kn->kn_kq->kq_wqs;
 1643         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
 1644         uth->uu_wqset = old_wqs;
 1645 
 1646         if (use_offset) {
 1647                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
 1648                         kn->kn_data = 0;
 1649                 } else {
 1650                         kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
 1651                 }
 1652         } else {
 1653                 kn->kn_data = selres;
 1654         }
 1655 
 1656         vnode_put(vp);
 1657 
 1658         return (kn->kn_data != 0);
 1659 }
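/*
 * Illustrative sketch, not part of the original source: the wait-queue
 * swap in filt_spec() works because a driver's d_select routine passes
 * the wql argument straight through to selrecord(), which links it
 * into whatever set the calling thread advertises -- here the kqueue's
 * wait queue set instead of the usual select set.  example_* names are
 * hypothetical.
 */
#if 0   /* sketch only, never compiled */
static int
example_cdev_select(dev_t dev, int which, void *wql, struct proc *p)
{
        struct example_softc *sc = example_softc_for(minor(dev));

        if (which == FREAD && sc->sc_bytes_ready > 0)
                return (1);     /* ready now; no need to record */

        /* Not ready: record this thread/kqueue for a later wakeup. */
        selrecord(p, &sc->sc_selinfo, wql);
        return (0);
}
#endif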
 1660 
 1661 static unsigned
 1662 filt_specpeek(struct knote *kn)
 1663 {
 1664         vnode_t vp;
 1665         uthread_t uth;
 1666         wait_queue_set_t old_wqs;
 1667         vfs_context_t ctx;
 1668         int error, selres;
 1669         
 1670         uth = get_bsdthread_info(current_thread());
 1671         ctx = vfs_context_current();
 1672         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 1673 
 1674         error = vnode_getwithvid(vp, kn->kn_hookid);
 1675         if (error != 0) {
 1676                 return 1; /* Just like VNOP_SELECT() on recycled vnode */
 1677         }
 1678 
 1679         /*
 1680          * Why pass the link here?  Because we may not have registered in the past...
 1681          */
 1682         old_wqs = uth->uu_wqset;
 1683         uth->uu_wqset = kn->kn_kq->kq_wqs;
 1684         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
 1685         uth->uu_wqset = old_wqs;
 1686 
 1687         vnode_put(vp);
 1688         return selres;
 1689 }
 1690 
