FreeBSD/Linux Kernel Cross Reference
sys/bsd/miscfs/specfs/spec_vnops.c

/*
 * Copyright (c) 2000-2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1989, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
 */

#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf_internal.h>
#include <sys/mount_internal.h>
#include <sys/vnode_internal.h>
#include <sys/file_internal.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/malloc.h>
#include <sys/disk.h>
#include <sys/uio_internal.h>
#include <sys/resource.h>
#include <miscfs/specfs/specdev.h>
#include <vfs/vfs_support.h>
#include <kern/assert.h>
#include <kern/task.h>

#include <sys/kdebug.h>

/* XXX following prototypes should be in a header file somewhere */
extern dev_t    chrtoblk(dev_t dev);
extern int      iskmemdev(dev_t dev);
extern int      bpfkqfilter(dev_t dev, struct knote *kn);
extern int      ptsd_kqfilter(dev_t dev, struct knote *kn);

extern int ignore_is_ssd;

struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char    devopn[] = "devopn";
char    devio[] = "devio";
char    devwait[] = "devwait";
char    devin[] = "devin";
char    devout[] = "devout";
char    devioc[] = "devioc";
char    devcls[] = "devcls";

#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
        { &vnop_default_desc, (VOPFUNC)vn_default_error },
        { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
        { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
        { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
        { &vnop_open_desc, (VOPFUNC)spec_open },                /* open */
        { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
        { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
        { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
        { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
        { &vnop_read_desc, (VOPFUNC)spec_read },                /* read */
        { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
        { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
        { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
        { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
        { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
        { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
        { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
        { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
        { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
        { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
        { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
        { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
        { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
        { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
        { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
        { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
        { &vnop_strategy_desc, (VOPFUNC)spec_strategy },        /* strategy */
        { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },        /* pathconf */
        { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
        { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
        { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* pagein */
        { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* pageout */
        { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* copyfile */
        { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },        /* blktooff */
        { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },        /* offtoblk */
        { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },        /* blockmap */
        { (struct vnodeop_desc*)NULL, (int(*)())NULL }
};
struct vnodeopv_desc spec_vnodeop_opv_desc =
        { &spec_vnodeop_p, spec_vnodeop_entries };


static void set_blocksize(vnode_t, dev_t);


#define THROTTLE_LEVEL_NONE     -1
#define THROTTLE_LEVEL_TIER0     0

#define THROTTLE_LEVEL_THROTTLED 1
#define THROTTLE_LEVEL_TIER1     1
#define THROTTLE_LEVEL_TIER2     2

#define THROTTLE_LEVEL_START     0
#define THROTTLE_LEVEL_END       2
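
/*
 * Tier numbering, as used by throttle_get_thread_throttle_level() below:
 * TIER0 is normal, unthrottled I/O; any level at or above
 * THROTTLE_LEVEL_THROTTLED (TIER1 for IOPOL_UTILITY, TIER2 for
 * IOPOL_THROTTLE) is subject to the throttle windows managed in this file.
 */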


struct _throttle_io_info_t {
        struct timeval  throttle_last_IO_timestamp[THROTTLE_LEVEL_END + 1];
        struct timeval  throttle_last_write_timestamp;
        struct timeval  throttle_start_IO_period_timestamp;

        TAILQ_HEAD( , uthread) throttle_uthlist;        /* List of throttled uthreads */

        lck_mtx_t       throttle_lock;
        thread_call_t   throttle_timer_call;
        int32_t throttle_timer_running;
        int32_t throttle_io_count;
        int32_t throttle_io_count_begin;
        int32_t throttle_io_period;
        uint32_t throttle_io_period_num;
        int32_t throttle_refcnt;
        int32_t throttle_alloc;
};

struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV];

static void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd);
static int throttle_get_thread_throttle_level(uthread_t ut, int policy);

__private_extern__ int32_t throttle_legacy_process_count = 0;

/*
 * Trivial lookup routine that always fails.
 */
int
spec_lookup(struct vnop_lookup_args *ap)
{

        *ap->a_vpp = NULL;
        return (ENOTDIR);
}

static void
set_blocksize(struct vnode *vp, dev_t dev)
{
        int (*size)(dev_t);
        int rsize;

        if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
                rsize = (*size)(dev);
                if (rsize <= 0)         /* did size fail? */
                        vp->v_specsize = DEV_BSIZE;
                else
                        vp->v_specsize = rsize;
        } else
                vp->v_specsize = DEV_BSIZE;
}
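
/*
 * Example (illustrative): a block device whose bdevsw entry has no d_psize
 * routine, or whose d_psize routine reports failure, falls back to
 * DEV_BSIZE (512 bytes), so v_specsize always holds a usable sector size.
 */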

void
set_fsblocksize(struct vnode *vp)
{
        if (vp->v_type == VBLK) {
                dev_t dev = (dev_t)vp->v_rdev;
                int maj = major(dev);

                if ((u_int)maj >= (u_int)nblkdev)
                        return;

                vnode_lock(vp);
                set_blocksize(vp, dev);
                vnode_unlock(vp);
        }
}


/*
 * Open a special file.
 */
int
spec_open(struct vnop_open_args *ap)
{
        struct proc *p = vfs_context_proc(ap->a_context);
        kauth_cred_t cred = vfs_context_ucred(ap->a_context);
        struct vnode *vp = ap->a_vp;
        dev_t bdev, dev = (dev_t)vp->v_rdev;
        int maj = major(dev);
        int error;

        /*
         * Don't allow open if fs is mounted -nodev.
         */
        if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
                return (ENXIO);

        switch (vp->v_type) {

        case VCHR:
                if ((u_int)maj >= (u_int)nchrdev)
                        return (ENXIO);
                if (cred != FSCRED && (ap->a_mode & FWRITE)) {
                        /*
                         * When running in very secure mode, do not allow
                         * opens for writing of any disk character devices.
                         */
                        if (securelevel >= 2 && isdisk(dev, VCHR))
                                return (EPERM);
                        /*
                         * When running in secure mode, do not allow opens
                         * for writing of /dev/mem, /dev/kmem, or character
                         * devices whose corresponding block devices are
                         * currently mounted.
                         */
                        if (securelevel >= 1) {
                                if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
                                        return (error);
                                if (iskmemdev(dev))
                                        return (EPERM);
                        }
                }

                devsw_lock(dev, S_IFCHR);
                error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);

                if (error == 0) {
                        vp->v_specinfo->si_opencount++;
                }

                devsw_unlock(dev, S_IFCHR);

                if (error == 0 && (D_TYPEMASK & cdevsw[maj].d_type) == D_DISK && !vp->v_un.vu_specinfo->si_initted) {
                        int     isssd = 0;
                        uint64_t throttle_mask = 0;
                        uint32_t devbsdunit = 0;

                        if (VNOP_IOCTL(vp, DKIOCGETTHROTTLEMASK, (caddr_t)&throttle_mask, 0, NULL) == 0) {

                                if (throttle_mask != 0 &&
                                    VNOP_IOCTL(vp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, ap->a_context) == 0) {
                                        /*
                                         * as a reasonable approximation, only use the lowest bit of the mask
                                         * to generate a disk unit number
                                         */
                                        devbsdunit = num_trailing_0(throttle_mask);

                                        vnode_lock(vp);

                                        vp->v_un.vu_specinfo->si_isssd = isssd;
                                        vp->v_un.vu_specinfo->si_devbsdunit = devbsdunit;
                                        vp->v_un.vu_specinfo->si_throttle_mask = throttle_mask;
                                        vp->v_un.vu_specinfo->si_throttleable = 1;
                                        vp->v_un.vu_specinfo->si_initted = 1;

                                        vnode_unlock(vp);
                                }
                        }
                        if (vp->v_un.vu_specinfo->si_initted == 0) {
                                vnode_lock(vp);
                                vp->v_un.vu_specinfo->si_initted = 1;
                                vnode_unlock(vp);
                        }
                }
                return (error);

        case VBLK:
                if ((u_int)maj >= (u_int)nblkdev)
                        return (ENXIO);
                /*
                 * When running in very secure mode, do not allow
                 * opens for writing of any disk block devices.
                 */
                if (securelevel >= 2 && cred != FSCRED &&
                    (ap->a_mode & FWRITE) && isdisk(dev, VBLK))
                        return (EPERM);
                /*
                 * Do not allow opens of block devices that are
                 * currently mounted.
                 */
                if ( (error = vfs_mountedon(vp)) )
                        return (error);

                devsw_lock(dev, S_IFBLK);
                error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
                if (!error) {
                        vp->v_specinfo->si_opencount++;
                }
                devsw_unlock(dev, S_IFBLK);

                if (!error) {
                        u_int64_t blkcnt;
                        u_int32_t blksize;
                        int setsize = 0;
                        u_int32_t size512 = 512;

                        if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
                                /* Switch to 512 byte sectors (temporarily) */
                                if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
                                        /* Get the number of 512 byte physical blocks. */
                                        if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
                                                setsize = 1;
                                        }
                                }
                                /* If it doesn't set back, we can't recover */
                                if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
                                        error = ENXIO;
                        }

                        vnode_lock(vp);
                        set_blocksize(vp, dev);

                        /*
                         * Cache the size in bytes of the block device for later
                         * use by spec_write().
                         */
                        if (setsize)
                                vp->v_specdevsize = blkcnt * (u_int64_t)size512;
                        else
                                vp->v_specdevsize = (u_int64_t)0;       /* Default: Can't get */

                        vnode_unlock(vp);
                }
                return (error);
        default:
                panic("spec_open type");
        }
        return (0);
}

/*
 * Vnode op for read
 */
int
spec_read(struct vnop_read_args *ap)
{
        struct vnode *vp = ap->a_vp;
        struct uio *uio = ap->a_uio;
        struct buf *bp;
        daddr64_t bn, nextbn;
        long bsize, bscale;
        int devBlockSize = 0;
        int n, on;
        int error = 0;
        dev_t dev;

#if DIAGNOSTIC
        if (uio->uio_rw != UIO_READ)
                panic("spec_read mode");
        if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
                panic("spec_read proc");
#endif
        if (uio_resid(uio) == 0)
                return (0);

        switch (vp->v_type) {

        case VCHR:
                if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
                        struct _throttle_io_info_t *throttle_info;

                        throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

                        throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);
                }
                error = (*cdevsw[major(vp->v_rdev)].d_read)
                        (vp->v_rdev, uio, ap->a_ioflag);

                return (error);

        case VBLK:
                if (uio->uio_offset < 0)
                        return (EINVAL);

                dev = vp->v_rdev;

                devBlockSize = vp->v_specsize;

                if (devBlockSize > PAGE_SIZE)
                        return (EINVAL);

                bscale = PAGE_SIZE / devBlockSize;
                bsize = bscale * devBlockSize;

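                /*
                 * Illustrative example (assuming devBlockSize = 512 and
                 * PAGE_SIZE = 4096): bscale = 8 and bsize = 4096, so a
                 * uio_offset of 6144 yields on = 6144 % 4096 = 2048 and
                 * bn = (6144 / 512) & ~7 = 8, i.e. the copy starts 2048
                 * bytes into the page-sized buffer at device block 8.
                 */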
                do {
                        on = uio->uio_offset % bsize;

                        bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));

                        if (vp->v_speclastr + bscale == bn) {
                                nextbn = bn + bscale;
                                error = buf_breadn(vp, bn, (int)bsize, &nextbn,
                                               (int *)&bsize, 1, NOCRED, &bp);
                        } else
                                error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

                        vnode_lock(vp);
                        vp->v_speclastr = bn;
                        vnode_unlock(vp);

                        n = bsize - buf_resid(bp);
                        if ((on > n) || error) {
                                if (!error)
                                        error = EINVAL;
                                buf_brelse(bp);
                                return (error);
                        }
                        n = min((unsigned)(n - on), uio_resid(uio));

                        error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
                        if (n + on == bsize)
                                buf_markaged(bp);
                        buf_brelse(bp);
                } while (error == 0 && uio_resid(uio) > 0 && n != 0);
                return (error);

        default:
                panic("spec_read type");
        }
        /* NOTREACHED */

        return (0);
}

/*
 * Vnode op for write
 */
int
spec_write(struct vnop_write_args *ap)
{
        struct vnode *vp = ap->a_vp;
        struct uio *uio = ap->a_uio;
        struct buf *bp;
        daddr64_t bn;
        int bsize, blkmask, bscale;
        int io_sync;
        int devBlockSize = 0;
        int n, on;
        int error = 0;
        dev_t dev;

#if DIAGNOSTIC
        if (uio->uio_rw != UIO_WRITE)
                panic("spec_write mode");
        if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
                panic("spec_write proc");
#endif

        switch (vp->v_type) {

        case VCHR:
                if ((D_TYPEMASK & cdevsw[major(vp->v_rdev)].d_type) == D_DISK && vp->v_un.vu_specinfo->si_throttleable) {
                        struct _throttle_io_info_t *throttle_info;

                        throttle_info = &_throttle_io_info[vp->v_un.vu_specinfo->si_devbsdunit];

                        throttle_info_update_internal(throttle_info, NULL, -1, 0, vp->v_un.vu_specinfo->si_isssd);

                        microuptime(&throttle_info->throttle_last_write_timestamp);
                }
                error = (*cdevsw[major(vp->v_rdev)].d_write)
                        (vp->v_rdev, uio, ap->a_ioflag);

                return (error);

        case VBLK:
                if (uio_resid(uio) == 0)
                        return (0);
                if (uio->uio_offset < 0)
                        return (EINVAL);

                io_sync = (ap->a_ioflag & IO_SYNC);

                dev = (vp->v_rdev);

                devBlockSize = vp->v_specsize;
                if (devBlockSize > PAGE_SIZE)
                        return (EINVAL);

                bscale = PAGE_SIZE / devBlockSize;
                blkmask = bscale - 1;
                bsize = bscale * devBlockSize;


                do {
                        bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
                        on = uio->uio_offset % bsize;

                        n = min((unsigned)(bsize - on), uio_resid(uio));

                        /*
                         * Use buf_getblk() as an optimization IFF:
                         *
                         * 1)   We are writing exactly a block on a block
                         *      aligned boundary, so the existing contents
                         *      need not be read first
                         * 2)   We know the size of the device from spec_open
                         * 3)   The write doesn't span the end of the device
                         *
                         * Otherwise, we fall back on buf_bread().
                         */
                        if (n == bsize &&
                            vp->v_specdevsize != (u_int64_t)0 &&
                            (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
                                /* reduce the size of the write to what is there */
                                n = vp->v_specdevsize - uio->uio_offset;
                        }

                        if (n == bsize)
                                bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
                        else
                                error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

                        /* Translate downstream error for upstream, if needed */
                        if (!error)
                                error = (int)buf_error(bp);
                        if (error) {
                                buf_brelse(bp);
                                return (error);
                        }
                        n = min(n, bsize - buf_resid(bp));

                        error = uiomove((char *)buf_dataptr(bp) + on, n, uio);
                        if (error) {
                                buf_brelse(bp);
                                return (error);
                        }
                        buf_markaged(bp);

                        if (io_sync)
                                error = buf_bwrite(bp);
                        else {
                                if ((n + on) == bsize)
                                        error = buf_bawrite(bp);
                                else
                                        error = buf_bdwrite(bp);
                        }
                } while (error == 0 && uio_resid(uio) > 0 && n != 0);
                return (error);

        default:
                panic("spec_write type");
        }
        /* NOTREACHED */

        return (0);
}

/*
 * Device ioctl operation.
 */
int
spec_ioctl(struct vnop_ioctl_args *ap)
{
        proc_t p = vfs_context_proc(ap->a_context);
        dev_t dev = ap->a_vp->v_rdev;
        int     retval = 0;

        KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_START,
                              (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, (unsigned int)ap->a_vp->v_type, 0);

        switch (ap->a_vp->v_type) {

        case VCHR:
                retval = (*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
                                                       ap->a_fflag, p);
                break;

        case VBLK:
                if (kdebug_enable) {
                        if (ap->a_command == DKIOCUNMAP) {
                                dk_unmap_t      *unmap;
                                dk_extent_t     *extent;
                                uint32_t        i;

                                unmap = (dk_unmap_t *)ap->a_data;
                                extent = unmap->extents;

                                for (i = 0; i < unmap->extentsCount; i++, extent++) {
                                        KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 1) | DBG_FUNC_NONE, dev, extent->offset/ap->a_vp->v_specsize, extent->length, 0, 0);
                                }
                        }
                }
                retval = (*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data, ap->a_fflag, p);
                break;

        default:
                panic("spec_ioctl");
                /* NOTREACHED */
        }
        KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_IOCTL, 0) | DBG_FUNC_END,
                              (unsigned int)dev, (unsigned int)ap->a_command, (unsigned int)ap->a_fflag, retval, 0);

        return (retval);
}

int
spec_select(struct vnop_select_args *ap)
{
        proc_t p = vfs_context_proc(ap->a_context);
        dev_t dev;

        switch (ap->a_vp->v_type) {

        default:
                return (1);             /* XXX */

        case VCHR:
                dev = ap->a_vp->v_rdev;
                return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
        }
}

static int filt_specattach(struct knote *kn);

int
spec_kqfilter(vnode_t vp, struct knote *kn)
{
        dev_t dev;
        int err = EINVAL;

        /*
         * For a few special kinds of devices, we can attach knotes.
         * Each filter function must check whether the dev type matches it.
         */
        dev = vnode_specrdev(vp);

        if (vnode_istty(vp)) {
                /* We can hook into TTYs... */
                err = filt_specattach(kn);
        } else {
                /* Try a bpf device, as defined in bsd/net/bpf.c */
                err = bpfkqfilter(dev, kn);
        }

        return err;
}

/*
 * Synch buffers associated with a block device
 */
int
spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
{
        if (vp->v_type == VCHR)
                return (0);
        /*
         * Flush all dirty buffers associated with a block device.
         */
        buf_flushdirtyblks(vp, (waitfor == MNT_WAIT || waitfor == MNT_DWAIT), 0, "spec_fsync");

        return (0);
}

int
spec_fsync(struct vnop_fsync_args *ap)
{
        return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}


/*
 * Just call the device strategy routine
 */
extern int hard_throttle_on_root;

void throttle_init(void);


#define LOWPRI_THROTTLE_WINDOW_MSECS 500
#define LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS 200
#define LOWPRI_IO_PERIOD_MSECS 200
#define LOWPRI_IO_PERIOD_SSD_MSECS 20
#define LOWPRI_TIMER_PERIOD_MSECS 10


int     lowpri_throttle_window_msecs = LOWPRI_THROTTLE_WINDOW_MSECS;
int     lowpri_legacy_throttle_window_msecs = LOWPRI_LEGACY_THROTTLE_WINDOW_MSECS;
int     lowpri_io_period_msecs = LOWPRI_IO_PERIOD_MSECS;
int     lowpri_io_period_ssd_msecs = LOWPRI_IO_PERIOD_SSD_MSECS;
int     lowpri_timer_period_msecs = LOWPRI_TIMER_PERIOD_MSECS;

/*
 * If a process requiring legacy iothrottle behavior is running on the
 * system, use legacy limits for throttle window and max IO size.
 */
#if CONFIG_EMBEDDED
#define THROTTLE_WINDOW (lowpri_throttle_window_msecs)
#else
#define THROTTLE_WINDOW (throttle_legacy_process_count == 0 ? lowpri_throttle_window_msecs : lowpri_legacy_throttle_window_msecs)
#endif

#if 0
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)          \
        do {                                                            \
                if ((debug_info)->alloc)                                \
                        printf("%s: "format, __FUNCTION__, ## args);    \
        } while(0)

#else
#define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...)
#endif

SYSCTL_INT(_debug, OID_AUTO, lowpri_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_throttle_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_legacy_throttle_window_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_io_period_ssd_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_io_period_ssd_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_timer_period_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_timer_period_msecs, 0, "");
SYSCTL_INT(_debug, OID_AUTO, lowpri_legacy_process_count, CTLFLAG_RD | CTLFLAG_LOCKED, &throttle_legacy_process_count, 0, "");

static lck_grp_t        *throttle_mtx_grp;
static lck_attr_t       *throttle_mtx_attr;
static lck_grp_attr_t   *throttle_mtx_grp_attr;


/*
 * throttled I/O helper function:
 * convert a throttle mask to a device index by locating its lowest set bit
 */
int
num_trailing_0(uint64_t n)
{
        /*
         * since in most cases the number of trailing 0s is very small,
         * we simply count sequentially from the lowest bit
         */
        if (n == 0)
                return sizeof(n) * 8;
        int count = 0;
        while (!ISSET(n, 1)) {
                n >>= 1;
                ++count;
        }
        return count;
}
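
/*
 * Example (illustrative): a throttle_mask of 0x8 has its lowest set bit at
 * position 3, so num_trailing_0(0x8) == 3 and the device maps to
 * _throttle_io_info[3]; a zero mask returns 64, since no bit is set.
 */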


/*
 * Release the reference and if the item was allocated and this is the last
 * reference then free it.
 *
 * This routine always returns the old value.
 */
static int
throttle_info_rel(struct _throttle_io_info_t *info)
{
        SInt32 oldValue = OSDecrementAtomic(&info->throttle_refcnt);

        DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
                info, (int)(oldValue - 1), info);

        /* The reference count just went negative, very bad */
        if (oldValue == 0)
                panic("throttle info ref cnt went negative!");

        /*
         * Once reference count is zero, no one else should be able to take a
         * reference
         */
        if ((info->throttle_refcnt == 0) && (info->throttle_alloc)) {
                DEBUG_ALLOC_THROTTLE_INFO("Freeing info = %p\n", info);

                lck_mtx_destroy(&info->throttle_lock, throttle_mtx_grp);
                FREE(info, M_TEMP);
        }
        return oldValue;
}


/*
 * Just take a reference on the throttle info structure.
 *
 * This routine always returns the old value.
 */
static SInt32
throttle_info_ref(struct _throttle_io_info_t *info)
{
        SInt32 oldValue = OSIncrementAtomic(&info->throttle_refcnt);

        DEBUG_ALLOC_THROTTLE_INFO("refcnt = %d info = %p\n",
                info, (int)(oldValue - 1), info);
        /* Allocated items should never have a reference of zero */
        if (info->throttle_alloc && (oldValue == 0))
                panic("Taking a reference without calling create throttle info!\n");

        return oldValue;
}


/*
 * on entry the throttle_lock is held...
 * this function is responsible for taking
 * and dropping the reference on the info
 * structure which will keep it from going
 * away while the timer is running if it
 * happens to have been dynamically allocated by
 * a network filesystem kext which is now trying
 * to free it
 */
static uint32_t
throttle_timer_start(struct _throttle_io_info_t *info, boolean_t update_io_count)
{
        struct timeval  elapsed;
        int             elapsed_msecs;
        int             throttle_level;
        uint64_t        deadline;

        if (update_io_count == TRUE) {
                info->throttle_io_count_begin = info->throttle_io_count;
                info->throttle_io_period_num++;

                microuptime(&info->throttle_start_IO_period_timestamp);
        }
        for (throttle_level = THROTTLE_LEVEL_START; throttle_level < THROTTLE_LEVEL_END; throttle_level++) {

                microuptime(&elapsed);
                timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
                elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

                if (elapsed_msecs < THROTTLE_WINDOW) {
                        /*
                         * we had an I/O occur in this level within
                         * our throttle window, so we need to
                         * make sure the timer continues to run
                         */
                        break;
                }
        }
        if (throttle_level >= THROTTLE_LEVEL_END) {
                /*
                 * we're outside all of the throttle windows...
                 * don't start a new timer
                 */
                info->throttle_timer_running = 0;

                return (THROTTLE_LEVEL_END);
        }
        if (info->throttle_timer_running == 0) {
                /*
                 * take a reference for the timer
                 */
                throttle_info_ref(info);

                info->throttle_timer_running = 1;
        }
        clock_interval_to_deadline(lowpri_timer_period_msecs, 1000000, &deadline);

        thread_call_enter_delayed(info->throttle_timer_call, deadline);

        return (throttle_level);
}


static void
throttle_timer(struct _throttle_io_info_t *info)
{
        uthread_t       ut, utlist;
        struct timeval  elapsed;
        int             elapsed_msecs;
        int             throttle_level;
        boolean_t       update_io_count = FALSE;
        boolean_t       need_wakeup = FALSE;
        boolean_t       need_release = FALSE;

        lck_mtx_lock(&info->throttle_lock);

        microuptime(&elapsed);
        timevalsub(&elapsed, &info->throttle_start_IO_period_timestamp);
        elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

        if (elapsed_msecs >= info->throttle_io_period) {
                /*
                 * we're closing out the current IO period...
                 * if we have a waiting thread, wake it up
                 * after we have reset the I/O window info
                 */
                need_wakeup = TRUE;
                update_io_count = TRUE;
        }
        if ((throttle_level = throttle_timer_start(info, update_io_count)) == THROTTLE_LEVEL_END) {
                /*
                 * we are now outside of the throttle window
                 * for all throttle levels...
                 *
                 * the timer is not restarted in this case, so
                 * we need to get rid of the reference we took when
                 * we started up the timer... we can't do this
                 * until we are entirely done playing with 'info'
                 */
                need_release = TRUE;
        }

        TAILQ_FOREACH_SAFE(ut, &info->throttle_uthlist, uu_throttlelist, utlist) {
                /*
                 * if we are now outside of the throttle window, release
                 * all of the currently blocked threads; otherwise,
                 * look for threads that have had their IO policy changed
                 * by someone else and are no longer throttleable, or are
                 * not at the current throttle level, and unblock them
                 */
                if (throttle_level == THROTTLE_LEVEL_END || throttle_get_thread_throttle_level(ut, -1) <= throttle_level) {

                        TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
                        ut->uu_on_throttlelist = 0;

                        wakeup(&ut->uu_on_throttlelist);
                }
        }
        if (need_wakeup && !TAILQ_EMPTY(&info->throttle_uthlist)) {
                /*
                 * we've entered a new I/O period and we're still
                 * in the throttle window, so wakeup the next thread in line
                 */
                ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist);
                TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
                ut->uu_on_throttlelist = 0;

                wakeup(&ut->uu_on_throttlelist);
        }
        lck_mtx_unlock(&info->throttle_lock);

        if (need_release == TRUE)
                throttle_info_rel(info);
}


void
throttle_init(void)
{
        struct _throttle_io_info_t *info;
        int     i;

        /*
         * allocate lock group attribute and group
         */
        throttle_mtx_grp_attr = lck_grp_attr_alloc_init();
        throttle_mtx_grp = lck_grp_alloc_init("throttle I/O", throttle_mtx_grp_attr);

        /*
         * allocate the lock attribute
         */
        throttle_mtx_attr = lck_attr_alloc_init();

        for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
                info = &_throttle_io_info[i];

                lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
                info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

                TAILQ_INIT(&info->throttle_uthlist);
        }
}


/*
 * KPI routine
 *
 * wakeup and remove the specified thread from the throttle queue
 * if it's no longer in a throttleable state...
 * takes a valid uthread (which may or may not be on the
 * throttle queue) as input
 */
void
unthrottle_thread(uthread_t ut)
{
        struct _throttle_io_info_t *info;

        if ((info = ut->uu_throttle_info) == NULL)
                return;

        lck_mtx_lock(&info->throttle_lock);

        if (ut->uu_on_throttlelist && throttle_get_thread_throttle_level(ut, -1) <= THROTTLE_LEVEL_THROTTLED) {
                TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);
                ut->uu_on_throttlelist = 0;

                wakeup(&ut->uu_on_throttlelist);
        }
        lck_mtx_unlock(&info->throttle_lock);
}


/*
 * KPI routine
 *
 * Create and take a reference on a throttle info structure and return a
 * pointer for the file system to use when calling throttle_info_update.
 * Calling file system must have a matching release for every create.
 */
void *
throttle_info_create(void)
{
        struct _throttle_io_info_t *info;

        MALLOC(info, struct _throttle_io_info_t *, sizeof(*info), M_TEMP, M_ZERO | M_WAITOK);
        /* Should never happen but just in case */
        if (info == NULL)
                return NULL;
        /* Mark that this one was allocated and needs to be freed */
        DEBUG_ALLOC_THROTTLE_INFO("Creating info = %p\n", info, info);
        info->throttle_alloc = TRUE;

        lck_mtx_init(&info->throttle_lock, throttle_mtx_grp, throttle_mtx_attr);
        info->throttle_timer_call = thread_call_allocate((thread_call_func_t)throttle_timer, (thread_call_param_t)info);

        TAILQ_INIT(&info->throttle_uthlist);

        /* Take a reference */
        OSIncrementAtomic(&info->throttle_refcnt);
        return info;
}

/*
 * KPI routine
 *
 * Release the throttle info pointer if all the references are gone. Should be
 * called to release the reference taken by throttle_info_create.
 */
void
throttle_info_release(void *throttle_info)
{
        DEBUG_ALLOC_THROTTLE_INFO("Releasing info = %p\n",
                (struct _throttle_io_info_t *)throttle_info,
                (struct _throttle_io_info_t *)throttle_info);
        if (throttle_info) /* Just to be careful */
                throttle_info_rel(throttle_info);
}

/*
 * KPI routine
 *
 * File systems that create an info structure need to call this routine in
 * their mount routine (used by cluster code). File systems that call this in
 * their mount routines must call throttle_info_mount_rel in their unmount
 * routines.
 */
void
throttle_info_mount_ref(mount_t mp, void *throttle_info)
{
        if ((throttle_info == NULL) || (mp == NULL))
                return;
        throttle_info_ref(throttle_info);

        /*
         * If we already had a reference, release it before taking the new one
         */
        if (mp->mnt_throttle_info)
                throttle_info_rel(mp->mnt_throttle_info);
        mp->mnt_throttle_info = throttle_info;
}

/*
 * Private KPI routine
 *
 * return a handle for accessing throttle_info given a throttle_mask.  The
 * handle must be released by throttle_info_rel_by_mask
 */
int
throttle_info_ref_by_mask(uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle)
{
        int     dev_index;
        struct _throttle_io_info_t *info;

        if (throttle_info_handle == NULL)
                return EINVAL;

        dev_index = num_trailing_0(throttle_mask);
        info = &_throttle_io_info[dev_index];
        throttle_info_ref(info);
        *(struct _throttle_io_info_t**)throttle_info_handle = info;

        return 0;
}

/*
 * Private KPI routine
 *
 * release the handle obtained by throttle_info_ref_by_mask
 */
void
throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle)
{
        /*
         * for now the handle is just a pointer to _throttle_io_info_t
         */
        throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle);
}

/*
 * KPI routine
 *
 * File systems that call throttle_info_mount_ref must call this routine in
 * their unmount routines.
 */
void
throttle_info_mount_rel(mount_t mp)
{
        if (mp->mnt_throttle_info)
                throttle_info_rel(mp->mnt_throttle_info);
        mp->mnt_throttle_info = NULL;
}
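
/*
 * Minimal usage sketch for the KPI pairing above (illustrative only;
 * 'examplefs' and its mount/unmount hooks are hypothetical and not part
 * of this file):
 *
 *      examplefs_mount(mount_t mp):
 *              void *ti = throttle_info_create();
 *              if (ti != NULL)
 *                      throttle_info_mount_ref(mp, ti);   // mount takes its own ref
 *
 *      examplefs_unmount(mount_t mp):
 *              throttle_info_mount_rel(mp);               // drop the mount's ref
 *              throttle_info_release(ti);                 // match the create
 */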

void
throttle_info_get_last_io_time(mount_t mp, struct timeval *tv)
{
        struct _throttle_io_info_t *info;

        if (mp == NULL)
                info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
        else if (mp->mnt_throttle_info == NULL)
                info = &_throttle_io_info[mp->mnt_devbsdunit];
        else
                info = mp->mnt_throttle_info;

        *tv = info->throttle_last_write_timestamp;
}

void
update_last_io_time(mount_t mp)
{
        struct _throttle_io_info_t *info;

        if (mp == NULL)
                info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
        else if (mp->mnt_throttle_info == NULL)
                info = &_throttle_io_info[mp->mnt_devbsdunit];
        else
                info = mp->mnt_throttle_info;

        microuptime(&info->throttle_last_write_timestamp);
}


int
throttle_get_io_policy(uthread_t *ut)
{
        *ut = get_bsdthread_info(current_thread());

        return (proc_get_task_selfdiskacc());
}


static int
throttle_get_thread_throttle_level(uthread_t ut, int policy)
{
        int     thread_throttle_level = THROTTLE_LEVEL_NONE;

        if (ut == NULL)
                ut = get_bsdthread_info(current_thread());

        if (policy == -1)
                policy = proc_get_diskacc(ut->uu_thread);

        switch (policy) {

        case IOPOL_DEFAULT:
        case IOPOL_NORMAL:
                thread_throttle_level = THROTTLE_LEVEL_TIER0;
                /* FALLTHROUGH: a background-throttled thread is demoted below */
        case IOPOL_PASSIVE:
                if (ut->uu_throttle_bc == TRUE)
                        thread_throttle_level = THROTTLE_LEVEL_TIER2;
                break;
        case IOPOL_THROTTLE:
                thread_throttle_level = THROTTLE_LEVEL_TIER2;
                break;
        case IOPOL_UTILITY:
                thread_throttle_level = THROTTLE_LEVEL_TIER1;
                break;
        default:
                printf("unknown I/O policy %d\n", policy);
                break;
        }
        return (thread_throttle_level);
}
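
/*
 * Summary of the policy-to-tier mapping above: IOPOL_DEFAULT and
 * IOPOL_NORMAL run at TIER0, IOPOL_UTILITY at TIER1, and IOPOL_THROTTLE
 * at TIER2; any thread marked uu_throttle_bc is forced to TIER2 even
 * under a DEFAULT, NORMAL, or PASSIVE policy.
 */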


static int
throttle_io_will_be_throttled_internal(void * throttle_info)
{
        struct _throttle_io_info_t *info = throttle_info;
        struct timeval elapsed;
        int     elapsed_msecs;
        int     thread_throttle_level;
        int     throttle_level;

        if ((thread_throttle_level = throttle_get_thread_throttle_level(NULL, -1)) < THROTTLE_LEVEL_THROTTLED)
                return (0);

        for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {

                microuptime(&elapsed);
                timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
                elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;

                if (elapsed_msecs < THROTTLE_WINDOW)
                        break;
        }
        if (throttle_level >= thread_throttle_level) {
                /*
                 * we're beyond all of the throttle windows
                 * that affect the throttle level of this thread,
                 * so go ahead and treat as normal I/O
                 */
                return (0);
        }
        if (info->throttle_io_count != info->throttle_io_count_begin) {
                /*
                 * we've already issued at least one throttleable I/O
                 * in the current I/O window, so avoid issuing another one
                 */
                return (2);
        }
        /*
         * we're in the throttle window, so
         * cut the I/O size back
         */
        return (1);
}
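
/*
 * Return values, for callers such as throttle_lowpri_io() below:
 *   0 - not throttled: treat as normal I/O
 *   1 - inside a throttle window: cut the I/O size back
 *   2 - a throttleable I/O was already issued this period: defer
 */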

/*
 * If we have a mount point and it has a throttle info pointer then
 * use it to do the check, otherwise use the device unit number to find
 * the correct throttle info array element.
 */
int
throttle_io_will_be_throttled(__unused int lowpri_window_msecs, mount_t mp)
{
        void    *info;

        /*
         * Should we just return zero if no mount point?
         */
        if (mp == NULL)
                info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
        else if (mp->mnt_throttle_info == NULL)
                info = &_throttle_io_info[mp->mnt_devbsdunit];
        else
                info = mp->mnt_throttle_info;

        return throttle_io_will_be_throttled_internal(info);
}


uint32_t
throttle_lowpri_io(int sleep_amount)
{
        uthread_t ut;
        struct _throttle_io_info_t *info;
        int     throttle_type = 0;
        int     sleep_cnt = 0;
        int     locked = 0;
        uint32_t  throttle_io_period_num = 0;
        boolean_t insert_tail = TRUE;

        ut = get_bsdthread_info(current_thread());

        if (ut->uu_lowpri_window == 0)
                return (0);

        info = ut->uu_throttle_info;

        if ((sleep_amount == 0) || (info == NULL))
                goto done;

        if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
                sleep_amount = 0;

        throttle_io_period_num = info->throttle_io_period_num;

        while ( (throttle_type = throttle_io_will_be_throttled_internal(info)) ) {

                if (throttle_type == 1) {
                        if (sleep_amount == 0)
                                break;
                        if (info->throttle_io_period_num < throttle_io_period_num)
                                break;
                        if ((info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount)
                                break;
                }
                if (!locked) {
                        lck_mtx_lock(&info->throttle_lock);
                        locked = 1;
                }
                if (info->throttle_timer_running == 0) {
                        /*
                         * try to start the timer since it's
                         * currently not running.  on failure, no
                         * timer reference to drop since it wasn't started
                         */
                        if (throttle_timer_start(info, TRUE) == THROTTLE_LEVEL_END)
                                goto done;
                }
                if (sleep_cnt == 0) {
                        KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
                                              ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
                }
                if (ut->uu_on_throttlelist == 0) {
                        if (insert_tail == TRUE)
                                TAILQ_INSERT_TAIL(&info->throttle_uthlist, ut, uu_throttlelist);
                        else
                                TAILQ_INSERT_HEAD(&info->throttle_uthlist, ut, uu_throttlelist);

                        ut->uu_on_throttlelist = 1;
                }
                msleep((caddr_t)&ut->uu_on_throttlelist, &info->throttle_lock, PRIBIO + 1, "throttle_lowpri_io", NULL);

                sleep_cnt++;

                if (sleep_amount == 0)
                        insert_tail = FALSE;
                else if (info->throttle_io_period_num < throttle_io_period_num ||
                         (info->throttle_io_period_num - throttle_io_period_num) >= (uint32_t)sleep_amount) {
                        insert_tail = FALSE;
                        sleep_amount = 0;
                }
        }
done:
        if (ut->uu_on_throttlelist) {
                if (!locked) {
                        lck_mtx_lock(&info->throttle_lock);
                        locked = 1;
                }
                if (ut->uu_on_throttlelist) {
                        TAILQ_REMOVE(&info->throttle_uthlist, ut, uu_throttlelist);

                        ut->uu_on_throttlelist = 0;
                }
        }
        if (locked)
                lck_mtx_unlock(&info->throttle_lock);

        if (sleep_cnt)
                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
                                      ut->uu_lowpri_window, info->throttle_io_period, info->throttle_io_count, 0, 0);
        if (info)
                throttle_info_rel(info);

        ut->uu_throttle_info = NULL;
        ut->uu_throttle_bc = FALSE;
        ut->uu_lowpri_window = 0;

        return (sleep_cnt);
}
 1411 
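/*
 * The delay that throttle_info_update_internal() deliberately defers is
 * taken here; per the comment in that routine, the expected callers are
 * the system-call return path and vnode_pagein, once all filesystem
 * locks have been dropped.  A hypothetical caller sketch
 * (syscall_return_hook is an illustrative name):
 */
#if 0
static void
syscall_return_hook(void)
{
        uthread_t ut = get_bsdthread_info(current_thread());

        /* sleep for up to one throttle period if this thread was marked */
        if (ut->uu_lowpri_window)
                (void) throttle_lowpri_io(1);
}
#endif
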
 1412 /*
 1413  * KPI routine
 1414  *
 1415  * set a kernel thread's IO policy.  policy can be:
 1416  * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE
 1417  *
 1418  * these policies are explained in the setiopolicy_np(3) man page
 1419  */
 1420 void throttle_set_thread_io_policy(int policy)
 1421 {
 1422         proc_apply_thread_selfdiskacc(policy);
 1423 }
 1424 
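/*
 * The same policies are reachable from user space through
 * setiopolicy_np(3).  A minimal user-space sketch, shown here for
 * illustration only:
 */
#if 0
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
        /* throttle all disk I/O issued by the calling thread */
        if (setiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_THREAD,
                           IOPOL_THROTTLE) == -1) {
                perror("setiopolicy_np");
                return (1);
        }
        return (0);
}
#endif
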
 1425 
 1426 static
 1427 void throttle_info_reset_window(uthread_t ut)
 1428 {
 1429         struct _throttle_io_info_t *info;
 1430 
 1431         if ( (info = ut->uu_throttle_info) ) {
 1432                 throttle_info_rel(info);
 1433 
 1434                 ut->uu_throttle_info = NULL;
 1435                 ut->uu_lowpri_window = 0;
 1436                 ut->uu_throttle_bc = FALSE;
 1437         }
 1438 }
 1439 
 1440 static
 1441 void throttle_info_set_initial_window(uthread_t ut, struct _throttle_io_info_t *info, boolean_t BC_throttle)
 1442 {
 1443         if (ut->uu_throttle_info == NULL) {
 1444 
 1445                 ut->uu_throttle_info = info;
 1446                 throttle_info_ref(info);
 1447                 DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info );
 1448 
 1449                 ut->uu_lowpri_window = THROTTLE_WINDOW;
 1450                 ut->uu_throttle_bc = BC_throttle;
 1451         }
 1452 }
 1453 
 1454 
 1455 static
 1456 void throttle_info_update_internal(struct _throttle_io_info_t *info, uthread_t ut, int policy, int flags, boolean_t isssd)
 1457 {
 1458         int     thread_throttle_level;
 1459 
 1460         if (THROTTLE_WINDOW == 0)
 1461                 return;
 1462 
 1463         if (ut == NULL)
 1464                 ut = get_bsdthread_info(current_thread());
 1465 
 1466         thread_throttle_level = throttle_get_thread_throttle_level(ut, policy);
 1467 
 1468         if (thread_throttle_level == THROTTLE_LEVEL_TIER0 && ISSET(flags, B_PASSIVE))
 1469                 thread_throttle_level = THROTTLE_LEVEL_NONE;
 1470 
 1471         if (thread_throttle_level != THROTTLE_LEVEL_NONE)
 1472                 microuptime(&info->throttle_last_IO_timestamp[thread_throttle_level]);
 1473 
 1474         if (thread_throttle_level >= THROTTLE_LEVEL_THROTTLED) {
 1475                 /*
 1476                  * I'd really like to do the IOSleep here, but
 1477                  * we may be holding all kinds of filesystem related locks
 1478                  * and the pages for this I/O marked 'busy'...
 1479                  * we don't want to cause a normal task to block on
 1480                  * one of these locks while we're throttling a task marked
 1481                  * for low priority I/O... we'll mark the uthread and
 1482                  * do the delay just before we return from the system
 1483                  * call that triggered this I/O or from vnode_pagein
 1484                  */
 1485                 if (info->throttle_io_period == 0) {
 1486 
 1487                         if (isssd == TRUE)
 1488                                 info->throttle_io_period = lowpri_io_period_ssd_msecs;
 1489                         else
 1490                                 info->throttle_io_period = lowpri_io_period_msecs;
 1491 
 1492                         if (info->throttle_io_period < lowpri_timer_period_msecs)
 1493                                 info->throttle_io_period = lowpri_timer_period_msecs;
 1494                 }
 1495                 OSAddAtomic(1, &info->throttle_io_count);
 1496 
 1497                 throttle_info_set_initial_window(ut, info, FALSE);
 1498         }
 1499 }
 1500 
 1501 void throttle_info_update_by_mount(mount_t mp)
 1502 {
 1503         struct _throttle_io_info_t *info;
 1504         uthread_t ut;
 1505         boolean_t isssd = FALSE;
 1506 
 1507         ut = get_bsdthread_info(current_thread());
 1508 
 1509         if (ut->uu_lowpri_window)
 1510                 return;
 1511 
 1512         if (mp != NULL) {
 1513                 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
 1514                         isssd = TRUE;
 1515                 info = &_throttle_io_info[mp->mnt_devbsdunit];
 1516         } else
 1517                 info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
 1518 
 1519         if (info->throttle_io_period == 0) {
 1520 
 1521                 if (isssd == TRUE)
 1522                         info->throttle_io_period = lowpri_io_period_ssd_msecs;
 1523                 else
 1524                         info->throttle_io_period = lowpri_io_period_msecs;
 1525 
 1526                 if (info->throttle_io_period < lowpri_timer_period_msecs)
 1527                         info->throttle_io_period = lowpri_timer_period_msecs;
 1528         }
 1529         throttle_info_set_initial_window(ut, info, FALSE);
 1530 }
 1531 
 1532 
 1533 /*
 1534  * KPI routine
 1535  *
 1536  * this is usually called before every I/O, for throttled-I/O
 1537  * bookkeeping.  This routine has low overhead and does not sleep.
 1538  */
 1539 void throttle_info_update(void *throttle_info, int flags)
 1540 {
 1541         if (throttle_info)
 1542                 throttle_info_update_internal(throttle_info, NULL, -1, flags, FALSE);
 1543 }
 1544 
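/*
 * A typical driver-side call sequence (hypothetical sketch; mydrv_start_io
 * is an illustrative name, and the driver is assumed to have obtained its
 * handle from throttle_info_create()):
 */
#if 0
static void
mydrv_start_io(void *mydrv_throttle_info, buf_t bp)
{
        /* account this I/O against the throttle window; never sleeps */
        throttle_info_update(mydrv_throttle_info, buf_flags(bp));

        /* ... hand bp to the controller ... */
}
#endif
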
 1545 /*
 1546  * KPI routine
 1547  *
 1548  * this is usually called before every I/O, for throttled-I/O
 1549  * bookkeeping.  This routine has low overhead and does not sleep.
 1550  */
 1551 void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
 1552 {
 1553         void *throttle_info = throttle_info_handle;
 1554 
 1555         /*
 1556          * for now we only use the lowest bit of the throttle mask, so the
 1557          * handle is the same as the throttle_info.  Later, if we store a
 1558          * set of throttle infos in the handle, we will want to loop
 1559          * through them, calling throttle_info_update on each one
 1560          */
 1561         throttle_info_update(throttle_info, flags);
 1562 }
 1563 
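/*
 * If the handle ever grows to carry one throttle info per bit of the
 * throttle mask, the update above would become a loop along these lines
 * (hypothetical sketch; throttle_mask_of() is an illustrative accessor
 * that does not exist today):
 */
#if 0
void throttle_info_update_by_mask(void *throttle_info_handle, int flags)
{
        uint64_t throttle_mask = throttle_mask_of(throttle_info_handle);
        int      i;

        for (i = 0; i < LOWPRI_MAX_NUM_DEV; i++) {
                if (throttle_mask & (1ULL << i))
                        throttle_info_update(&_throttle_io_info[i], flags);
        }
}
#endif
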
 1564 
 1565 int throttle_info_io_will_be_throttled(void * throttle_info, int policy)
 1566 {
 1567         struct _throttle_io_info_t *info = throttle_info;
 1568         struct timeval elapsed;
 1569         int     elapsed_msecs;
 1570         int     throttle_level;
 1571         int     thread_throttle_level;
 1572 
 1573         switch (policy) {
 1574 
 1575         case IOPOL_THROTTLE:
 1576                 thread_throttle_level = THROTTLE_LEVEL_TIER2;
 1577                 break;
 1578         case IOPOL_UTILITY:
 1579                 thread_throttle_level = THROTTLE_LEVEL_TIER1;
 1580                 break;
 1581         default:
 1582                 thread_throttle_level = THROTTLE_LEVEL_TIER0;
 1583                 break;
 1584         }
 1585         for (throttle_level = THROTTLE_LEVEL_START; throttle_level < thread_throttle_level; throttle_level++) {
 1586 
 1587                 microuptime(&elapsed);
 1588                 timevalsub(&elapsed, &info->throttle_last_IO_timestamp[throttle_level]);
 1589                 elapsed_msecs = elapsed.tv_sec * 1000 + elapsed.tv_usec / 1000;
 1590 
 1591                 if (elapsed_msecs < THROTTLE_WINDOW)
 1592                         break;
 1593         }
 1594         if (throttle_level >= thread_throttle_level) {
 1595                 /*
 1596                  * we're beyond all of the throttle windows
 1597                  * so go ahead and treat as normal I/O
 1598                  */
 1599                 return (0);
 1600         }
 1601         /*
 1602          * we're in the throttle window
 1603          */
 1604         return (1);
 1605 }
 1606 
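/*
 * Drivers that manage their own throttle info typically use this check
 * before issuing a low-priority I/O.  Hypothetical sketch
 * (mydrv_should_defer is an illustrative name):
 */
#if 0
static boolean_t
mydrv_should_defer(void *mydrv_throttle_info)
{
        /* nonzero means a higher tier owns the current throttle window */
        return (throttle_info_io_will_be_throttled(mydrv_throttle_info,
                                                   IOPOL_THROTTLE) != 0);
}
#endif
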
 1607 void
 1608 throttle_legacy_process_incr(void)
 1609 {
 1610         OSIncrementAtomic(&throttle_legacy_process_count);
 1611 }
 1612 
 1613 void
 1614 throttle_legacy_process_decr(void)
 1615 {
 1616         OSDecrementAtomic(&throttle_legacy_process_count);
 1617 }
 1618 
 1619 
 1620 int
 1621 spec_strategy(struct vnop_strategy_args *ap)
 1622 {
 1623         buf_t   bp;
 1624         int     bflags;
 1625         int     policy;
 1626         dev_t   bdev;
 1627         uthread_t ut;
 1628         mount_t mp;
 1629         int     strategy_ret;
 1630         struct _throttle_io_info_t *throttle_info;
 1631         boolean_t isssd = FALSE;
 1632 #if !CONFIG_EMBEDDED
 1633         proc_t curproc = current_proc();
 1634 #endif /* !CONFIG_EMBEDDED */
 1635 
 1636         bp = ap->a_bp;
 1637         bdev = buf_device(bp);
 1638         mp = buf_vnode(bp)->v_mount;
 1639 
 1640         policy = throttle_get_io_policy(&ut);
 1641 
 1642         if (bp->b_flags & B_META) 
 1643                 bp->b_attr.ba_flags |= BA_META;
 1644 
 1645         if (policy == IOPOL_THROTTLE || policy == IOPOL_UTILITY) {
 1646                 bp->b_flags |= B_THROTTLED_IO;
 1647                 bp->b_attr.ba_flags |= BA_THROTTLED_IO;
 1648                 bp->b_flags &= ~B_PASSIVE;
 1649         } else if (policy == IOPOL_PASSIVE)
 1650                 bp->b_flags |= B_PASSIVE;
 1651 
 1652 #if !CONFIG_EMBEDDED
 1653         if ((curproc != NULL) && ((curproc->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP))
 1654                 bp->b_attr.ba_flags |= BA_DELAYIDLESLEEP;
 1655 #endif /* !CONFIG_EMBEDDED */
 1656                 
 1657         bflags = bp->b_flags;
 1658 
 1659         if (kdebug_enable) {
 1660                 int    code = 0;
 1661 
 1662                 if (bflags & B_READ)
 1663                         code |= DKIO_READ;
 1664                 if (bflags & B_ASYNC)
 1665                         code |= DKIO_ASYNC;
 1666 
 1667                 if (bflags & B_META)
 1668                         code |= DKIO_META;
 1669                 else if (bflags & B_PAGEIO)
 1670                         code |= DKIO_PAGING;
 1671 
 1672                 if (bflags & B_THROTTLED_IO)
 1673                         code |= DKIO_THROTTLE;
 1674                 else if (bflags & B_PASSIVE)
 1675                         code |= DKIO_PASSIVE;
 1676 
 1677                 if (bp->b_attr.ba_flags & BA_NOCACHE)
 1678                         code |= DKIO_NOCACHE;
 1679 
 1680                 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
 1681                                           bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
 1682         }
 1683         if (((bflags & (B_THROTTLED_IO | B_PASSIVE | B_IOSTREAMING | B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
 1684             mp && (mp->mnt_kern_flag & MNTK_ROOTDEV))
 1685                 hard_throttle_on_root = 1;
 1686 
 1687         if (mp != NULL) {
 1688                 if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd)
 1689                         isssd = TRUE;
 1690                 throttle_info = &_throttle_io_info[mp->mnt_devbsdunit];
 1691         } else
 1692                 throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1];
 1693 
 1694         throttle_info_update_internal(throttle_info, ut, policy, bflags, isssd);
 1695 
 1696         if ((bflags & B_READ) == 0) {
 1697                 microuptime(&throttle_info->throttle_last_write_timestamp);
 1698 
 1699                 if (mp) {
 1700                         INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size);
 1701                 }
 1702         } else if (mp) {
 1703                 INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size);
 1704         }
 1705         /*
 1706          * The BootCache may give us special information about
 1707          * the IO, so it returns special values that we check
 1708          * for here.
 1709          *
 1710          * IO_SATISFIED_BY_CACHE
 1711          * The read has been satisfied by the boot cache. Don't
 1712          * throttle the thread unnecessarily.
 1713          *
 1714          * IO_SHOULD_BE_THROTTLED
 1715          * The boot cache is playing back a playlist and this IO
 1716          * cut through. Throttle it so we're not cutting through
 1717          * the boot cache too often.
 1718          *
 1719          * Note that typical strategy routines are defined with
 1720          * a void return so we'll get garbage here. In the 
 1721          * unlikely case the garbage matches our special return
 1722          * value, it's not a big deal since we're only adjusting
 1723          * the throttling delay.
 1724          */
 1725 #define IO_SATISFIED_BY_CACHE  ((int)0xcafefeed)
 1726 #define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
 1727         typedef int strategy_fcn_ret_t(struct buf *bp);
 1728         
 1729         strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp);
 1730         
 1731         if (IO_SATISFIED_BY_CACHE == strategy_ret) {
 1732                 /*
 1733                  * If this was a throttled IO satisfied by the boot cache,
 1734                  * don't delay the thread.
 1735                  */
 1736                 throttle_info_reset_window(ut);
 1737 
 1738         } else if (IO_SHOULD_BE_THROTTLED == strategy_ret) {
 1739                 /*
 1740                  * If the boot cache indicates this IO should be throttled,
 1741                  * delay the thread.
 1742                  */
 1743                 throttle_info_set_initial_window(ut, throttle_info, TRUE);
 1744         }
 1745         return (0);
 1746 }
 1747 
 1748 
 1749 /*
 1750  * Block mapping is not supported on special devices.
 1751  */
 1752 int
 1753 spec_blockmap(__unused struct vnop_blockmap_args *ap)
 1754 {
 1755         return (ENOTSUP);
 1756 }
 1757 
 1758 
 1759 /*
 1760  * Device close routine
 1761  */
 1762 int
 1763 spec_close(struct vnop_close_args *ap)
 1764 {
 1765         struct vnode *vp = ap->a_vp;
 1766         dev_t dev = vp->v_rdev;
 1767         int error = 0;
 1768         int flags = ap->a_fflag;
 1769         struct proc *p = vfs_context_proc(ap->a_context);
 1770         struct session *sessp;
 1771         int do_rele = 0;
 1772 
 1773         switch (vp->v_type) {
 1774 
 1775         case VCHR:
 1776                 /*
 1777                  * Hack: a tty device that is a controlling terminal
 1778                  * has a reference from the session structure.  We
 1779                  * cannot easily tell that a character device is a
 1780                  * controlling terminal, unless it is the closing
 1781                  * process' controlling terminal.  In that case, if the
 1782                  * reference count is 1 (the very last close), clear the
 1783                  * session's tty fields and drop that reference below.
 1784                  */
 1785                 sessp = proc_session(p);
 1786                 if (sessp != SESSION_NULL) {
 1787                         if (vp == sessp->s_ttyvp && vcount(vp) == 1) {
 1788                                 struct tty *tp = TTY_NULL;
 1789 
 1790                                 session_lock(sessp);
 1791                                 if (vp == sessp->s_ttyvp) {
 1792                                         tp = SESSION_TP(sessp);
 1793                                         sessp->s_ttyvp = NULL;
 1794                                         sessp->s_ttyvid = 0;
 1795                                         sessp->s_ttyp = TTY_NULL;
 1796                                         sessp->s_ttypgrpid = NO_PID;
 1797                                         do_rele = 1;
 1798                                 } 
 1799                                 session_unlock(sessp);
 1800 
 1801                                 if (do_rele) {
 1802                                         vnode_rele(vp);
 1803                                         if (NULL != tp)
 1804                                                 ttyfree(tp);
 1805                                 }
 1806                         }
 1807                         session_rele(sessp);
 1808                 }
 1809 
 1810                 devsw_lock(dev, S_IFCHR);
 1811 
 1812                 if (--vp->v_specinfo->si_opencount < 0)
 1813                         panic("negative open count (c, %u, %u)", major(dev), minor(dev));
 1814 
 1815                 /*
 1816                  * close always, or close on last reference, or close on revoke
 1817                  */
 1818                 if ((D_TRACKCLOSE & cdevsw[major(dev)].d_type) != 0 ||
 1819                     vcount(vp) == 0 || (flags & IO_REVOKE) != 0)
 1820                         error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
 1821 
 1822                 devsw_unlock(dev, S_IFCHR);
 1823                 break;
 1824 
 1825         case VBLK:
 1826                 /*
 1827                  * If there is more than one outstanding open, don't
 1828                  * send the close to the device.
 1829                  */
 1830                 devsw_lock(dev, S_IFBLK);
 1831                 if (vcount(vp) > 1) {
 1832                         vp->v_specinfo->si_opencount--;
 1833                         devsw_unlock(dev, S_IFBLK);
 1834                         return (0);
 1835                 }
 1836                 devsw_unlock(dev, S_IFBLK);
 1837 
 1838                 /*
 1839                  * On last close of a block device (that isn't mounted)
 1840                  * we must invalidate any in core blocks, so that
 1841                  * we can, for instance, change floppy disks.
 1842                  */
 1843                 if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
 1844                         return (error);
 1845 
 1846                 error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
 1847                 if (error)
 1848                         return (error);
 1849 
 1850                 devsw_lock(dev, S_IFBLK);
 1851 
 1852                 if (--vp->v_specinfo->si_opencount < 0)
 1853                         panic("negative open count (b, %u, %u)", major(dev), minor(dev));
 1854 
 1855                 if (vcount(vp) == 0)
 1856                         error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p);
 1857 
 1858                 devsw_unlock(dev, S_IFBLK);
 1859                 break;
 1860 
 1861         default:
 1862                 panic("spec_close: not special");
 1863                 return (EBADF);
 1864         }
 1865 
 1866         return error;
 1867 }
 1868 
 1869 /*
 1870  * Return POSIX pathconf information applicable to special devices.
 1871  */
 1872 int
 1873 spec_pathconf(struct vnop_pathconf_args *ap)
 1874 {
 1875 
 1876         switch (ap->a_name) {
 1877         case _PC_LINK_MAX:
 1878                 *ap->a_retval = LINK_MAX;
 1879                 return (0);
 1880         case _PC_MAX_CANON:
 1881                 *ap->a_retval = MAX_CANON;
 1882                 return (0);
 1883         case _PC_MAX_INPUT:
 1884                 *ap->a_retval = MAX_INPUT;
 1885                 return (0);
 1886         case _PC_PIPE_BUF:
 1887                 *ap->a_retval = PIPE_BUF;
 1888                 return (0);
 1889         case _PC_CHOWN_RESTRICTED:
 1890                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
 1891                 return (0);
 1892         case _PC_VDISABLE:
 1893                 *ap->a_retval = _POSIX_VDISABLE;
 1894                 return (0);
 1895         default:
 1896                 return (EINVAL);
 1897         }
 1898         /* NOTREACHED */
 1899 }
 1900 
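/*
 * User-space view of the above (illustrative sketch, not part of this
 * file): pathconf(2) against a terminal device reports these limits.
 */
#if 0
#include <unistd.h>
#include <stdio.h>

int
main(void)
{
        long max_canon = pathconf("/dev/tty", _PC_MAX_CANON);

        if (max_canon == -1)
                perror("pathconf");
        else
                printf("MAX_CANON = %ld\n", max_canon);
        return (0);
}
#endif
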
 1901 /*
 1902  * Special device failed operation
 1903  */
 1904 int
 1905 spec_ebadf(__unused void *dummy)
 1906 {
 1907 
 1908         return (EBADF);
 1909 }
 1910 
 1911 /* Blktooff derives file offset from logical block number */
 1912 int
 1913 spec_blktooff(struct vnop_blktooff_args *ap)
 1914 {
 1915         struct vnode *vp = ap->a_vp;
 1916 
 1917         switch (vp->v_type) {
 1918         case VCHR:
 1919                 *ap->a_offset = (off_t)-1; /* failure */
 1920                 return (ENOTSUP);
 1921 
 1922         case VBLK:
 1923                 printf("spec_blktooff: not implemented for VBLK\n");
 1924                 *ap->a_offset = (off_t)-1; /* failure */
 1925                 return (ENOTSUP);
 1926 
 1927         default:
 1928                 panic("spec_blktooff type");
 1929         }
 1930         /* NOTREACHED */
 1931 
 1932         return (0);
 1933 }
 1934 
 1935 /* Offtoblk derives logical block number from file offset */
 1936 int
 1937 spec_offtoblk(struct vnop_offtoblk_args *ap)
 1938 {
 1939         struct vnode *vp = ap->a_vp;
 1940 
 1941         switch (vp->v_type) {
 1942         case VCHR:
 1943                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
 1944                 return (ENOTSUP);
 1945 
 1946         case VBLK:
 1947                 printf("spec_offtoblk: not implemented for VBLK\n");
 1948                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
 1949                 return (ENOTSUP);
 1950 
 1951         default:
 1952                 panic("spec_offtoblk type");
 1953         }
 1954         /* NOTREACHED */
 1955 
 1956         return (0);
 1957 }
 1958 
 1959 static void filt_specdetach(struct knote *kn);
 1960 static int filt_spec(struct knote *kn, long hint);
 1961 static unsigned filt_specpeek(struct knote *kn);
 1962 
 1963 struct filterops spec_filtops = {
 1964         .f_isfd         = 1,
 1965         .f_attach       = filt_specattach,
 1966         .f_detach       = filt_specdetach,
 1967         .f_event        = filt_spec,
 1968         .f_peek         = filt_specpeek
 1969 };
 1970 
 1971 static int
 1972 filter_to_seltype(int16_t filter)
 1973 {
 1974         switch (filter) {
 1975         case EVFILT_READ: 
 1976                 return FREAD;
 1977         case EVFILT_WRITE:
 1978                 return FWRITE;
 1979         default:
 1980                 panic("filter_to_seltype(): invalid filter %d\n", filter);
 1981                 return 0;
 1983         }
 1984 }
 1985 
 1986 static int 
 1987 filt_specattach(struct knote *kn)
 1988 {
 1989         vnode_t vp;
 1990         dev_t dev;
 1991 
 1992         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */
 1993 
 1994         assert(vnode_ischr(vp));
 1995 
 1996         dev = vnode_specrdev(vp);
 1997 
 1998         if (major(dev) >= nchrdev) {    /* valid majors are 0..nchrdev-1 */
 1999                 return ENXIO;
 2000         }
 2001 
 2002         if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) {
 2003                 return EINVAL;
 2004         }
 2005 
 2006         /* Resulting wql is safe to unlink even if it has never been linked */
 2007         kn->kn_hook = wait_queue_link_allocate();
 2008         if (kn->kn_hook == NULL) {
 2009                 return EAGAIN;
 2010         }
 2011 
 2012         kn->kn_fop = &spec_filtops;
 2013         kn->kn_hookid = vnode_vid(vp);
 2014 
 2015         knote_markstayqueued(kn);
 2016 
 2017         return 0;
 2018 }
 2019 
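/*
 * From user space this attach path runs when a kevent is registered on
 * a character-device descriptor; devices whose cdevsw entry lacks
 * CDEVSW_SELECT_KQUEUE are rejected with EINVAL.  Illustrative
 * user-space sketch (the device path is hypothetical):
 */
#if 0
#include <sys/event.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
        int kq = kqueue();
        int fd = open("/dev/example", O_RDONLY);
        struct kevent kev;

        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);

        /* fails with EINVAL if the device does not support kqueue select */
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
                perror("kevent");
        return (0);
}
#endif
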
 2020 static void 
 2021 filt_specdetach(struct knote *kn)
 2022 {
 2023         kern_return_t ret;
 2024 
 2025         /* 
 2026          * Given wait queue link and wait queue set, unlink.  This is subtle.
 2027          * If the device has been revoked from under us, selclearthread() will
 2028          * have removed our link from the kqueue's wait queue set, which 
 2029          * wait_queue_set_unlink_one() will detect and handle.
 2030          */
 2031         ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook);
 2032         if (ret != KERN_SUCCESS) {
 2033                 panic("filt_specdetach(): failed to unlink wait queue link.");
 2034         }
 2035 
 2036         (void)wait_queue_link_free(kn->kn_hook);
 2037         kn->kn_hook = NULL;
 2038         kn->kn_status &= ~KN_STAYQUEUED;
 2039 }
 2040 
 2041 static int 
 2042 filt_spec(struct knote *kn, long hint)
 2043 {
 2044         vnode_t vp;
 2045         uthread_t uth;
 2046         wait_queue_set_t old_wqs;
 2047         vfs_context_t ctx;
 2048         int selres;
 2049         int error;
 2050         int use_offset;
 2051         dev_t dev;
 2052         uint64_t flags;
 2053 
 2054         assert(kn->kn_hook != NULL);
 2055 
 2056         if (hint != 0) {
 2057                 panic("filt_spec(): nonzero hint?");
 2058         }
 2059 
 2060         uth = get_bsdthread_info(current_thread());
 2061         ctx = vfs_context_current();
 2062         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 2063 
 2064         error = vnode_getwithvid(vp, kn->kn_hookid);
 2065         if (error != 0) {
 2066                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 2067                 return 1;
 2068         }
 2069         
 2070         dev = vnode_specrdev(vp);
 2071         flags = cdevsw_flags[major(dev)];
 2072         use_offset = ((flags & CDEVSW_USE_OFFSET) != 0);
 2073         assert((flags & CDEVSW_SELECT_KQUEUE) != 0);
 2074 
 2075         /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */
 2076         old_wqs = uth->uu_wqset;
 2077         uth->uu_wqset = kn->kn_kq->kq_wqs;
 2078         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
 2079         uth->uu_wqset = old_wqs;
 2080 
 2081         if (use_offset) {
 2082                 if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) {
 2083                         kn->kn_data = 0;
 2084                 } else {
 2085                         kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset;
 2086                 }
 2087         } else {
 2088                 kn->kn_data = selres;
 2089         }
 2090 
 2091         vnode_put(vp);
 2092 
 2093         return (kn->kn_data != 0);
 2094 }
 2095 
 2096 static unsigned
 2097 filt_specpeek(struct knote *kn)
 2098 {
 2099         vnode_t vp;
 2100         uthread_t uth;
 2101         wait_queue_set_t old_wqs;
 2102         vfs_context_t ctx;
 2103         int error, selres;
 2104         
 2105         uth = get_bsdthread_info(current_thread());
 2106         ctx = vfs_context_current();
 2107         vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 2108 
 2109         error = vnode_getwithvid(vp, kn->kn_hookid);
 2110         if (error != 0) {
 2111                 return 1; /* Just like VNOP_SELECT() on recycled vnode */
 2112         }
 2113 
 2114         /*
 2115          * Why pass the link here?  Because we may not have registered in the past...
 2116          */
 2117         old_wqs = uth->uu_wqset;
 2118         uth->uu_wqset = kn->kn_kq->kq_wqs;
 2119         selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx);
 2120         uth->uu_wqset = old_wqs;
 2121 
 2122         vnode_put(vp);
 2123         return selres;
 2124 }
 2125 
