The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/bsd/miscfs/specfs/spec_vnops.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
    5  * 
    6  * This file contains Original Code and/or Modifications of Original Code
    7  * as defined in and that are subject to the Apple Public Source License
    8  * Version 2.0 (the 'License'). You may not use this file except in
    9  * compliance with the License. The rights granted to you under the License
   10  * may not be used to create, or enable the creation or redistribution of,
   11  * unlawful or unlicensed copies of an Apple operating system, or to
   12  * circumvent, violate, or enable the circumvention or violation of, any
   13  * terms of an Apple operating system software license agreement.
   14  * 
   15  * Please obtain a copy of the License at
   16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
   17  * 
   18  * The Original Code and all software distributed under the License are
   19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   23  * Please see the License for the specific language governing rights and
   24  * limitations under the License.
   25  * 
   26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
   27  */
   28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
   29 /*
   30  * Copyright (c) 1989, 1993, 1995
   31  *      The Regents of the University of California.  All rights reserved.
   32  *
   33  * Redistribution and use in source and binary forms, with or without
   34  * modification, are permitted provided that the following conditions
   35  * are met:
   36  * 1. Redistributions of source code must retain the above copyright
   37  *    notice, this list of conditions and the following disclaimer.
   38  * 2. Redistributions in binary form must reproduce the above copyright
   39  *    notice, this list of conditions and the following disclaimer in the
   40  *    documentation and/or other materials provided with the distribution.
   41  * 3. All advertising materials mentioning features or use of this software
   42  *    must display the following acknowledgement:
   43  *      This product includes software developed by the University of
   44  *      California, Berkeley and its contributors.
   45  * 4. Neither the name of the University nor the names of its contributors
   46  *    may be used to endorse or promote products derived from this software
   47  *    without specific prior written permission.
   48  *
   49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   59  * SUCH DAMAGE.
   60  *
   61  *      @(#)spec_vnops.c        8.14 (Berkeley) 5/21/95
   62  */
   63 
   64 #include <sys/param.h>
   65 #include <sys/proc_internal.h>
   66 #include <sys/kauth.h>
   67 #include <sys/systm.h>
   68 #include <sys/kernel.h>
   69 #include <sys/conf.h>
   70 #include <sys/buf_internal.h>
   71 #include <sys/mount_internal.h>
   72 #include <sys/namei.h>
   73 #include <sys/vnode_internal.h>
   74 #include <sys/stat.h>
   75 #include <sys/errno.h>
   76 #include <sys/ioctl.h>
   77 #include <sys/file.h>
   78 #include <sys/user.h>
   79 #include <sys/malloc.h>
   80 #include <sys/disk.h>
   81 #include <sys/uio_internal.h>
   82 #include <sys/resource.h>
   83 #include <miscfs/specfs/specdev.h>
   84 #include <vfs/vfs_support.h>
   85 
   86 #include <sys/kdebug.h>
   87 
/* XXX following three prototypes should be in a header file somewhere */
extern int      isdisk(dev_t dev, int type);
extern dev_t    chrtoblk(dev_t dev);
extern int      iskmemdev(dev_t dev);

/*
 * Hash buckets for special-device vnode aliases (SPECHSZ buckets;
 * presumably keyed by dev_t — declared in specfs/specdev.h, TODO confirm).
 */
struct vnode *speclisth[SPECHSZ];

/* symbolic sleep message strings for devices */
char    devopn[] = "devopn";
char    devio[] = "devio";
char    devwait[] = "devwait";
char    devin[] = "devin";
char    devout[] = "devout";
char    devioc[] = "devioc";
char    devcls[] = "devcls";
  103 
/* Common cast for storing typed vnode-op handlers in the generic table. */
#define VOPFUNC int (*)(void *)

int (**spec_vnodeop_p)(void *);

/*
 * Vnode-operation dispatch table for special (device) files.
 * Directory-style operations (create, rename, readdir, ...) are wired
 * to the err_* stubs; device-meaningful operations dispatch to the
 * spec_* implementations below.
 */
struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
        { &vnop_default_desc, (VOPFUNC)vn_default_error },
        { &vnop_lookup_desc, (VOPFUNC)spec_lookup },            /* lookup */
        { &vnop_create_desc, (VOPFUNC)err_create },             /* create */
        { &vnop_mknod_desc, (VOPFUNC)err_mknod },               /* mknod */
        { &vnop_open_desc, (VOPFUNC)spec_open },                /* open */
        { &vnop_close_desc, (VOPFUNC)spec_close },              /* close */
        { &vnop_access_desc, (VOPFUNC)spec_access },            /* access */
        { &vnop_getattr_desc, (VOPFUNC)spec_getattr },          /* getattr */
        { &vnop_setattr_desc, (VOPFUNC)spec_setattr },          /* setattr */
        { &vnop_read_desc, (VOPFUNC)spec_read },                /* read */
        { &vnop_write_desc, (VOPFUNC)spec_write },              /* write */
        { &vnop_ioctl_desc, (VOPFUNC)spec_ioctl },              /* ioctl */
        { &vnop_select_desc, (VOPFUNC)spec_select },            /* select */
        { &vnop_revoke_desc, (VOPFUNC)nop_revoke },             /* revoke */
        { &vnop_mmap_desc, (VOPFUNC)err_mmap },                 /* mmap */
        { &vnop_fsync_desc, (VOPFUNC)spec_fsync },              /* fsync */
        { &vnop_remove_desc, (VOPFUNC)err_remove },             /* remove */
        { &vnop_link_desc, (VOPFUNC)err_link },                 /* link */
        { &vnop_rename_desc, (VOPFUNC)err_rename },             /* rename */
        { &vnop_mkdir_desc, (VOPFUNC)err_mkdir },               /* mkdir */
        { &vnop_rmdir_desc, (VOPFUNC)err_rmdir },               /* rmdir */
        { &vnop_symlink_desc, (VOPFUNC)err_symlink },           /* symlink */
        { &vnop_readdir_desc, (VOPFUNC)err_readdir },           /* readdir */
        { &vnop_readlink_desc, (VOPFUNC)err_readlink },         /* readlink */
        { &vnop_inactive_desc, (VOPFUNC)nop_inactive },         /* inactive */
        { &vnop_reclaim_desc, (VOPFUNC)nop_reclaim },           /* reclaim */
        { &vnop_strategy_desc, (VOPFUNC)spec_strategy },        /* strategy */
        { &vnop_pathconf_desc, (VOPFUNC)spec_pathconf },        /* pathconf */
        { &vnop_advlock_desc, (VOPFUNC)err_advlock },           /* advlock */
        { &vnop_bwrite_desc, (VOPFUNC)spec_bwrite },            /* bwrite */
        { &vnop_pagein_desc, (VOPFUNC)err_pagein },             /* Pagein */
        { &vnop_pageout_desc, (VOPFUNC)err_pageout },           /* Pageout */
        { &vnop_copyfile_desc, (VOPFUNC)err_copyfile },         /* Copyfile */
        { &vnop_blktooff_desc, (VOPFUNC)spec_blktooff },        /* blktooff */
        { &vnop_offtoblk_desc, (VOPFUNC)spec_offtoblk },        /* offtoblk */
        { &vnop_blockmap_desc, (VOPFUNC)spec_blockmap },        /* blockmap */
        { (struct vnodeop_desc*)NULL, (int(*)())NULL }
};

/* Registration record: fills in spec_vnodeop_p from the entries above. */
struct vnodeopv_desc spec_vnodeop_opv_desc =
        { &spec_vnodeop_p, spec_vnodeop_entries };
  148 
  149 
  150 static void set_blocksize(vnode_t, dev_t);
  151 
  152 
  153 /*
  154  * Trivial lookup routine that always fails.
  155  */
  156 int
  157 spec_lookup(struct vnop_lookup_args *ap)
  158 {
  159 
  160         *ap->a_vpp = NULL;
  161         return (ENOTDIR);
  162 }
  163 
  164 static void
  165 set_blocksize(struct vnode *vp, dev_t dev)
  166 {
  167     int (*size)(dev_t);
  168     int rsize;
  169 
  170     if ((major(dev) < nblkdev) && (size = bdevsw[major(dev)].d_psize)) {
  171         rsize = (*size)(dev);
  172         if (rsize <= 0)        /* did size fail? */
  173             vp->v_specsize = DEV_BSIZE;
  174         else
  175             vp->v_specsize = rsize;
  176     }
  177     else
  178             vp->v_specsize = DEV_BSIZE;
  179 }
  180 
  181 void
  182 set_fsblocksize(struct vnode *vp)
  183 {
  184         
  185         if (vp->v_type == VBLK) {
  186                 dev_t dev = (dev_t)vp->v_rdev;
  187                 int maj = major(dev);
  188 
  189                 if ((u_int)maj >= (u_int)nblkdev)
  190                         return;
  191 
  192                 vnode_lock(vp);
  193                 set_blocksize(vp, dev);
  194                 vnode_unlock(vp);
  195         }
  196 
  197 }
  198 
  199 
/*
 * Open a special file (character or block device vnode).
 *
 * Enforces mount-option and securelevel restrictions, then forwards
 * to the driver's d_open entry point.  For block devices, on a
 * successful open the device's sector size and (when obtainable) its
 * capacity in bytes are probed via ioctl and cached on the vnode for
 * later use by spec_read()/spec_write().
 */
int
spec_open(struct vnop_open_args *ap)
{
	struct proc *p = vfs_context_proc(ap->a_context);
	kauth_cred_t cred = vfs_context_ucred(ap->a_context);
	struct vnode *vp = ap->a_vp;
	dev_t bdev, dev = (dev_t)vp->v_rdev;
	int maj = major(dev);
	int error;

	/*
	 * Don't allow open if fs is mounted -nodev.
	 */
	if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
		return (ENXIO);

	switch (vp->v_type) {

	case VCHR:
		if ((u_int)maj >= (u_int)nchrdev)
			return (ENXIO);
		/* Security checks apply only to writable, non-FS-internal opens. */
		if (cred != FSCRED && (ap->a_mode & FWRITE)) {
			/*
			 * When running in very secure mode, do not allow
			 * opens for writing of any disk character devices.
			 */
			if (securelevel >= 2 && isdisk(dev, VCHR))
				return (EPERM);
			/*
			 * When running in secure mode, do not allow opens
			 * for writing of /dev/mem, /dev/kmem, or character
			 * devices whose corresponding block devices are
			 * currently mounted.
			 */
			if (securelevel >= 1) {
				if ((bdev = chrtoblk(dev)) != NODEV && check_mountedon(bdev, VBLK, &error))
					return (error);
				if (iskmemdev(dev))
					return (EPERM);
			}
		}
		/* Tag terminal devices so they can be recognized cheaply later. */
		if (cdevsw[maj].d_type == D_TTY) {
			vnode_lock(vp);
			vp->v_flag |= VISTTY;
			vnode_unlock(vp);
		}
		error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p);
		return (error);

	case VBLK:
		if ((u_int)maj >= (u_int)nblkdev)
			return (ENXIO);
		/*
		 * When running in very secure mode, do not allow
		 * opens for writing of any disk block devices.
		 */
		if (securelevel >= 2 && cred != FSCRED &&
		    (ap->a_mode & FWRITE) && bdevsw[maj].d_type == D_DISK)
			return (EPERM);
		/*
		 * Do not allow opens of block devices that are
		 * currently mounted.
		 */
		if ( (error = vfs_mountedon(vp)) )
			return (error);
		error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p);
		if (!error) {
		    u_int64_t blkcnt;
		    u_int32_t blksize;
			int setsize = 0;
			u_int32_t size512 = 512;

		    /*
		     * Probe the device capacity: temporarily switch the
		     * device to 512-byte sectors, read the 512-byte block
		     * count, then restore the original sector size.
		     */
		    if (!VNOP_IOCTL(vp, DKIOCGETBLOCKSIZE, (caddr_t)&blksize, 0, ap->a_context)) {
				/* Switch to 512 byte sectors (temporarily) */

				if (!VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, ap->a_context)) {
				/* Get the number of 512 byte physical blocks. */
				if (!VNOP_IOCTL(vp, DKIOCGETBLOCKCOUNT, (caddr_t)&blkcnt, 0, ap->a_context)) {
						setsize = 1;
				}
				}
				/* If it doesn't set back, we can't recover */
				if (VNOP_IOCTL(vp, DKIOCSETBLOCKSIZE, (caddr_t)&blksize, FWRITE, ap->a_context))
				error = ENXIO;
		    }

			vnode_lock(vp);
		    set_blocksize(vp, dev);

		    /*
		     * Cache the size in bytes of the block device for later
		     * use by spec_write().
		     */
			if (setsize)
				vp->v_specdevsize = blkcnt * (u_int64_t)size512;
			else
			vp->v_specdevsize = (u_int64_t)0;	/* Default: Can't get */

			vnode_unlock(vp);

		}
		return(error);
	default:
		panic("spec_open type");
	}
	/* NOTREACHED: both valid cases return above; panic does not return. */
	return (0);
}
  312 
/*
 * Vnode op for read.
 *
 * VCHR: pass straight through to the driver's d_read routine.
 * VBLK: read through the buffer cache in PAGE_SIZE chunks (bsize is
 * the largest multiple of the device block size fitting in a page),
 * issuing single-block read-ahead when the access pattern is
 * sequential (tracked via v_speclastr).
 */
int
spec_read(struct vnop_read_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn, nextbn;
	long bsize, bscale;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("spec_read mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_read proc");
#endif
	if (uio_resid(uio) == 0)
		return (0);

	switch (vp->v_type) {

	case VCHR:
		/* Character devices: the driver handles the uio directly. */
		error = (*cdevsw[major(vp->v_rdev)].d_read)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio->uio_offset < 0)
			return (EINVAL);

		dev = vp->v_rdev;

		devBlockSize = vp->v_specsize;

		if (devBlockSize > PAGE_SIZE) 
			return (EINVAL);

		/* bscale: device blocks per page; bsize: I/O chunk in bytes. */
		bscale = PAGE_SIZE / devBlockSize;
		bsize = bscale * devBlockSize;

		do {
			/* on: byte offset within the current chunk. */
			on = uio->uio_offset % bsize;

			/* Chunk-aligned starting block number. */
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ (bscale - 1));
			
			/* Sequential with the previous read? Prime read-ahead. */
			if (vp->v_speclastr + bscale == bn) {
				nextbn = bn + bscale;
				error = buf_breadn(vp, bn, (int)bsize, &nextbn,
					       (int *)&bsize, 1, NOCRED, &bp);
			} else
				error = buf_bread(vp, bn, (int)bsize, NOCRED, &bp);

			vnode_lock(vp);
			vp->v_speclastr = bn;
			vnode_unlock(vp);

			/* Bytes actually valid in the buffer. */
			n = bsize - buf_resid(bp);
			if ((on > n) || error) {
				if (!error)
					error = EINVAL;
				buf_brelse(bp);
				return (error);
			}
			// LP64todo - fix this!
			n = min((unsigned)(n  - on), uio_resid(uio));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			/* Fully consumed chunk: age the buffer for early reuse. */
			if (n + on == bsize)
				buf_markaged(bp);
			buf_brelse(bp);
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_read type");
	}
	/* NOTREACHED */

	return (0);
}
  399 
/*
 * Vnode op for write.
 *
 * VCHR: pass straight through to the driver's d_write routine.
 * VBLK: write through the buffer cache in PAGE_SIZE chunks.  A full,
 * chunk-aligned write uses buf_getblk() (no read-modify-write);
 * partial chunks are read in with buf_bread() first.  Completed
 * buffers are written synchronously (IO_SYNC), asynchronously when a
 * full chunk, or delayed otherwise.
 */
int
spec_write(struct vnop_write_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct uio *uio = ap->a_uio;
	struct buf *bp;
	daddr64_t bn;
	int bsize, blkmask, bscale;
	int io_sync;
	int io_size;
	int devBlockSize=0;
	int n, on;
	int error = 0;
	dev_t dev;

#if DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("spec_write mode");
	if (UIO_SEG_IS_USER_SPACE(uio->uio_segflg))
		panic("spec_write proc");
#endif

	switch (vp->v_type) {

	case VCHR:
		error = (*cdevsw[major(vp->v_rdev)].d_write)
			(vp->v_rdev, uio, ap->a_ioflag);
		return (error);

	case VBLK:
		if (uio_resid(uio) == 0)
			return (0);
		if (uio->uio_offset < 0)
			return (EINVAL);

		io_sync = (ap->a_ioflag & IO_SYNC);
		// LP64todo - fix this!
		io_size = uio_resid(uio);

		dev = (vp->v_rdev);

		devBlockSize = vp->v_specsize;
		if (devBlockSize > PAGE_SIZE)
			return(EINVAL);

		/* bscale: device blocks per page; bsize: I/O chunk in bytes. */
		bscale = PAGE_SIZE / devBlockSize;
		blkmask = bscale - 1;
		bsize = bscale * devBlockSize;
		

		do {
			/* Chunk-aligned block number and intra-chunk offset. */
			bn = (daddr64_t)((uio->uio_offset / devBlockSize) &~ blkmask);
			on = uio->uio_offset % bsize;

			// LP64todo - fix this!
			n = min((unsigned)(bsize - on), uio_resid(uio));

			/*
			 * Use buf_getblk() as an optimization IFF:
			 *
			 * 1)	We are reading exactly a block on a block
			 *	aligned boundary
			 * 2)	We know the size of the device from spec_open
			 * 3)	The read doesn't span the end of the device
			 *
			 * Otherwise, we fall back on buf_bread().
			 */
			if (n == bsize &&
			    vp->v_specdevsize != (u_int64_t)0 &&
			    (uio->uio_offset + (u_int64_t)n) > vp->v_specdevsize) {
			    /* reduce the size of the read to what is there */
			    /*
			     * NOTE(review): this assigns the amount of the
			     * overrun (offset + n - devsize), not the bytes
			     * remaining (devsize - offset).  Looks suspicious
			     * — confirm intended semantics before relying on
			     * end-of-device writes.
			     */
			    n = (uio->uio_offset + (u_int64_t)n) - vp->v_specdevsize;
			}

			if (n == bsize)
				bp = buf_getblk(vp, bn, bsize, 0, 0, BLK_WRITE);
			else
				error = (int)buf_bread(vp, bn, bsize, NOCRED, &bp);

			/* Translate downstream error for upstream, if needed */
			if (!error)
				error = (int)buf_error(bp);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			/* Clamp to the bytes actually valid in the buffer. */
			n = min(n, bsize - buf_resid(bp));

			error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio);
			if (error) {
				buf_brelse(bp);
				return (error);
			}
			buf_markaged(bp);

			/* Sync write, async full chunk, or delayed partial. */
			if (io_sync) 
				error = buf_bwrite(bp);
			else {
				if ((n + on) == bsize)
					error = buf_bawrite(bp);
				else
					error = buf_bdwrite(bp);
			}
		} while (error == 0 && uio_resid(uio) > 0 && n != 0);
		return (error);

	default:
		panic("spec_write type");
	}
	/* NOTREACHED */

	return (0);
}
  516 
  517 /*
  518  * Device ioctl operation.
  519  */
  520 int
  521 spec_ioctl(struct vnop_ioctl_args *ap)
  522 {
  523         proc_t p = vfs_context_proc(ap->a_context);
  524         dev_t dev = ap->a_vp->v_rdev;
  525 
  526         switch (ap->a_vp->v_type) {
  527 
  528         case VCHR:
  529                 return ((*cdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
  530                     ap->a_fflag, p));
  531 
  532         case VBLK:
  533                 if (ap->a_command == 0 && (unsigned int)ap->a_data == B_TAPE) {
  534                         if (bdevsw[major(dev)].d_type == D_TAPE)
  535                                 return (0);
  536                         else
  537                                 return (1);
  538                 }
  539                 return ((*bdevsw[major(dev)].d_ioctl)(dev, ap->a_command, ap->a_data,
  540                    ap->a_fflag, p));
  541 
  542         default:
  543                 panic("spec_ioctl");
  544                 /* NOTREACHED */
  545         }
  546         return (0);
  547 }
  548 
  549 int
  550 spec_select(struct vnop_select_args *ap)
  551 {
  552         proc_t p = vfs_context_proc(ap->a_context);
  553         dev_t dev;
  554 
  555         switch (ap->a_vp->v_type) {
  556 
  557         default:
  558                 return (1);             /* XXX */
  559 
  560         case VCHR:
  561                 dev = ap->a_vp->v_rdev;
  562                 return (*cdevsw[major(dev)].d_select)(dev, ap->a_which, ap->a_wql, p);
  563         }
  564 }
  565 
  566 /*
  567  * Synch buffers associated with a block device
  568  */
  569 int
  570 spec_fsync_internal(vnode_t vp, int waitfor, __unused vfs_context_t context)
  571 {
  572         if (vp->v_type == VCHR)
  573                 return (0);
  574         /*
  575          * Flush all dirty buffers associated with a block device.
  576          */
  577         buf_flushdirtyblks(vp, waitfor == MNT_WAIT, 0, "spec_fsync");
  578 
  579         return (0);
  580 }
  581 
/* VNOP entry point: unpack the args struct and defer to the internal helper. */
int
spec_fsync(struct vnop_fsync_args *ap)
{
        return spec_fsync_internal(ap->a_vp, ap->a_waitfor, ap->a_context);
}
  587 
  588 /*
  589  * Just call the device strategy routine
  590  */
  591 extern int hard_throttle_on_root;
  592 void IOSleep(int);
  593 extern void throttle_lowpri_io(int *lowpri_window,mount_t v_mount);
  594 
  595 // the low priority process may wait for at most LOWPRI_MAX_DELAY millisecond
  596 #define LOWPRI_INITIAL_WINDOW_MSECS 100
  597 #define LOWPRI_WINDOW_MSECS_INC 50
  598 #define LOWPRI_MAX_WINDOW_MSECS 200
  599 #define LOWPRI_MAX_WAITING_MSECS 200
  600 #define LOWPRI_SLEEP_INTERVAL 5
  601 
  602 int     lowpri_IO_initial_window_msecs  = LOWPRI_INITIAL_WINDOW_MSECS;
  603 int     lowpri_IO_window_msecs_inc  = LOWPRI_WINDOW_MSECS_INC;
  604 int     lowpri_max_window_msecs  = LOWPRI_MAX_WINDOW_MSECS;
  605 int     lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS;
  606 
  607 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  608 SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, "");
  609 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  610 SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, "");
  611 
/*
 * Delay a low-priority thread whose I/O window is open.
 *
 * Sleeps in LOWPRI_SLEEP_INTERVAL-millisecond increments (at most
 * lowpri_max_waiting_msecs total) until at least *lowpri_window
 * milliseconds have elapsed since the mount's last normal-priority
 * I/O.  Clears *lowpri_window before returning.  Called from the
 * system-call return path rather than at I/O issue time (see the
 * comment in spec_strategy about lock/page-busy hazards).
 */
void throttle_lowpri_io(int *lowpri_window,mount_t v_mount)
{
        int i;
        struct timeval last_lowpri_IO_timestamp,last_normal_IO_timestamp;
        struct timeval elapsed;
        int lowpri_IO_window_msecs;
        struct timeval lowpri_IO_window;
        int max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL;

        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START,
                     *lowpri_window, 0, 0, 0, 0);

        /* Snapshot once; spec_strategy updates this on normal I/O. */
        last_normal_IO_timestamp = v_mount->last_normal_IO_timestamp;
                        
        for (i=0; i<max_try_num; i++) {
                microuptime(&last_lowpri_IO_timestamp);

                /* elapsed = now - last normal-priority I/O */
                elapsed = last_lowpri_IO_timestamp;
                timevalsub(&elapsed, &last_normal_IO_timestamp);

                /* Re-read the window each pass: it is externally tunable. */
                lowpri_IO_window_msecs = *lowpri_window;
                lowpri_IO_window.tv_sec  = lowpri_IO_window_msecs / 1000;
                lowpri_IO_window.tv_usec = (lowpri_IO_window_msecs % 1000) * 1000;

                if (timevalcmp(&elapsed, &lowpri_IO_window, <)) {
                        IOSleep(LOWPRI_SLEEP_INTERVAL);
                } else {
                        break;
                }
        }

        /* i*5 = milliseconds actually slept (LOWPRI_SLEEP_INTERVAL per pass). */
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END,
                     *lowpri_window, i*5, 0, 0, 0);
        *lowpri_window = 0;
}
  647 
/*
 * Strategy routine for special files: emit kdebug trace info, apply
 * root-device paging throttling, account for the per-thread
 * low-priority I/O window, then hand the buffer to the block
 * driver's d_strategy routine.  Always returns 0; I/O status is
 * reported through the buffer.
 */
int
spec_strategy(struct vnop_strategy_args *ap)
{
        buf_t   bp;
        int     bflags;
        dev_t   bdev;

        bp = ap->a_bp;
        bdev = buf_device(bp);
        bflags = buf_flags(bp);

        /* Classify the I/O for the kdebug disk-I/O tracepoint. */
        if (kdebug_enable) {
                int    code = 0;

                if (bflags & B_READ)
                        code |= DKIO_READ;
                if (bflags & B_ASYNC)
                        code |= DKIO_ASYNC;

                if (bflags & B_META)
                        code |= DKIO_META;
                else if (bflags & B_PAGEIO)
                        code |= DKIO_PAGING;

                KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
                                      (unsigned int)bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0);
        }
        /* Page-in from the root device: raise the global throttle flag. */
        if (((bflags & (B_PAGEIO | B_READ)) == (B_PAGEIO | B_READ)) &&
            (buf_vnode(bp)->v_mount->mnt_kern_flag & MNTK_ROOTDEV))
                hard_throttle_on_root = 1;

        /* Low-priority I/O throttling (disabled when window msecs is 0). */
        if (lowpri_IO_initial_window_msecs) {
                proc_t  p;
                struct uthread  *ut;
                int policy = IOPOL_DEFAULT;
                int is_throttleable_io = 0;
                int is_passive_io = 0;
                p = current_proc();
                ut = get_bsdthread_info(current_thread());
                
                if (p != NULL)
                        policy = p->p_iopol_disk;

                if (ut != NULL) {
                        // the I/O policy of the thread overrides that of the process
                        // unless the I/O policy of the thread is default
                        if (ut->uu_iopol_disk != IOPOL_DEFAULT)
                                policy = ut->uu_iopol_disk;
                }

                switch (policy) {
                case IOPOL_DEFAULT:
                case IOPOL_NORMAL:
                        break;
                case IOPOL_THROTTLE:
                        is_throttleable_io = 1;
                        break;
                case IOPOL_PASSIVE:
                        is_passive_io = 1;
                        break;
                default:
                        printf("unknown I/O policy %d", policy);
                        break;
                }

                /* B_PASSIVE I/O never updates the normal-I/O timestamp. */
                if (!is_throttleable_io && ISSET(bflags, B_PASSIVE))
                    is_passive_io |= 1;

                if (!is_throttleable_io) {
                        /* Normal I/O: record it so low-priority I/O backs off. */
                        if (!is_passive_io && buf_vnode(bp)->v_mount != NULL){
                                microuptime(&(buf_vnode(bp)->v_mount->last_normal_IO_timestamp));
                        }
                } else {
                        /*
                         * I'd really like to do the IOSleep here, but
                         * we may be holding all kinds of filesystem related locks
                         * and the pages for this I/O marked 'busy'...
                         * we don't want to cause a normal task to block on
                         * one of these locks while we're throttling a task marked
                         * for low priority I/O... we'll mark the uthread and
                         * do the delay just before we return from the system
                         * call that triggered this I/O or from vnode_pagein
                         */
                        /*
                         * NOTE(review): ut is NULL-checked above for the
                         * policy read but dereferenced unconditionally here;
                         * presumably get_bsdthread_info() cannot return NULL
                         * — confirm.
                         */
                        if(buf_vnode(bp)->v_mount != NULL)
                                ut->v_mount = buf_vnode(bp)->v_mount;
                        if (ut->uu_lowpri_window == 0) {
                                ut->uu_lowpri_window = lowpri_IO_initial_window_msecs;
                        } else {
                                /* Repeated throttled I/O widens the window, capped. */
                                ut->uu_lowpri_window += lowpri_IO_window_msecs_inc;
                                if (ut->uu_lowpri_window > lowpri_max_window_msecs)
                                        ut->uu_lowpri_window = lowpri_max_window_msecs;
                        }
                }
        }
        (*bdevsw[major(bdev)].d_strategy)(bp);

        return (0);
}
  746 
  747 
  748 /*
  749  * This is a noop, simply returning what one has been given.
  750  */
  751 int
  752 spec_blockmap(__unused struct vnop_blockmap_args *ap)
  753 {
  754         return (ENOTSUP);
  755 }
  756 
  757 
  758 /*
  759  * Device close routine
  760  */
int
spec_close(struct vnop_close_args *ap)
{
	struct vnode *vp = ap->a_vp;
	dev_t dev = vp->v_rdev;
	/* Device switch close entry point, selected per vnode type below. */
	int (*devclose)(dev_t, int, int, struct proc *);
	int mode, error;
	int flags = ap->a_fflag;
	struct proc *p = vfs_context_proc(ap->a_context);
	struct session *sessp;

	switch (vp->v_type) {

	case VCHR:
		/*
		 * Hack: a tty device that is a controlling terminal
		 * has a reference from the session structure.
		 * We cannot easily tell that a character device is
		 * a controlling terminal, unless it is the closing
		 * process' controlling terminal.  In that case,
		 * if the reference count is 2 (this last descriptor
		 * plus the session), release the reference from the session.
		 */
		sessp = proc_session(p);
		if (sessp != SESSION_NULL) {
			if ((vcount(vp) == 2) && 
				(vp == sessp->s_ttyvp)) {
				/* Clear all controlling-tty state under the session lock,
				 * then drop the session's vnode reference outside it. */
				session_lock(sessp);
				sessp->s_ttyvp = NULL;
				sessp->s_ttyvid = 0;
				sessp->s_ttyp = NULL;
				sessp->s_ttypgrpid = NO_PID;
				session_unlock(sessp);
				vnode_rele(vp);
			}
			session_rele(sessp);
		}

		devclose = cdevsw[major(dev)].d_close;
		mode = S_IFCHR;
		/*
		 * close on last reference or on vnode revoke call
		 */
		/* A revoke always reaches the driver's d_close, regardless
		 * of how many other references are still outstanding. */
		if ((flags & IO_REVOKE) != 0)
			break;
		/* Not the last reference: skip the driver close entirely. */
		if (vcount(vp) > 1)
			return (0);
		break;

	case VBLK:
#ifdef DEVFS_IMPLEMENTS_LOCKING
		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		/* With devfs locking, flush and invalidate BEFORE the
		 * last-reference check (order differs from the #else arm). */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);
		/*
		 * Since every use (buffer, vnode, swap, blockmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		if (vcount(vp) > 1)
			return (0);
#else /* DEVFS_IMPLEMENTS_LOCKING */
		/*
		 * Since every use (buffer, vnode, swap, blockmap)
		 * holds a reference to the vnode, and because we mark
		 * any other vnodes that alias this device, when the
		 * sum of the reference counts on all the aliased
		 * vnodes descends to one, we are on last close.
		 */
		if (vcount(vp) > 1)
			return (0);

		/*
		 * On last close of a block device (that isn't mounted)
		 * we must invalidate any in core blocks, so that
		 * we can, for instance, change floppy disks.
		 */
		if ((error = spec_fsync_internal(vp, MNT_WAIT, ap->a_context)))
			return (error);

		error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0);
		if (error)
			return (error);
#endif /* DEVFS_IMPLEMENTS_LOCKING */
		devclose = bdevsw[major(dev)].d_close;
		mode = S_IFBLK;
		break;

	default:
		panic("spec_close: not special");
		/* NOTREACHED unless panic returns */
		return(EBADF);
	}

	/* Hand the close to the driver for the selected device class. */
	return ((*devclose)(dev, flags, mode, p));
}
  866 
  867 /*
  868  * Return POSIX pathconf information applicable to special devices.
  869  */
  870 int
  871 spec_pathconf(struct vnop_pathconf_args *ap)
  872 {
  873 
  874         switch (ap->a_name) {
  875         case _PC_LINK_MAX:
  876                 *ap->a_retval = LINK_MAX;
  877                 return (0);
  878         case _PC_MAX_CANON:
  879                 *ap->a_retval = MAX_CANON;
  880                 return (0);
  881         case _PC_MAX_INPUT:
  882                 *ap->a_retval = MAX_INPUT;
  883                 return (0);
  884         case _PC_PIPE_BUF:
  885                 *ap->a_retval = PIPE_BUF;
  886                 return (0);
  887         case _PC_CHOWN_RESTRICTED:
  888                 *ap->a_retval = 200112;         /* _POSIX_CHOWN_RESTRICTED */
  889                 return (0);
  890         case _PC_VDISABLE:
  891                 *ap->a_retval = _POSIX_VDISABLE;
  892                 return (0);
  893         default:
  894                 return (EINVAL);
  895         }
  896         /* NOTREACHED */
  897 }
  898 
  899 /*
  900  * Special device failed operation
  901  */
  902 int
  903 spec_ebadf(__unused void *dummy)
  904 {
  905 
  906         return (EBADF);
  907 }
  908 
  909 /* Blktooff derives file offset from logical block number */
  910 int
  911 spec_blktooff(struct vnop_blktooff_args *ap)
  912 {
  913         struct vnode *vp = ap->a_vp;
  914 
  915         switch (vp->v_type) {
  916         case VCHR:
  917                 *ap->a_offset = (off_t)-1; /* failure */
  918                 return (ENOTSUP);
  919 
  920         case VBLK:
  921                 printf("spec_blktooff: not implemented for VBLK\n");
  922                 *ap->a_offset = (off_t)-1; /* failure */
  923                 return (ENOTSUP);
  924 
  925         default:
  926                 panic("spec_blktooff type");
  927         }
  928         /* NOTREACHED */
  929 
  930         return (0);
  931 }
  932 
  933 /* Offtoblk derives logical block number from file offset */
  934 int
  935 spec_offtoblk(struct vnop_offtoblk_args *ap)
  936 {
  937         struct vnode *vp = ap->a_vp;
  938 
  939         switch (vp->v_type) {
  940         case VCHR:
  941                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
  942                 return (ENOTSUP);
  943 
  944         case VBLK:
  945                 printf("spec_offtoblk: not implemented for VBLK\n");
  946                 *ap->a_lblkno = (daddr64_t)-1; /* failure */
  947                 return (ENOTSUP);
  948 
  949         default:
  950                 panic("spec_offtoblk type");
  951         }
  952         /* NOTREACHED */
  953 
  954         return (0);
  955 }

Cache object: cfffea9c6c212d7810a6117c590a3a21


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.