FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_vnode.c


    1 /*      $OpenBSD: uvm_vnode.c,v 1.131 2022/12/08 21:32:48 kettenis Exp $        */
    2 /*      $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $       */
    3 
    4 /*
    5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    6  * Copyright (c) 1991, 1993
    7  *      The Regents of the University of California.
    8  * Copyright (c) 1990 University of Utah.
    9  *
   10  * All rights reserved.
   11  *
   12  * This code is derived from software contributed to Berkeley by
   13  * the Systems Programming Group of the University of Utah Computer
   14  * Science Department.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  * 3. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *      @(#)vnode_pager.c       8.8 (Berkeley) 2/13/94
   41  * from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp
   42  */
   43 
   44 /*
   45  * uvm_vnode.c: the vnode pager.
   46  */
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/proc.h>
   51 #include <sys/malloc.h>
   52 #include <sys/vnode.h>
   53 #include <sys/lock.h>
   54 #include <sys/disklabel.h>
   55 #include <sys/fcntl.h>
   56 #include <sys/conf.h>
   57 #include <sys/rwlock.h>
   58 #include <sys/dkio.h>
   59 #include <sys/specdev.h>
   60 
   61 #include <uvm/uvm.h>
   62 #include <uvm/uvm_vnode.h>
   63 
   64 /*
   65  * private global data structure
   66  *
   67  * we keep a list of writeable active vnode-backed VM objects for sync op.
   68  * we keep a simpleq of vnodes that are currently being sync'd.
   69  */
   70 
   71 LIST_HEAD(, uvm_vnode)          uvn_wlist;      /* [K] writeable uvns */
   72 SIMPLEQ_HEAD(, uvm_vnode)       uvn_sync_q;     /* [S] sync'ing uvns */
   73 struct rwlock uvn_sync_lock;                    /* locks sync operation */
   74 
   75 extern int rebooting;
   76 
   77 /*
   78  * functions
   79  */
   80 void             uvn_cluster(struct uvm_object *, voff_t, voff_t *, voff_t *);
   81 void             uvn_detach(struct uvm_object *);
   82 boolean_t        uvn_flush(struct uvm_object *, voff_t, voff_t, int);
   83 int              uvn_get(struct uvm_object *, voff_t, vm_page_t *, int *, int,
   84                      vm_prot_t, int, int);
   85 void             uvn_init(void);
   86 int              uvn_io(struct uvm_vnode *, vm_page_t *, int, int, int);
   87 int              uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
   88 void             uvn_reference(struct uvm_object *);
   89 
   90 /*
   91  * master pager structure
   92  */
   93 const struct uvm_pagerops uvm_vnodeops = {
   94         .pgo_init = uvn_init,
   95         .pgo_reference = uvn_reference,
   96         .pgo_detach = uvn_detach,
   97         .pgo_flush = uvn_flush,
   98         .pgo_get = uvn_get,
   99         .pgo_put = uvn_put,
  100         .pgo_cluster = uvn_cluster,
  101         /* use generic version of this: see uvm_pager.c */
  102         .pgo_mk_pcluster = uvm_mk_pcluster,
  103 };
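
/*
 * dispatch sketch (illustrative, hedged; the callers live outside this
 * file): the rest of UVM reaches the vnode pager only through the ops
 * pointer in the uvm_object, roughly
 *
 *	if (uobj->pgops->pgo_flush != NULL)
 *		(void) uobj->pgops->pgo_flush(uobj, start, stop,
 *		    PGO_CLEANIT|PGO_SYNCIO);
 *
 * so uvm_vnodeops above is the single binding between a vnode-backed
 * uvm_object and the uvn_* functions declared in this file.
 */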
  104 
  105 /*
  106  * the ops!
  107  */
  108 /*
  109  * uvn_init
  110  *
  111  * init pager private data structures.
  112  */
  113 void
  114 uvn_init(void)
  115 {
  116 
  117         LIST_INIT(&uvn_wlist);
  118         /* note: uvn_sync_q init'd in uvm_vnp_sync() */
  119         rw_init_flags(&uvn_sync_lock, "uvnsync", RWL_IS_VNODE);
  120 }
  121 
  122 /*
  123  * uvn_attach
  124  *
  125  * attach a vnode structure to a VM object.  if the vnode is already
  126  * attached, then just bump the reference count by one and return the
  127  * VM object.   if not already attached, attach and return the new VM obj.
  128  * the "accessprot" tells the max access the attaching thread wants
  129  * to have to our pages.
  130  *
  131  * => in fact, nothing should be locked so that we can sleep here.
  132  * => note that uvm_object is first thing in vnode structure, so their
  133  *    pointers are equiv.
  134  */
  135 struct uvm_object *
  136 uvn_attach(struct vnode *vp, vm_prot_t accessprot)
  137 {
  138         struct uvm_vnode *uvn = vp->v_uvm;
  139         struct vattr vattr;
  140         int oldflags, result;
  141         struct partinfo pi;
  142         u_quad_t used_vnode_size = 0;
  143 
  144         /* if we're mapping a BLK device, make sure it is a disk. */
  145         if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) {
  146                 return NULL;
  147         }
  148 
  149         /* first get a lock on the uvn. */
  150         rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
  151         while (uvn->u_flags & UVM_VNODE_BLOCKED) {
  152                 uvn->u_flags |= UVM_VNODE_WANTED;
  153                 rwsleep_nsec(uvn, uvn->u_obj.vmobjlock, PVM, "uvn_attach",
  154                     INFSLP);
  155         }
  156 
  157         /*
  158          * now uvn must not be in a blocked state.
  159          * first check to see if it is already active, in which case
  160          * we can bump the reference count, check to see if we need to
  161          * add it to the writeable list, and then return.
  162          */
  163         if (uvn->u_flags & UVM_VNODE_VALID) {   /* already active? */
  164 
  165                 /* regain vref if we were persisting */
  166                 if (uvn->u_obj.uo_refs == 0) {
  167                         vref(vp);
  168                 }
  169                 uvn->u_obj.uo_refs++;           /* bump uvn ref! */
  170 
  171                 /* check for new writeable uvn */
  172                 if ((accessprot & PROT_WRITE) != 0 &&
  173                     (uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) {
  174                         uvn->u_flags |= UVM_VNODE_WRITEABLE;
  175                         KERNEL_ASSERT_LOCKED();
  176                         LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
  177                 }
  178 
  179                 rw_exit(uvn->u_obj.vmobjlock);
  180                 return (&uvn->u_obj);
  181         }
  182 
  183         /*
  184          * need to call VOP_GETATTR() to get the attributes, but that could
  185          * block (due to I/O), so we want to unlock the object before calling.
  186          * however, we want to keep anyone else from playing with the object
  187          * while it is unlocked.   to do this we set UVM_VNODE_ALOCK which
  188          * prevents anyone from attaching to the vnode until we are done with
  189          * it.
  190          */
  191         uvn->u_flags = UVM_VNODE_ALOCK;
  192         rw_exit(uvn->u_obj.vmobjlock);
  193 
  194         if (vp->v_type == VBLK) {
  195                 /*
  196                  * We could implement this as a specfs getattr call, but:
  197                  *
  198                  *      (1) VOP_GETATTR() would get the file system
  199                  *          vnode operation, not the specfs operation.
  200                  *
  201                  *      (2) All we want is the size, anyhow.
  202                  */
  203                 result = (*bdevsw[major(vp->v_rdev)].d_ioctl)(vp->v_rdev,
  204                     DIOCGPART, (caddr_t)&pi, FREAD, curproc);
  205                 if (result == 0) {
  206                         /* XXX should remember blocksize */
  207                         used_vnode_size = (u_quad_t)pi.disklab->d_secsize *
  208                             (u_quad_t)DL_GETPSIZE(pi.part);
  209                 }
  210         } else {
  211                 result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc);
  212                 if (result == 0)
  213                         used_vnode_size = vattr.va_size;
  214         }
  215 
  216         if (result != 0) {
  217                 rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
  218                 if (uvn->u_flags & UVM_VNODE_WANTED)
  219                         wakeup(uvn);
  220                 uvn->u_flags = 0;
  221                 rw_exit(uvn->u_obj.vmobjlock);
  222                 return NULL;
  223         }
  224 
  225         /*
  226          * make sure that the newsize fits within a vaddr_t
  227          * XXX: need to revise addressing data types
  228          */
  229 #ifdef DEBUG
  230         if (vp->v_type == VBLK)
  231                 printf("used_vnode_size = %llu\n", (long long)used_vnode_size);
  232 #endif
  233 
  234         /* now set up the uvn. */
  235         KASSERT(uvn->u_obj.uo_refs == 0);
  236         uvn->u_obj.uo_refs++;
  237         oldflags = uvn->u_flags;
  238         uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST;
  239         uvn->u_nio = 0;
  240         uvn->u_size = used_vnode_size;
  241 
  242         /*
  243          * add a reference to the vnode.   this reference will stay as long
  244          * as there is a valid mapping of the vnode.   dropped when the
  245          * reference count goes to zero [and we either free or persist].
  246          */
  247         vref(vp);
  248 
  249         /* if write access, we need to add it to the wlist */
  250         if (accessprot & PROT_WRITE) {
  251                 uvn->u_flags |= UVM_VNODE_WRITEABLE;    /* we are on wlist! */
  252                 KERNEL_ASSERT_LOCKED();
  253                 LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist);
  254         }
  255 
  256         if (oldflags & UVM_VNODE_WANTED)
  257                 wakeup(uvn);
  258 
  259         return &uvn->u_obj;
  260 }
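
/*
 * usage sketch (simplified and hedged; the surrounding mapping code is
 * assumed, not shown in this file): a caller that wants a vnode-backed
 * VM object already holds a reference to "vp" and does something like
 *
 *	struct uvm_object *uobj;
 *
 *	uobj = uvn_attach(vp, PROT_READ | PROT_WRITE);
 *	if (uobj == NULL)
 *		... fail: non-disk block device or VOP_GETATTR error ...
 *	... enter uobj into a map; additional mappings go through
 *	    uobj->pgops->pgo_reference (uvn_reference) ...
 *	uobj->pgops->pgo_detach(uobj);	... drops this mapping's reference ...
 */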
  261 
  262 
  263 /*
  264  * uvn_reference
  265  *
  266  * duplicate a reference to a VM object.  Note that the reference
  267  * count must already be at least one (the passed in reference) so
  268  * there is no chance of the uvn being killed out here.
  269  *
  270  * => caller must be using the same accessprot as was used at attach time
  271  */
  272 
  273 
  274 void
  275 uvn_reference(struct uvm_object *uobj)
  276 {
  277 #ifdef DEBUG
  278         struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
  279 #endif
  280 
  281         rw_enter(uobj->vmobjlock, RW_WRITE);
  282 #ifdef DEBUG
  283         if ((uvn->u_flags & UVM_VNODE_VALID) == 0) {
  284                 printf("uvn_reference: ref=%d, flags=0x%x\n",
  285                     uobj->uo_refs, uvn->u_flags);
  286                 panic("uvn_reference: invalid state");
  287         }
  288 #endif
  289         uobj->uo_refs++;
  290         rw_exit(uobj->vmobjlock);
  291 }
  292 
  293 /*
  294  * uvn_detach
  295  *
  296  * remove a reference to a VM object.
  297  *
  298  * => caller must call with map locked.
  299  * => this starts the detach process, but doesn't have to finish it
  300  *    (async i/o could still be pending).
  301  */
  302 void
  303 uvn_detach(struct uvm_object *uobj)
  304 {
  305         struct uvm_vnode *uvn;
  306         struct vnode *vp;
  307         int oldflags;
  308 
  309         rw_enter(uobj->vmobjlock, RW_WRITE);
  310         uobj->uo_refs--;                        /* drop ref! */
  311         if (uobj->uo_refs) {                    /* still more refs */
  312                 rw_exit(uobj->vmobjlock);
  313                 return;
  314         }
  315 
  316         /* get other pointers ... */
  317         uvn = (struct uvm_vnode *) uobj;
  318         vp = uvn->u_vnode;
  319 
  320         /*
  321          * clear VTEXT flag now that there are no mappings left (VTEXT is used
  322          * to keep an active text file from being overwritten).
  323          */
  324         vp->v_flag &= ~VTEXT;
  325 
  326         /*
  327          * we just dropped the last reference to the uvn.   see if we can
  328          * let it "stick around".
  329          */
  330         if (uvn->u_flags & UVM_VNODE_CANPERSIST) {
  331                 /* won't block */
  332                 uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES);
  333                 goto out;
  334         }
  335 
  336         /* it's a goner! */
  337         uvn->u_flags |= UVM_VNODE_DYING;
  338 
  339         /*
  340          * even though we may unlock in flush, no one can gain a reference
  341          * to us until we clear the "dying" flag [because it blocks
  342          * attaches].  we will not do that until after we've disposed of all
  343          * the pages with uvn_flush().  note that before the flush the only
  344          * pages that could be marked PG_BUSY are ones that are in async
  345          * pageout by the daemon.  (there can't be any pending "get"'s
  346          * because there are no references to the object).
  347          */
  348         (void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES);
  349 
  350         /*
  351          * given the structure of this pager, the above flush request will
  352          * create the following state: all the pages that were in the object
  353          * have either been free'd or they are marked PG_BUSY and in the 
  354          * middle of an async io. If we still have pages we set the "relkill"
  355          * state, so that in the case the vnode gets terminated we know 
  356          * to leave it alone. Otherwise we'll kill the vnode when it's empty.
  357          */
  358         uvn->u_flags |= UVM_VNODE_RELKILL;
  359         /* wait on any outstanding io */
  360         while (uobj->uo_npages && uvn->u_flags & UVM_VNODE_RELKILL) {
  361                 uvn->u_flags |= UVM_VNODE_IOSYNC;
  362                 rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term",
  363                     INFSLP);
  364         }
  365 
  366         if ((uvn->u_flags & UVM_VNODE_RELKILL) == 0) {
  367                 rw_exit(uobj->vmobjlock);
  368                 return;
  369         }
  370 
  371         /*
  372          * kill object now.   note that we can't be on the sync q because
  373          * all references are gone.
  374          */
  375         if (uvn->u_flags & UVM_VNODE_WRITEABLE) {
  376                 LIST_REMOVE(uvn, u_wlist);
  377         }
  378         KASSERT(RBT_EMPTY(uvm_objtree, &uobj->memt));
  379         oldflags = uvn->u_flags;
  380         uvn->u_flags = 0;
  381 
  382         /* wake up any sleepers */
  383         if (oldflags & UVM_VNODE_WANTED)
  384                 wakeup(uvn);
  385 out:
  386         rw_exit(uobj->vmobjlock);
  387 
  388         /* drop our reference to the vnode. */
  389         vrele(vp);
  390 
  391         return;
  392 }
  393 
  394 /*
  395  * uvm_vnp_terminate: external hook to clear out a vnode's VM
  396  *
  397  * called in two cases:
  398  *  [1] when a persisting vnode vm object (i.e. one with a zero reference
  399  *      count) needs to be freed so that a vnode can be reused.  this
  400  *      happens under "getnewvnode" in vfs_subr.c.   if the vnode from
  401  *      the free list is still attached (i.e. not VBAD) then vgone is
  402  *      called.   as part of the vgone trace this should get called to
  403  *      free the vm object.   this is the common case.
  404  *  [2] when a filesystem is being unmounted by force (MNT_FORCE,
  405  *      "umount -f") the vgone() function is called on active vnodes
  406  *      on the mounted file systems to kill their data (the vnodes become
  407  *      "dead" ones [see src/sys/miscfs/deadfs/...]).  that results in a
  408  *      call here (even if the uvn is still in use -- i.e. has a non-zero
  409  *      reference count).  this case happens at "umount -f" and during a
  410  *      "reboot/halt" operation.
  411  *
  412  * => the caller must XLOCK and VOP_LOCK the vnode before calling us
  413  *      [protects us from getting a vnode that is already in the DYING
  414  *       state...]
  415  * => in case [2] the uvn is still alive after this call, but all I/O
  416  *      ops will fail (due to the backing vnode now being "dead").  this
  417  *      will prob. kill any process using the uvn due to pgo_get failing.
  418  */
  419 void
  420 uvm_vnp_terminate(struct vnode *vp)
  421 {
  422         struct uvm_vnode *uvn = vp->v_uvm;
  423         struct uvm_object *uobj = &uvn->u_obj;
  424         int oldflags;
  425 
  426         /* check if it is valid */
  427         rw_enter(uobj->vmobjlock, RW_WRITE);
  428         if ((uvn->u_flags & UVM_VNODE_VALID) == 0) {
  429                 rw_exit(uobj->vmobjlock);
  430                 return;
  431         }
  432 
  433         /*
  434          * must be a valid uvn that is not already dying (because XLOCK
  435  * protects us from that).   the uvn can't be in the ALOCK state
  436          * because it is valid, and uvn's that are in the ALOCK state haven't
  437          * been marked valid yet.
  438          */
  439 #ifdef DEBUG
  440         /*
  441          * debug check: are we yanking the vnode out from under our uvn?
  442          */
  443         if (uvn->u_obj.uo_refs) {
  444                 printf("uvm_vnp_terminate(%p): terminating active vnode "
  445                     "(refs=%d)\n", uvn, uvn->u_obj.uo_refs);
  446         }
  447 #endif
  448 
  449         /*
  450          * it is possible that the uvn was detached and is in the relkill
  451          * state [i.e. waiting for async i/o to finish].
  452          * we take over the vnode now and cancel the relkill.
  453          * we want to know when the i/o is done so we can recycle right
  454          * away.   note that a uvn can only be in the RELKILL state if it
  455          * has a zero reference count.
  456          */
  457         if (uvn->u_flags & UVM_VNODE_RELKILL)
  458                 uvn->u_flags &= ~UVM_VNODE_RELKILL;     /* cancel RELKILL */
  459 
  460         /*
  461          * block the uvn by setting the dying flag, and then flush the
  462          * pages.
  463          *
  464          * also, note that we tell I/O that we are already VOP_LOCK'd so
  465          * that uvn_io doesn't attempt to VOP_LOCK again.
  466          *
  467          * XXXCDC: setting VNISLOCKED on an active uvn which is being terminated
  468          *      due to a forceful unmount might not be a good idea.  maybe we
  469          *      need a way to pass in this info to uvn_flush through a
  470          *      pager-defined PGO_ constant [currently there are none].
  471          */
  472         uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED;
  473 
  474         (void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES);
  475 
  476         /*
  477          * as we just did a flush we expect all the pages to be gone or in
  478          * the process of going.  sleep to wait for the rest to go [via iosync].
  479          */
  480         while (uvn->u_obj.uo_npages) {
  481 #ifdef DEBUG
  482                 struct vm_page *pp;
  483                 RBT_FOREACH(pp, uvm_objtree, &uvn->u_obj.memt) {
  484                         if ((pp->pg_flags & PG_BUSY) == 0)
  485                                 panic("uvm_vnp_terminate: detected unbusy pg");
  486                 }
  487                 if (uvn->u_nio == 0)
  488                         panic("uvm_vnp_terminate: no I/O to wait for?");
  489                 printf("uvm_vnp_terminate: waiting for I/O to fin.\n");
  490                 /*
  491                  * XXXCDC: this is unlikely to happen without async i/o so we
  492                  * put a printf in just to keep an eye on it.
  493                  */
  494 #endif
  495                 uvn->u_flags |= UVM_VNODE_IOSYNC;
  496                 rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM, "uvn_term",
  497                     INFSLP);
  498         }
  499 
  500         /*
  501          * done.   now we free the uvn if its reference count is zero
  502          * (true if we are zapping a persisting uvn).   however, if we are
  503          * terminating a uvn with active mappings we let it live ... future
  504          * calls down to the vnode layer will fail.
  505          */
  506         oldflags = uvn->u_flags;
  507         if (uvn->u_obj.uo_refs) {
  508                 /*
  509                  * uvn must live on in its dead-vnode state until all references
  510                  * are gone.   restore flags.    clear CANPERSIST state.
  511                  */
  512                 uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED|
  513                       UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST);
  514         } else {
  515                 /*
  516                  * free the uvn now.   note that the vref reference is already
  517                  * gone [it is dropped when we enter the persist state].
  518                  */
  519                 if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED)
  520                         panic("uvm_vnp_terminate: io sync wanted bit set");
  521 
  522                 if (uvn->u_flags & UVM_VNODE_WRITEABLE) {
  523                         LIST_REMOVE(uvn, u_wlist);
  524                 }
  525                 uvn->u_flags = 0;       /* uvn is history, clear all bits */
  526         }
  527 
  528         if (oldflags & UVM_VNODE_WANTED)
  529                 wakeup(uvn);
  530 
  531         rw_exit(uobj->vmobjlock);
  532 }
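
/*
 * quick reference (derived from the code above): the u_flags states that
 * drive attach/detach/terminate are
 *
 *	UVM_VNODE_VALID       attach finished, object is usable
 *	UVM_VNODE_ALOCK       attach in progress (getattr may sleep)
 *	UVM_VNODE_CANPERSIST  keep the object cached after the last detach
 *	UVM_VNODE_DYING       teardown in progress, new attaches must wait
 *	UVM_VNODE_RELKILL     detached; kill once async i/o drains, unless
 *	                      uvm_vnp_terminate() cancels it first
 *	UVM_VNODE_WRITEABLE   object is on uvn_wlist for the sync operation
 *	UVM_VNODE_IOSYNC      a thread is sleeping until u_nio reaches zero
 *	UVM_VNODE_WANTED      someone is sleeping for a state change
 */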
  533 
  534 /*
  535  * NOTE: currently we have to use VOP_READ/VOP_WRITE because they go
  536  * through the buffer cache and allow I/O in any size.  These VOPs use
  537  * synchronous i/o.  [vs. VOP_STRATEGY which can be async, but doesn't
  538  * go through the buffer cache or allow I/O sizes larger than a
  539  * block].  we will eventually want to change this.
  540  *
  541  * issues to consider:
  542  *   uvm provides the uvm_aiodesc structure for async i/o management.
  543  * there are two tailq's in the uvm. structure... one for pending async
  544  * i/o and one for "done" async i/o.   to do an async i/o one puts
  545  * an aiodesc on the "pending" list (protected by splbio()), starts the
  546  * i/o and returns VM_PAGER_PEND.    when the i/o is done, we expect
  547  * some sort of "i/o done" function to be called (at splbio(), interrupt
  548  * time).   this function should remove the aiodesc from the pending list
  549  * and place it on the "done" list and wakeup the daemon.   the daemon
  550  * will run at normal spl() and will remove all items from the "done"
  551  * list and call the "aiodone" hook for each done request (see uvm_pager.c).
  552  * [in the old vm code, this was done by calling the "put" routine with
  553  * null arguments which made the code harder to read and understand because
  554  * you had one function ("put") doing two things.]
  555  *
  556  * so the current pager needs:
  557  *   int uvn_aiodone(struct uvm_aiodesc *)
  558  *
  559  * => return 0 (aio finished, free it). otherwise requeue for later collection.
  560  * => called with pageq's locked by the daemon.
  561  *
  562  * general outline:
  563  * - drop "u_nio" (this req is done!)
  564  * - if (object->iosync && u_naio == 0) { wakeup &uvn->u_naio }
  565  * - get "page" structures (atop?).
  566  * - handle "wanted" pages
  567  * don't forget to look at the "object" wanted flag in all cases.
  568  */
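
/*
 * hedged sketch of the aiodone hook outlined above.  "struct uvm_aiodesc"
 * and its fields are hypothetical here (nothing in this file defines
 * them); the body just follows the outline given in the comment, using
 * the same u_nio / UVM_VNODE_IOSYNC convention as the sleepers below:
 *
 *	int
 *	uvn_aiodone(struct uvm_aiodesc *aio)
 *	{
 *		struct uvm_vnode *uvn = aio->uvn;	(hypothetical field)
 *
 *		uvn->u_nio--;				this request is done
 *		if ((uvn->u_flags & UVM_VNODE_IOSYNC) && uvn->u_nio == 0)
 *			wakeup(&uvn->u_nio);
 *		... look up the page structures covered by the aio,
 *		    handle any "wanted" pages, then unbusy/free them ...
 *		return 0;				aio finished, free it
 *	}
 */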
  569 
  570 /*
  571  * uvn_flush: flush pages out of a uvm object.
  572  *
  573  * => if PGO_CLEANIT is set, we may block (due to I/O).   thus, a caller
  574  *      might want to unlock higher level resources (e.g. vm_map)
  575  *      before calling flush.
  576  * => if PGO_CLEANIT is not set, then we will not block
  577  * => if PGO_ALLPAGE is set, then all pages in the object are valid targets
  578  *      for flushing.
  579  * => NOTE: we are allowed to lock the page queues, so the caller
  580  *      must not be holding the lock on them [e.g. pagedaemon had
  581  *      better not call us with the queues locked]
  582  * => we return TRUE unless we encountered some sort of I/O error
  583  *
  584  * comment on "cleaning" object and PG_BUSY pages:
  585  *      this routine is holding the lock on the object.   the only time
  586  *      that it can run into a PG_BUSY page that it does not own is if
  587  *      some other process has started I/O on the page (e.g. either
  588  *      a pagein, or a pageout).    if the PG_BUSY page is being paged
  589  *      in, then it can not be dirty (!PG_CLEAN) because no one has
  590  *      had a chance to modify it yet.    if the PG_BUSY page is being
  591  *      paged out then it means that someone else has already started
  592  *      cleaning the page for us (how nice!).    in this case, if we
  593  *      have syncio specified, then after we make our pass through the
  594  *      object we need to wait for the other PG_BUSY pages to clear
  595  *      off (i.e. we need to do an iosync).   also note that once a
  596  *      page is PG_BUSY it must stay in its object until it is un-busyed.
  597  */
  598 boolean_t
  599 uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
  600 {
  601         struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
  602         struct vm_page *pp, *ptmp;
  603         struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
  604         struct pglist dead;
  605         int npages, result, lcv;
  606         boolean_t retval, need_iosync, needs_clean;
  607         voff_t curoff;
  608 
  609         KASSERT(rw_write_held(uobj->vmobjlock));
  610         TAILQ_INIT(&dead);
  611 
  612         /* get init vals and determine how we are going to traverse object */
  613         need_iosync = FALSE;
  614         retval = TRUE;          /* return value */
  615         if (flags & PGO_ALLPAGES) {
  616                 start = 0;
  617                 stop = round_page(uvn->u_size);
  618         } else {
  619                 start = trunc_page(start);
  620                 stop = MIN(round_page(stop), round_page(uvn->u_size));
  621         }
  622 
  623         /*
  624          * PG_CLEANCHK: this bit is used by the pgo_mk_pcluster function as
  625          * a _hint_ as to how up to date the PG_CLEAN bit is.   if the hint
  626          * is wrong it will only prevent us from clustering... it won't break
  627          * anything.   we clear all PG_CLEANCHK bits here, and pgo_mk_pcluster
  628          * will set them as it syncs PG_CLEAN.   This is only an issue if we
  629          * are looking at non-inactive pages (because inactive page's PG_CLEAN
  630          * bit is always up to date since there are no mappings).
  631          * [borrowed PG_CLEANCHK idea from FreeBSD VM]
  632          */
  633         if ((flags & PGO_CLEANIT) != 0) {
  634                 KASSERT(uobj->pgops->pgo_mk_pcluster != 0);
  635                 for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) {
  636                         if ((pp = uvm_pagelookup(uobj, curoff)) != NULL)
  637                                 atomic_clearbits_int(&pp->pg_flags,
  638                                     PG_CLEANCHK);
  639                 }
  640         }
  641 
  642         ppsp = NULL;            /* XXX: shut up gcc */
  643         uvm_lock_pageq();
  644         /* locked: both page queues */
  645         for (curoff = start; curoff < stop; curoff += PAGE_SIZE) {
  646                 if ((pp = uvm_pagelookup(uobj, curoff)) == NULL)
  647                         continue;
  648                 /*
  649                  * handle the case where we do not need to clean the page
  650                  * (either because we are not cleaning, or because the page
  651                  * is not dirty or is busy):
  652                  *
  653                  * NOTE: we are allowed to deactivate a non-wired active
  654                  * PG_BUSY page, but once a PG_BUSY page is on the inactive
  655                  * queue it must stay put until it is !PG_BUSY (so as not to
  656                  * confuse pagedaemon).
  657                  */
  658                 if ((flags & PGO_CLEANIT) == 0 || (pp->pg_flags & PG_BUSY) != 0) {
  659                         needs_clean = FALSE;
  660                         if ((pp->pg_flags & PG_BUSY) != 0 &&
  661                             (flags & (PGO_CLEANIT|PGO_SYNCIO)) ==
  662                                      (PGO_CLEANIT|PGO_SYNCIO))
  663                                 need_iosync = TRUE;
  664                 } else {
  665                         /*
  666                          * freeing: nuke all mappings so we can sync
  667                          * PG_CLEAN bit with no race
  668                          */
  669                         if ((pp->pg_flags & PG_CLEAN) != 0 &&
  670                             (flags & PGO_FREE) != 0 &&
  671                             (pp->pg_flags & PQ_ACTIVE) != 0)
  672                                 pmap_page_protect(pp, PROT_NONE);
  673                         if ((pp->pg_flags & PG_CLEAN) != 0 &&
  674                             pmap_is_modified(pp))
  675                                 atomic_clearbits_int(&pp->pg_flags, PG_CLEAN);
  676                         atomic_setbits_int(&pp->pg_flags, PG_CLEANCHK);
  677 
  678                         needs_clean = ((pp->pg_flags & PG_CLEAN) == 0);
  679                 }
  680 
  681                 /* no cleaning needed: just deactivate/free the page (if asked) and cont. */
  682                 if (!needs_clean) {
  683                         if (flags & PGO_DEACTIVATE) {
  684                                 if (pp->wire_count == 0) {
  685                                         pmap_page_protect(pp, PROT_NONE);
  686                                         uvm_pagedeactivate(pp);
  687                                 }
  688                         } else if (flags & PGO_FREE) {
  689                                 if (pp->pg_flags & PG_BUSY) {
  690                                         uvm_unlock_pageq();
  691                                         uvm_pagewait(pp, uobj->vmobjlock,
  692                                             "uvn_flsh");
  693                                         rw_enter(uobj->vmobjlock, RW_WRITE);
  694                                         uvm_lock_pageq();
  695                                         curoff -= PAGE_SIZE;
  696                                         continue;
  697                                 } else {
  698                                         pmap_page_protect(pp, PROT_NONE);
  699                                         /* removed page from object */
  700                                         uvm_pageclean(pp);
  701                                         TAILQ_INSERT_HEAD(&dead, pp, pageq);
  702                                 }
  703                         }
  704                         continue;
  705                 }
  706 
  707                 /*
  708                  * pp points to a page in the object that we are
  709                  * working on.  it is !PG_CLEAN and !PG_BUSY and we asked
  710                  * for cleaning (PGO_CLEANIT), so we clean it now.
  711                  *
  712                  * let uvm_pager_put attempt a clustered pageout.
  713                  * note: locked: page queues.
  714                  */
  715                 atomic_setbits_int(&pp->pg_flags, PG_BUSY);
  716                 UVM_PAGE_OWN(pp, "uvn_flush");
  717                 pmap_page_protect(pp, PROT_READ);
  718                 /* if we're async, free the page in aiodoned */
  719                 if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
  720                         atomic_setbits_int(&pp->pg_flags, PG_RELEASED);
  721 ReTry:
  722                 ppsp = pps;
  723                 npages = sizeof(pps) / sizeof(struct vm_page *);
  724 
  725                 result = uvm_pager_put(uobj, pp, &ppsp, &npages,
  726                            flags | PGO_DOACTCLUST, start, stop);
  727 
  728                 /*
  729                  * if we did an async I/O it is remotely possible for the
  730                  * async i/o to complete and the page "pp" be freed or what
  731                  * not before we get a chance to relock the object. Therefore,
  732                  * we only touch it when it won't be freed, RELEASED took care
  733                  * of the rest.
  734                  */
  735                 uvm_lock_pageq();
  736 
  737                 /*
  738                  * VM_PAGER_AGAIN: given the structure of this pager, this
  739                  * can only happen when we are doing async I/O and can't
  740                  * map the pages into kernel memory (pager_map) due to lack
  741                  * of vm space.   if this happens we drop back to sync I/O.
  742                  */
  743                 if (result == VM_PAGER_AGAIN) {
  744                         /*
  745                          * it is unlikely, but the page could have been
  746                          * released.  we ignore this now and retry the I/O;
  747                          * we will detect and
  748                          * handle the released page after the syncio I/O
  749                          * completes.
  750                          */
  751 #ifdef DIAGNOSTIC
  752                         if (flags & PGO_SYNCIO)
  753         panic("%s: PGO_SYNCIO return 'try again' error (impossible)", __func__);
  754 #endif
  755                         flags |= PGO_SYNCIO;
  756                         if (flags & PGO_FREE)
  757                                 atomic_clearbits_int(&pp->pg_flags,
  758                                     PG_RELEASED);
  759 
  760                         goto ReTry;
  761                 }
  762 
  763                 /*
  764                  * the cleaning operation is now done.   finish up.  note that
  765                  * on error (!OK, !PEND) uvm_pager_put drops the cluster for us.
  766                  * if success (OK, PEND) then uvm_pager_put returns the cluster
  767                  * to us in ppsp/npages.
  768                  */
  769                 /*
  770                  * for pending async i/o if we are not deactivating
  771                  * we can move on to the next page. aiodoned deals with
  772                  * the freeing case for us.
  773                  */
  774                 if (result == VM_PAGER_PEND && (flags & PGO_DEACTIVATE) == 0)
  775                         continue;
  776 
  777                 /*
  778                  * need to look at each page of the I/O operation, and do what
  779                  * we gotta do.
  780                  */
  781                 for (lcv = 0 ; lcv < npages; lcv++) {
  782                         ptmp = ppsp[lcv];
  783                         /*
  784                          * verify the page didn't get moved
  785                          */
  786                         if (result == VM_PAGER_PEND && ptmp->uobject != uobj)
  787                                 continue;
  788 
  789                         /*
  790                          * unbusy the page if I/O is done.   note that for
  791                          * pending I/O it is possible that the I/O op
  792                          * finished
  793                          * (in which case the page is no longer busy).
  794                          */
  795                         if (result != VM_PAGER_PEND) {
  796                                 if (ptmp->pg_flags & PG_WANTED)
  797                                         wakeup(ptmp);
  798 
  799                                 atomic_clearbits_int(&ptmp->pg_flags,
  800                                     PG_WANTED|PG_BUSY);
  801                                 UVM_PAGE_OWN(ptmp, NULL);
  802                                 atomic_setbits_int(&ptmp->pg_flags,
  803                                     PG_CLEAN|PG_CLEANCHK);
  804                                 if ((flags & PGO_FREE) == 0)
  805                                         pmap_clear_modify(ptmp);
  806                         }
  807 
  808                         /* dispose of page */
  809                         if (flags & PGO_DEACTIVATE) {
  810                                 if (ptmp->wire_count == 0) {
  811                                         pmap_page_protect(ptmp, PROT_NONE);
  812                                         uvm_pagedeactivate(ptmp);
  813                                 }
  814                         } else if (flags & PGO_FREE &&
  815                             result != VM_PAGER_PEND) {
  816                                 if (result != VM_PAGER_OK) {
  817                                         static struct timeval lasttime;
  818                                         static const struct timeval interval =
  819                                             { 5, 0 };
  820 
  821                                         if (ratecheck(&lasttime, &interval)) {
  822                                                 printf("%s: obj=%p, "
  823                                                    "offset=0x%llx.  error "
  824                                                    "during pageout.\n",
  825                                                     __func__, pp->uobject,
  826                                                     (long long)pp->offset);
  827                                                 printf("%s: WARNING: "
  828                                                     "changes to page may be "
  829                                                     "lost!\n", __func__);
  830                                         }
  831                                         retval = FALSE;
  832                                 }
  833                                 pmap_page_protect(ptmp, PROT_NONE);
  834                                 uvm_pageclean(ptmp);
  835                                 TAILQ_INSERT_TAIL(&dead, ptmp, pageq);
  836                         }
  837 
  838                 }               /* end of "lcv" for loop */
  839 
  840         }               /* end of "pp" for loop */
  841 
  842         /* done with pagequeues: unlock */
  843         uvm_unlock_pageq();
  844 
  845         /* now wait for all I/O if required. */
  846         if (need_iosync) {
  847                 while (uvn->u_nio != 0) {
  848                         uvn->u_flags |= UVM_VNODE_IOSYNC;
  849                         rwsleep_nsec(&uvn->u_nio, uobj->vmobjlock, PVM,
  850                             "uvn_flush", INFSLP);
  851                 }
  852                 if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED)
  853                         wakeup(&uvn->u_flags);
  854                 uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED);
  855         }
  856 
  857         uvm_pglistfree(&dead);
  858 
  859         return retval;
  860 }
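
/*
 * quick reference (derived from the callers in this file): the flag
 * combinations uvn_flush actually sees here are
 *
 *	PGO_DEACTIVATE|PGO_ALLPAGES	    uvn_detach, persisting case:
 *					    just push all pages to inactive
 *	PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES   uvn_detach (non-persisting) and
 *					    uvm_vnp_terminate: write back
 *					    dirty pages and free everything
 */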
  861 
  862 /*
  863  * uvn_cluster
  864  *
  865  * we are about to do I/O in an object at offset.   this function is called
  866  * to establish a range of offsets around "offset" in which we can cluster
  867  * I/O.
  868  */
  869 
  870 void
  871 uvn_cluster(struct uvm_object *uobj, voff_t offset, voff_t *loffset,
  872     voff_t *hoffset)
  873 {
  874         struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
  875         *loffset = offset;
  876 
  877         KASSERT(rw_write_held(uobj->vmobjlock));
  878 
  879         if (*loffset >= uvn->u_size)
  880                 panic("uvn_cluster: offset out of range");
  881 
  882         /*
  883          * XXX: old pager claims we could use VOP_BMAP to get maxcontig value.
  884          */
  885         *hoffset = *loffset + MAXBSIZE;
  886         if (*hoffset > round_page(uvn->u_size)) /* past end? */
  887                 *hoffset = round_page(uvn->u_size);
  888 }
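
/*
 * worked example (assuming PAGE_SIZE is 4KB and MAXBSIZE is 64KB, which
 * is typical but machine dependent): for a vnode with u_size = 100000
 * bytes, round_page(u_size) = 102400, so an i/o starting at offset 90112
 * clusters as
 *
 *	*loffset = 90112
 *	*hoffset = MIN(90112 + 65536, 102400) = 102400
 *
 * i.e. the cluster is clipped so it never extends past the last page of
 * the file.
 */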
  889 
  890 /*
  891  * uvn_put: flush page data to backing store.
  892  *
  893  * => prefer map unlocked (not required)
  894  * => flags: PGO_SYNCIO -- use sync. I/O
  895  * => note: caller must set PG_CLEAN and pmap_clear_modify (if needed)
  896  * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
  897  *      [thus we never do async i/o!  see iodone comment]
  898  */
  899 int
  900 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
  901 {
  902         struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
  903         int dying, retval;
  904 
  905         KASSERT(rw_write_held(uobj->vmobjlock));
  906 
  907         /*
  908          * Unless we're recycling this vnode, grab a reference to it
  909          * to prevent it from being recycled from under our feet.
  910          * This also makes sure we don't panic if we end up in
  911          * uvn_vnp_uncache() as a result of the I/O operation as that
  912          * function assumes we hold a reference.
  913          *
  914          * If the vnode is in the process of being recycled by someone
  915          * else, grabbing a reference will fail.  In that case the
  916          * pages will already be written out by whoever is cleaning
  917          * the vnode, so simply return VM_PAGER_AGAIN such that we
  918          * skip these pages.
  919          */
  920         dying = (uvn->u_flags & UVM_VNODE_DYING);
  921         if (!dying) {
  922                 if (vget(uvn->u_vnode, LK_NOWAIT))
  923                         return VM_PAGER_AGAIN;
  924         }
  925 
  926         retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
  927 
  928         if (!dying)
  929                 vrele(uvn->u_vnode);
  930 
  931         return retval;
  932 }
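
/*
 * note on results (summarizing the code above and uvn_io below):
 * VM_PAGER_AGAIN means the vnode is being recycled by someone else (the
 * vget above failed) or uvn_io could not proceed without sleeping, and
 * the caller simply skips these pages.  VM_PAGER_BAD comes from uvn_io
 * when the starting offset is at or beyond u_size.  on success the pages
 * have already been written synchronously by the time uvn_put returns.
 */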
  933 
  934 /*
  935  * uvn_get: get pages (synchronously) from backing store
  936  *
  937  * => prefer map unlocked (not required)
  938  * => flags: PGO_ALLPAGES: get all of the pages
  939  *           PGO_LOCKED: fault data structures are locked
  940  * => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
  941  * => NOTE: caller must check for released pages!!
  942  */
  943 int
  944 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
  945     int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
  946 {
  947         voff_t current_offset;
  948         struct vm_page *ptmp;
  949         int lcv, result, gotpages;
  950         boolean_t done;
  951 
  952         KASSERT(((flags & PGO_LOCKED) != 0 && rw_lock_held(uobj->vmobjlock)) ||
  953             (flags & PGO_LOCKED) == 0);
  954 
  955         /* step 1: handle the case where fault data structures are locked. */
  956         if (flags & PGO_LOCKED) {
  957                 /*
  958                  * gotpages is the current number of pages we've gotten (which
  959                  * we pass back up to the caller via *npagesp).
  960                  */
  961                 gotpages = 0;
  962 
  963                 /*
  964                  * step 1a: get pages that are already resident.   only do this
  965                  * if the data structures are locked (i.e. the first time
  966                  * through).
  967                  */
  968                 done = TRUE;    /* be optimistic */
  969 
  970                 for (lcv = 0, current_offset = offset ; lcv < *npagesp ;
  971                     lcv++, current_offset += PAGE_SIZE) {
  972 
  973                         /* do we care about this page?  if not, skip it */
  974                         if (pps[lcv] == PGO_DONTCARE)
  975                                 continue;
  976 
  977                         /* lookup page */
  978                         ptmp = uvm_pagelookup(uobj, current_offset);
  979 
  980                         /* to be useful must get a non-busy, non-released pg */
  981                         if (ptmp == NULL ||
  982                             (ptmp->pg_flags & PG_BUSY) != 0) {
  983                                 if (lcv == centeridx || (flags & PGO_ALLPAGES)
  984                                     != 0)
  985                                         done = FALSE;   /* need to do a wait or I/O! */
  986                                 continue;
  987                         }
  988 
  989                         /*
  990                          * useful page: busy it and plug it in our
  991                          * result array
  992                          */
  993                         atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
  994                         UVM_PAGE_OWN(ptmp, "uvn_get1");
  995                         pps[lcv] = ptmp;
  996                         gotpages++;
  997 
  998                 }
  999 
 1000                 /*
 1001                  * XXX: given the "advice", should we consider async read-ahead?
 1002                  * XXX: fault currently does deactivation of pages behind us.  is
 1003                  * this good (other callers might not)?
 1004                  */
 1005                 /*
 1006                  * XXX: read-ahead currently handled by buffer cache (bread)
 1007                  * level.
 1008                  * XXX: no async i/o available.
 1009                  * XXX: so we don't do anything now.
 1010                  */
 1011 
 1012                 /*
 1013                  * step 1c: now we've either done everything needed or we need
 1014                  * to unlock and do some waiting or I/O.
 1015                  */
 1016 
 1017                 *npagesp = gotpages;            /* let caller know */
 1018                 if (done)
 1019                         return VM_PAGER_OK;             /* bingo! */
 1020                 else
 1021                         return VM_PAGER_UNLOCK;
 1022         }
 1023 
 1024         /*
 1025          * step 2: get non-resident or busy pages.
 1026          * data structures are unlocked.
 1027          *
 1028          * XXX: because we can't do async I/O at this level we get things
 1029          * page at a time (otherwise we'd chunk).   the VOP_READ() will do
 1030          * async-read-ahead for us at a lower level.
 1031          */
 1032         for (lcv = 0, current_offset = offset;
 1033                          lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) {
 1034 
 1035                 /* skip over pages we've already gotten or don't want */
 1036                 /* skip over pages we don't _have_ to get */
 1037                 if (pps[lcv] != NULL || (lcv != centeridx &&
 1038                     (flags & PGO_ALLPAGES) == 0))
 1039                         continue;
 1040 
 1041                 /*
 1042                  * we have yet to locate the current page (pps[lcv]).   we first
 1043                  * look for a page that is already at the current offset.   if
 1044                  * we find a page, we check to see if it is busy or released.
 1045                  * if that is the case, then we sleep on the page until it is
 1046                  * no longer busy or released and repeat the lookup.    if the
 1047                  * page we found is neither busy nor released, then we busy it
 1048                  * (so we own it) and plug it into pps[lcv].   this breaks the
 1049                  * following while loop and indicates we are ready to move on
 1050                  * to the next page in the "lcv" loop above.
 1051                  *
 1052                  * if we exit the while loop with pps[lcv] still set to NULL,
 1053                  * then it means that we allocated a new busy/fake/clean page
 1054                  * ptmp in the object and we need to do I/O to fill in the data.
 1055                  */
 1056                 while (pps[lcv] == NULL) {      /* top of "pps" while loop */
 1057                         /* look for a current page */
 1058                         ptmp = uvm_pagelookup(uobj, current_offset);
 1059 
 1060                         /* nope?   allocate one now (if we can) */
 1061                         if (ptmp == NULL) {
 1062                                 ptmp = uvm_pagealloc(uobj, current_offset,
 1063                                     NULL, 0);
 1064 
 1065                                 /* out of RAM? */
 1066                                 if (ptmp == NULL) {
 1067                                         uvm_wait("uvn_getpage");
 1068 
 1069                                         /* goto top of pps while loop */
 1070                                         continue;
 1071                                 }
 1072 
 1073                                 /*
 1074                                  * got new page ready for I/O.  break pps
 1075                                  * while loop.  pps[lcv] is still NULL.
 1076                                  */
 1077                                 break;
 1078                         }
 1079 
 1080                         /* page is there, see if we need to wait on it */
 1081                         if ((ptmp->pg_flags & PG_BUSY) != 0) {
 1082                                 uvm_pagewait(ptmp, uobj->vmobjlock, "uvn_get");
 1083                                 rw_enter(uobj->vmobjlock, RW_WRITE);
 1084                                 continue;       /* goto top of pps while loop */
 1085                         }
 1086 
 1087                         /*
 1088                          * if we get here then the page has become resident
 1089                          * and unbusy between steps 1 and 2.  we busy it
 1090                          * now (so we own it) and set pps[lcv] (so that we
 1091                          * exit the while loop).
 1092                          */
 1093                         atomic_setbits_int(&ptmp->pg_flags, PG_BUSY);
 1094                         UVM_PAGE_OWN(ptmp, "uvn_get2");
 1095                         pps[lcv] = ptmp;
 1096                 }
 1097 
 1098                 /*
 1099                  * if we own a valid page at the correct offset, pps[lcv]
 1100                  * will point to it.   nothing more to do except go to the
 1101                  * next page.
 1102                  */
 1103                 if (pps[lcv])
 1104                         continue;                       /* next lcv */
 1105 
 1106                 /*
 1107                  * we have a "fake/busy/clean" page that we just allocated.  do
 1108                  * I/O to fill it with valid data.
 1109                  */
 1110                 result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
 1111                     PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
 1112 
 1113                 /*
 1114                  * I/O done.  because we used syncio the result can not be
 1115                  * PEND or AGAIN.
 1116                  */
 1117                 if (result != VM_PAGER_OK) {
 1118                         if (ptmp->pg_flags & PG_WANTED)
 1119                                 wakeup(ptmp);
 1120 
 1121                         atomic_clearbits_int(&ptmp->pg_flags,
 1122                             PG_WANTED|PG_BUSY);
 1123                         UVM_PAGE_OWN(ptmp, NULL);
 1124                         uvm_lock_pageq();
 1125                         uvm_pagefree(ptmp);
 1126                         uvm_unlock_pageq();
 1127                         rw_exit(uobj->vmobjlock);
 1128                         return result;
 1129                 }
 1130 
 1131                 /*
 1132                  * we got the page!   clear the fake flag (indicates valid
 1133                  * data now in page) and plug into our result array.   note
 1134                  * that page is still busy.
 1135                  *
 1136                  * it is the callers job to:
 1137                  * => check if the page is released
 1138                  * => unbusy the page
 1139                  * => activate the page
 1140                  */
 1141 
 1142                 /* data is valid ... */
 1143                 atomic_clearbits_int(&ptmp->pg_flags, PG_FAKE);
 1144                 pmap_clear_modify(ptmp);                /* ... and clean */
 1145                 pps[lcv] = ptmp;
 1146 
 1147         }
 1148 
 1149 
 1150         rw_exit(uobj->vmobjlock);
 1151         return (VM_PAGER_OK);
 1152 }
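
/*
 * hedged sketch of how a caller (e.g. the fault path; its details are
 * assumed, not shown in this file) drives the two passes above:
 *
 *	npages = 1;
 *	pps[0] = NULL;
 *	result = uobj->pgops->pgo_get(uobj, off, pps, &npages, 0,
 *	    access_type, advice, PGO_LOCKED);	pass 1: resident pages only
 *	if (result == VM_PAGER_UNLOCK) {
 *		... drop the upper-level locks ...
 *		result = uobj->pgops->pgo_get(uobj, off, pps, &npages, 0,
 *		    access_type, advice, 0);	pass 2: may sleep and do i/o
 *	}
 *
 * on success pps[0] comes back PG_BUSY; per the comment above it is the
 * caller's job to check for released pages, then unbusy and activate it.
 */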
 1153 
 1154 /*
 1155  * uvn_io: do I/O to a vnode
 1156  *
 1157  * => prefer map unlocked (not required)
 1158  * => flags: PGO_SYNCIO -- use sync. I/O
 1159  * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
 1160  *      [thus we never do async i/o!  see iodone comment]
 1161  */
 1162 
 1163 int
 1164 uvn_io(struct uvm_vnode *uvn, vm_page_t *pps, int npages, int flags, int rw)
 1165 {
 1166         struct uvm_object *uobj = &uvn->u_obj;
 1167         struct vnode *vn;
 1168         struct uio uio;
 1169         struct iovec iov;
 1170         vaddr_t kva;
 1171         off_t file_offset;
 1172         int waitf, result, mapinflags;
 1173         size_t got, wanted;
 1174         int vnlocked, netunlocked = 0;
 1175         int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0;
 1176         voff_t uvnsize;
 1177 
 1178         KASSERT(rw_write_held(uobj->vmobjlock));
 1179 
 1180         /* init values */
 1181         waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
 1182         vn = uvn->u_vnode;
 1183         file_offset = pps[0]->offset;
 1184 
 1185         /* check for sync'ing I/O. */
 1186         while (uvn->u_flags & UVM_VNODE_IOSYNC) {
 1187                 if (waitf == M_NOWAIT) {
 1188                         return VM_PAGER_AGAIN;
 1189                 }
 1190                 uvn->u_flags |= UVM_VNODE_IOSYNCWANTED;
 1191                 rwsleep_nsec(&uvn->u_flags, uobj->vmobjlock, PVM, "uvn_iosync",
 1192                     INFSLP);
 1193         }
 1194 
 1195         /* check size */
 1196         if (file_offset >= uvn->u_size) {
 1197                 return VM_PAGER_BAD;
 1198         }
 1199 
 1200         /* first try and map the pages in (without waiting) */
 1201         mapinflags = (rw == UIO_READ) ?
 1202             UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
 1203 
 1204         kva = uvm_pagermapin(pps, npages, mapinflags);
 1205         if (kva == 0 && waitf == M_NOWAIT) {
 1206                 return VM_PAGER_AGAIN;
 1207         }
 1208 
 1209         /*
 1210          * ok, now bump u_nio up.   at this point we are done with uvn
 1211          * and can unlock it.   if we still don't have a kva, try again
 1212          * (this time with sleep ok).
 1213          */
 1214         uvn->u_nio++;                   /* we have an I/O in progress! */
 1215         vnlocked = (uvn->u_flags & UVM_VNODE_VNISLOCKED);
 1216         uvnsize = uvn->u_size;
 1217         rw_exit(uobj->vmobjlock);
 1218         if (kva == 0)
 1219                 kva = uvm_pagermapin(pps, npages,
 1220                     mapinflags | UVMPAGER_MAPIN_WAITOK);
 1221 
 1222         /*
 1223          * ok, mapped in.  our pages are PG_BUSY so they are not going to
 1224          * get touched (so we can look at "offset" without having to lock
 1225          * the object).  set up for I/O.
 1226          */
 1227         /* fill out uio/iov */
 1228         iov.iov_base = (caddr_t) kva;
 1229         wanted = (size_t)npages << PAGE_SHIFT;
 1230         if (file_offset + wanted > uvnsize)
 1231                 wanted = uvnsize - file_offset; /* XXX: needed? */
 1232         iov.iov_len = wanted;
 1233         uio.uio_iov = &iov;
 1234         uio.uio_iovcnt = 1;
 1235         uio.uio_offset = file_offset;
 1236         uio.uio_segflg = UIO_SYSSPACE;
 1237         uio.uio_rw = rw;
 1238         uio.uio_resid = wanted;
 1239         uio.uio_procp = curproc;
 1240 
 1241         /*
 1242          * This process may already have the NET_LOCK(), if we
 1243          * faulted in copyin() or copyout() in the network stack.
 1244          */
 1245         if (rw_status(&netlock) == RW_WRITE) {
 1246                 NET_UNLOCK();
 1247                 netunlocked = 1;
 1248         }
 1249 
 1250         /* do the I/O!  (XXX: curproc?) */
 1251         /*
 1252          * This process may already have this vnode locked, if we faulted in
 1253          * copyin() or copyout() on a region backed by this vnode
 1254          * while doing I/O to the vnode.  If this is the case, don't
 1255                  * panic; instead, return the error to the user.
 1256          *
 1257          * XXX this is a stopgap to prevent a panic.
 1258          * Ideally, this kind of operation *should* work.
 1259          */
 1260         result = 0;
 1261         KERNEL_LOCK();
 1262         if (!vnlocked)
 1263                 result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags);
 1264         if (result == 0) {
 1265                 /* NOTE: vnode now locked! */
 1266                 if (rw == UIO_READ)
 1267                         result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
 1268                 else
 1269                         result = VOP_WRITE(vn, &uio,
 1270                             (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
 1271                             curproc->p_ucred);
 1272 
 1273                 if (!vnlocked)
 1274                         VOP_UNLOCK(vn);
 1275 
 1276         }
 1277         KERNEL_UNLOCK();
 1278 
 1279         if (netunlocked)
 1280                 NET_LOCK();
 1281 
 1282 
 1283         /* NOTE: vnode now unlocked (unless vnislocked) */
 1284         /*
 1285          * result == unix style errno (0 == OK!)
 1286          *
 1287          * zero out rest of buffer (if needed)
 1288          */
 1289         if (result == 0) {
 1290                 got = wanted - uio.uio_resid;
 1291 
 1292                 if (wanted && got == 0) {
 1293                         result = EIO;           /* XXX: error? */
 1294                 } else if (got < PAGE_SIZE * npages && rw == UIO_READ) {
 1295                         memset((void *) (kva + got), 0,
 1296                                ((size_t)npages << PAGE_SHIFT) - got);
 1297                 }
 1298         }
 1299 
 1300         /* now remove pager mapping */
 1301         uvm_pagermapout(kva, npages);
 1302 
 1303         /* now clean up the object (i.e. drop I/O count) */
 1304         rw_enter(uobj->vmobjlock, RW_WRITE);
 1305         uvn->u_nio--;                   /* I/O DONE! */
 1306         if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) {
 1307                 wakeup(&uvn->u_nio);
 1308         }
 1309 
 1310         if (result == 0) {
 1311                 return VM_PAGER_OK;
 1312         } else if (result == EBUSY) {
 1313                 KASSERT(flags & PGO_NOWAIT);
 1314                 return VM_PAGER_AGAIN;
 1315         } else {
 1316                 if (rebooting) {
 1317                         KERNEL_LOCK();
 1318                         while (rebooting)
 1319                                 tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
 1320                         KERNEL_UNLOCK();
 1321                 }
 1322                 return VM_PAGER_ERROR;
 1323         }
 1324 }
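
/*
 * [editorial sketch -- not part of the original source]  A hypothetical
 * synchronous page-out through uvn_io() and one possible mapping of its
 * VM_PAGER_* results onto errnos ("example_sync_pageout" and the errno
 * choices are illustrative assumptions, not this file's API).  The pages
 * in "pps" are assumed to be PG_BUSY and the object lock write-held, as
 * uvn_io() asserts above.
 */
#if 0
static int
example_sync_pageout(struct uvm_vnode *uvn, vm_page_t *pps, int npages)
{
        switch (uvn_io(uvn, pps, npages, PGO_SYNCIO, UIO_WRITE)) {
        case VM_PAGER_OK:
                return 0;               /* data written */
        case VM_PAGER_AGAIN:
                return EAGAIN;          /* transient failure: retry later */
        case VM_PAGER_BAD:
                return EINVAL;          /* offset at or past end of file */
        default:
                return EIO;             /* VM_PAGER_ERROR */
        }
}
#endif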
 1325 
 1326 /*
 1327  * uvm_vnp_uncache: disable "persisting" in a vnode... when last reference
 1328  * is gone we will kill the object (flushing dirty pages back to the vnode
 1329  * if needed).
 1330  *
 1331  * => returns TRUE if there was no uvm_object attached or if there was
 1332  *      one and we killed it [i.e. if there is no active uvn]
 1333  * => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if
 1334  *      needed]
 1335  *
 1336  * => XXX: given that we now kill uvn's when a vnode is recycled (without
 1337  *      having to hold a reference on the vnode) and given a working
 1338  *      uvm_vnp_sync(), how does that affect the need for this function?
 1339  *      [XXXCDC: seems like it can die?]
 1340  *
 1341  * => XXX: this function should DIE once we merge the VM and buffer
 1342  *      cache.
 1343  *
 1344  * research shows that this is called in the following places:
 1345  * ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode
 1346  *      changes sizes
 1347  * ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we
 1348  *      are written to
 1349  * ext2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit
 1350  *      is off
 1351  * ffs_realloccg: when we can't extend the current block and have
 1352  *      to allocate a new one we call this [XXX: why?]
 1353  * nfsrv_rename, rename_files: called when the target filename is there
 1354  *      and we want to remove it
 1355  * nfsrv_remove, sys_unlink: called on file we are removing
 1356  * nfsrv_access: if VTEXT and we want WRITE access and we don't uncache
 1357  *      then return "text busy"
 1358  * nfs_open: seems to uncache any file opened with nfs
 1359  * vn_writechk: if VTEXT vnode and can't uncache return "text busy"
 1360  * fusefs_open: uncaches any file that is opened
 1361  * fusefs_write: uncaches on every write
 1362  */
 1363 
 1364 int
 1365 uvm_vnp_uncache(struct vnode *vp)
 1366 {
 1367         struct uvm_vnode *uvn = vp->v_uvm;
 1368         struct uvm_object *uobj = &uvn->u_obj;
 1369 
 1370         /* lock uvn part of the vnode and check if we need to do anything */
 1371 
 1372         rw_enter(uobj->vmobjlock, RW_WRITE);
 1373         if ((uvn->u_flags & UVM_VNODE_VALID) == 0 ||
 1374                         (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) {
 1375                 rw_exit(uobj->vmobjlock);
 1376                 return TRUE;
 1377         }
 1378 
 1379         /*
 1380          * we have a valid, non-blocked uvn.   clear persist flag.
 1381          * if uvn is currently active we can return now.
 1382          */
 1383         uvn->u_flags &= ~UVM_VNODE_CANPERSIST;
 1384         if (uvn->u_obj.uo_refs) {
 1385                 rw_exit(uobj->vmobjlock);
 1386                 return FALSE;
 1387         }
 1388 
 1389         /*
 1390          * uvn is currently persisting!   we have to gain a reference to
 1391          * it so that we can call uvn_detach to kill the uvn.
 1392          */
 1393         vref(vp);                       /* seems ok, even with VOP_LOCK */
 1394         uvn->u_obj.uo_refs++;           /* value is now 1 */
 1395         rw_exit(uobj->vmobjlock);
 1396 
 1397 #ifdef VFSLCKDEBUG
 1398         /*
 1399          * carry over sanity check from old vnode pager: the vnode should
 1400          * be VOP_LOCK'd, and we confirm it here.
 1401          */
 1402         if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp))
 1403                 panic("uvm_vnp_uncache: vnode not locked!");
 1404 #endif
 1405 
 1406         /*
 1407          * now drop our reference to the vnode.   if we have the sole
 1408          * reference to the vnode then this will cause it to die [as we
 1409          * just cleared the persist flag].   we have to unlock the vnode
 1410          * while we are doing this as it may trigger I/O.
 1411          *
 1412          * XXX: it might be possible for uvn to get reclaimed while we are
 1413          * unlocked causing us to return TRUE when we should not.   we ignore
 1414          * this as a false-positive return value doesn't hurt us.
 1415          */
 1416         VOP_UNLOCK(vp);
 1417         uvn_detach(&uvn->u_obj);
 1418         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1419 
 1420         return TRUE;
 1421 }
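
/*
 * [editorial sketch -- not part of the original source]  A hypothetical
 * caller in the spirit of the vn_writechk() case listed above: refuse
 * write access to an executing (VTEXT) vnode whose pages cannot be
 * uncached.  "example_writechk" is an invented name; the vnode lock is
 * assumed to be held, as uvm_vnp_uncache() requires.
 */
#if 0
static int
example_writechk(struct vnode *vp)
{
        if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp))
                return ETXTBSY;         /* "text busy": still mapped for exec */
        return 0;
}
#endif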
 1422 
 1423 /*
 1424  * uvm_vnp_setsize: grow or shrink a vnode uvn
 1425  *
 1426  * grow   => just update size value
 1427  * shrink => toss un-needed pages
 1428  *
 1429  * => we assume that the caller has a reference of some sort to the
 1430  *      vnode in question so that it will not be yanked out from under
 1431  *      us.
 1432  *
 1433  * called from:
 1434  *  => truncate fns (ext2fs_truncate, ffs_truncate, detrunc[msdos],
 1435  *     fusefs_setattr)
 1436  *  => "write" fns (ext2fs_write, WRITE [ufs/ufs], msdosfs_write, nfs_write
 1437  *     fusefs_write)
 1438  *  => ffs_balloc [XXX: why? doesn't WRITE handle?]
 1439  *  => NFS: nfs_loadattrcache, nfs_getattrcache, nfs_setattr
 1440  *  => union fs: union_newsize
 1441  */
 1442 
 1443 void
 1444 uvm_vnp_setsize(struct vnode *vp, off_t newsize)
 1445 {
 1446         struct uvm_vnode *uvn = vp->v_uvm;
 1447         struct uvm_object *uobj = &uvn->u_obj;
 1448 
 1449         KERNEL_ASSERT_LOCKED();
 1450 
 1451         rw_enter(uobj->vmobjlock, RW_WRITE);
 1452 
 1453         /* lock uvn and check for valid object, and if valid: do it! */
 1454         if (uvn->u_flags & UVM_VNODE_VALID) {
 1455 
 1456                 /*
 1457                  * now check if the size has changed: if we shrink we had better
 1458                  * toss some pages...
 1459                  */
 1460 
 1461                 if (uvn->u_size > newsize) {
 1462                         (void)uvn_flush(&uvn->u_obj, newsize,
 1463                             uvn->u_size, PGO_FREE);
 1464                 }
 1465                 uvn->u_size = newsize;
 1466         }
 1467         rw_exit(uobj->vmobjlock);
 1468 }
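
/*
 * [editorial sketch -- not part of the original source]  A hypothetical
 * truncate path in the spirit of the ffs_truncate()/ext2fs_truncate()
 * callers listed above ("example_truncate" is an invented name): the
 * file system updates its own metadata first, then tells UVM so that
 * pages beyond the new end of file are freed.
 */
#if 0
static void
example_truncate(struct vnode *vp, off_t newsize)
{
        /* ...file-system-specific metadata update happens here... */
        uvm_vnp_setsize(vp, newsize);   /* on shrink, frees pages past newsize */
}
#endif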
 1469 
 1470 /*
 1471  * uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes.
 1472  *
 1473  * => called from sys_sync with no VM structures locked
 1474  * => only one process can do a sync at a time (because the uvn
 1475  *    structure only has one queue for sync'ing).  we ensure this
 1476  *    by holding the uvn_sync_lock while the sync is in progress.
 1477  *    other processes attempting a sync will sleep on this lock
 1478  *    until we are done.
 1479  */
 1480 void
 1481 uvm_vnp_sync(struct mount *mp)
 1482 {
 1483         struct uvm_vnode *uvn;
 1484         struct vnode *vp;
 1485 
 1486         /*
 1487          * step 1: ensure we are only ones using the uvn_sync_q by locking
 1488          * our lock...
 1489          */
 1490         rw_enter_write(&uvn_sync_lock);
 1491 
 1492         /*
 1493          * step 2: build up a simpleq of uvns of interest based on the
 1494          * write list.   we gain a reference to uvns of interest. 
 1495          */
 1496         SIMPLEQ_INIT(&uvn_sync_q);
 1497         LIST_FOREACH(uvn, &uvn_wlist, u_wlist) {
 1498                 vp = uvn->u_vnode;
 1499                 if (mp && vp->v_mount != mp)
 1500                         continue;
 1501 
 1502                 /*
 1503                  * If the vnode is "blocked" it means it must be dying, which
 1504                  * in turn means it's in the process of being flushed out so
 1505                  * we can safely skip it.
 1506                  *
 1507                  * note that uvn must already be valid because we found it on
 1508                  * the wlist (this also means it can't be ALOCK'd).
 1509                  */
 1510                 if ((uvn->u_flags & UVM_VNODE_BLOCKED) != 0)
 1511                         continue;
 1512 
 1513                 /*
 1514                  * gain reference.   watch out for persisting uvns (need to
 1515                  * regain vnode REF).
 1516                  */
 1517                 if (uvn->u_obj.uo_refs == 0)
 1518                         vref(vp);
 1519                 uvn->u_obj.uo_refs++;
 1520 
 1521                 SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq);
 1522         }
 1523 
 1524         /* step 3: we now have a list of uvn's that may need cleaning. */
 1525         SIMPLEQ_FOREACH(uvn, &uvn_sync_q, u_syncq) {
 1526                 rw_enter(uvn->u_obj.vmobjlock, RW_WRITE);
 1527 #ifdef DEBUG
 1528                 if (uvn->u_flags & UVM_VNODE_DYING) {
 1529                         printf("uvm_vnp_sync: dying vnode on sync list\n");
 1530                 }
 1531 #endif
 1532                 uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST);
 1533 
 1534                 /*
 1535                  * if we have the only reference and we just cleaned the uvn,
 1536                  * then we can pull it out of the UVM_VNODE_WRITEABLE state
 1537                  * thus allowing us to avoid thinking about flushing it again
 1538                  * on later sync ops.
 1539                  */
 1540                 if (uvn->u_obj.uo_refs == 1 &&
 1541                     (uvn->u_flags & UVM_VNODE_WRITEABLE)) {
 1542                         LIST_REMOVE(uvn, u_wlist);
 1543                         uvn->u_flags &= ~UVM_VNODE_WRITEABLE;
 1544                 }
 1545                 rw_exit(uvn->u_obj.vmobjlock);
 1546 
 1547                 /* now drop our reference to the uvn */
 1548                 uvn_detach(&uvn->u_obj);
 1549         }
 1550 
 1551         rw_exit_write(&uvn_sync_lock);
 1552 }
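
/*
 * [editorial sketch -- not part of the original source]  How a sync(2)-
 * style caller might drive uvm_vnp_sync(): a NULL mount point flushes
 * writeable uvns on every file system, while a specific "mp" restricts
 * the flush to that mount (see the v_mount check in step 2 above).
 * "example_syncer" is an invented name.
 */
#if 0
static void
example_syncer(struct mount *mp)
{
        uvm_vnp_sync(mp);       /* or uvm_vnp_sync(NULL) to flush everything */
}
#endif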
