FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_fault.c

    1 /*      $NetBSD: uvm_fault.c,v 1.231 2022/10/26 23:27:32 riastradh Exp $        */
    2 
    3 /*
    4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   26  *
   27  * from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
   28  */
   29 
   30 /*
   31  * uvm_fault.c: fault handler
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.231 2022/10/26 23:27:32 riastradh Exp $");
   36 
   37 #include "opt_uvmhist.h"
   38 
   39 #include <sys/param.h>
   40 #include <sys/systm.h>
   41 #include <sys/atomic.h>
   42 #include <sys/kernel.h>
   43 #include <sys/mman.h>
   44 
   45 #include <uvm/uvm.h>
   46 #include <uvm/uvm_pdpolicy.h>
   47 
   48 /*
   49  *
   50  * a word on page faults:
   51  *
   52  * types of page faults we handle:
   53  *
   54  * CASE 1: upper layer faults                   CASE 2: lower layer faults
   55  *
   56  *    CASE 1A         CASE 1B                  CASE 2A        CASE 2B
   57  *    read/write1     write>1                  read/write   +-cow_write/zero
   58  *         |             |                         |        |
   59  *      +--|--+       +--|--+     +-----+       +  |  +     | +-----+
   60  * amap |  V  |       |  ---------> new |          |        | |  ^  |
   61  *      +-----+       +-----+     +-----+       +  |  +     | +--|--+
   62  *                                                 |        |    |
   63  *      +-----+       +-----+                   +--|--+     | +--|--+
   64  * uobj | d/c |       | d/c |                   |  V  |     +----+  |
   65  *      +-----+       +-----+                   +-----+       +-----+
   66  *
   67  * d/c = don't care
   68  *
   69  *   case [0]: layerless fault
   70  *      no amap or uobj is present.   this is an error.
   71  *
   72  *   case [1]: upper layer fault [anon active]
   73  *     1A: [read] or [write with anon->an_ref == 1]
   74  *              I/O takes place in upper level anon and uobj is not touched.
   75  *     1B: [write with anon->an_ref > 1]
   76  *              new anon is alloc'd and data is copied off ["COW"]
   77  *
   78  *   case [2]: lower layer fault [uobj]
   79  *     2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
   80  *              I/O takes place directly in object.
   81  *     2B: [write to copy_on_write] or [read on NULL uobj]
   82  *              data is "promoted" from uobj to a new anon.
   83  *              if uobj is null, then we zero fill.
   84  *
   85  * we follow the standard UVM locking protocol ordering:
   86  *
   87  * MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
   88  * we hold a PG_BUSY page if we unlock for I/O
   89  *
   90  *
   91  * the code is structured as follows:
   92  *
   93  *     - init the "IN" params in the ufi structure
   94  *   ReFault: (ERESTART returned to the loop in uvm_fault_internal)
   95  *     - do lookups [locks maps], check protection, handle needs_copy
   96  *     - check for case 0 fault (error)
   97  *     - establish "range" of fault
   98  *     - if we have an amap lock it and extract the anons
   99  *     - if sequential advice deactivate pages behind us
  100  *     - at the same time check pmap for unmapped areas and anon for pages
  101  *       that we could map in (and do map it if found)
  102  *     - check object for resident pages that we could map in
  103  *     - if (case 2) goto Case2
  104  *     - >>> handle case 1
  105  *           - ensure source anon is resident in RAM
  106  *           - if case 1B alloc new anon and copy from source
  107  *           - map the correct page in
  108  *   Case2:
  109  *     - >>> handle case 2
  110  *           - ensure source page is resident (if uobj)
  111  *           - if case 2B alloc new anon and copy from source (could be zero
  112  *              fill if uobj == NULL)
  113  *           - map the correct page in
  114  *     - done!
  115  *
  116  * note on paging:
  117  *   if we have to do I/O we place a PG_BUSY page in the correct object,
  118  * unlock everything, and do the I/O.   when I/O is done we must reverify
  119  * the state of the world before assuming that our data structures are
  120  * valid.   [because mappings could change while the map is unlocked]
  121  *
  122  *  alternative 1: unbusy the page in question and restart the page fault
  123  *    from the top (ReFault).   this is easy but does not take advantage
  124  *    of the information that we already have from our previous lookup,
  125  *    although it is possible that the "hints" in the vm_map will help here.
  126  *
  127  * alternative 2: the system already keeps track of a "version" number of
  128  *    a map.   [i.e. every time you write-lock a map (e.g. to change a
  129  *    mapping) you bump the version number up by one...]   so, we can save
  130  *    the version number of the map before we release the lock and start I/O.
  131  *    then when I/O is done we can relock and check the version numbers
  132  *    to see if anything changed.    this might save us some work over
  133  *    alternative 1: no need to unbusy the page, and maybe fewer compares(?).
  134  *
  135  * alternative 3: put in backpointers or a way to "hold" part of a map
  136  *    in place while I/O is in progress.   this could be complex to
  137  *    implement (especially with structures like amap that can be referenced
  138  *    by multiple map entries, and figuring out what should wait could be
  139  *    complex as well...).
  140  *
  141  * we use alternative 2.  given that we are multi-threaded now we may want
  142  * to reconsider the choice.
  143  */
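
To make the case split above concrete, the standalone sketch below applies the same rules: a fault with an active anon is an upper-layer (case 1) fault, split on write vs. anon reference count; everything else is a lower-layer (case 2) fault, split on promotion vs. direct I/O. All names here (fault_state, classify_fault, the fields) are illustrative only and do not exist in UVM.

#include <stdbool.h>
#include <stdio.h>

/* hypothetical stand-in for the state the comment above describes */
struct fault_state {
        bool has_amap;   /* upper layer (amap) present on the map entry */
        bool has_anon;   /* an anon covers the faulting address */
        bool has_uobj;   /* lower layer (uvm_object) present */
        bool is_write;   /* VM_PROT_WRITE requested */
        bool cow;        /* entry is copy-on-write */
        int  an_ref;     /* reference count of the covering anon */
};

static const char *
classify_fault(const struct fault_state *f)
{
        if (!f->has_amap && !f->has_uobj)
                return "case 0: layerless fault, error";
        if (f->has_anon) {
                /* upper layer fault */
                if (!f->is_write || f->an_ref == 1)
                        return "case 1A: I/O in the upper layer anon";
                return "case 1B: write to shared anon, COW into a new anon";
        }
        /* lower layer fault */
        if (!f->has_uobj)
                return "case 2B: zero-fill promotion into a new anon";
        if (f->is_write && f->cow)
                return "case 2B: promote data from the uobj into a new anon";
        return "case 2A: I/O directly in the object";
}

int
main(void)
{
        struct fault_state cow_write = {
                .has_amap = true, .has_anon = true, .has_uobj = true,
                .is_write = true, .cow = true, .an_ref = 2,
        };

        printf("%s\n", classify_fault(&cow_write));     /* case 1B */
        return 0;
}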
  144 
  145 /*
  146  * local data structures
  147  */
  148 
  149 struct uvm_advice {
  150         int advice;
  151         int nback;
  152         int nforw;
  153 };
  154 
  155 /*
  156  * page range array:
  157  * note: index in array must match "advice" value
  158  * XXX: borrowed numbers from freebsd.   do they work well for us?
  159  */
  160 
  161 static const struct uvm_advice uvmadvice[] = {
  162         { UVM_ADV_NORMAL, 3, 4 },
  163         { UVM_ADV_RANDOM, 0, 0 },
  164         { UVM_ADV_SEQUENTIAL, 8, 7},
  165 };
  166 
  167 #define UVM_MAXRANGE 16 /* must be MAX() of nback+nforw+1 */
  168 
  169 /*
  170  * private prototypes
  171  */
  172 
  173 /*
  174  * inline functions
  175  */
  176 
  177 /*
  178  * uvmfault_anonflush: try and deactivate pages in specified anons
  179  *
  180  * => does not have to deactivate page if it is busy
  181  */
  182 
  183 static inline void
  184 uvmfault_anonflush(struct vm_anon **anons, int n)
  185 {
  186         int lcv;
  187         struct vm_page *pg;
  188 
  189         for (lcv = 0; lcv < n; lcv++) {
  190                 if (anons[lcv] == NULL)
  191                         continue;
  192                 KASSERT(rw_lock_held(anons[lcv]->an_lock));
  193                 pg = anons[lcv]->an_page;
  194                 if (pg && (pg->flags & PG_BUSY) == 0) {
  195                         uvm_pagelock(pg);
  196                         uvm_pagedeactivate(pg);
  197                         uvm_pageunlock(pg);
  198                 }
  199         }
  200 }
  201 
  202 /*
  203  * normal functions
  204  */
  205 
  206 /*
  207  * uvmfault_amapcopy: clear "needs_copy" in a map.
  208  *
  209  * => called with VM data structures unlocked (usually, see below)
  210  * => we get a write lock on the maps and clear needs_copy for a VA
  211  * => if we are out of RAM we sleep (waiting for more)
  212  */
  213 
  214 static void
  215 uvmfault_amapcopy(struct uvm_faultinfo *ufi)
  216 {
  217         for (;;) {
  218 
  219                 /*
  220                  * no mapping?  give up.
  221                  */
  222 
  223                 if (uvmfault_lookup(ufi, true) == false)
  224                         return;
  225 
  226                 /*
  227                  * copy if needed.
  228                  */
  229 
  230                 if (UVM_ET_ISNEEDSCOPY(ufi->entry))
  231                         amap_copy(ufi->map, ufi->entry, AMAP_COPY_NOWAIT,
  232                                 ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
  233 
  234                 /*
  235                  * didn't work?  must be out of RAM.   unlock and sleep.
  236                  */
  237 
  238                 if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
  239                         uvmfault_unlockmaps(ufi, true);
  240                         uvm_wait("fltamapcopy");
  241                         continue;
  242                 }
  243 
  244                 /*
  245                  * got it!   unlock and return.
  246                  */
  247 
  248                 uvmfault_unlockmaps(ufi, true);
  249                 return;
  250         }
  251         /*NOTREACHED*/
  252 }
  253 
  254 /*
  255  * uvmfault_anonget: get data in an anon into a non-busy, non-released
  256  * page in that anon.
  257  *
  258  * => Map, amap and thus anon should be locked by caller.
  259  * => If we fail, we unlock everything and error is returned.
  260  * => If we are successful, return with everything still locked.
  261  * => We do not move the page on the queues [gets moved later].  If we
  262  *    allocate a new page [we_own], it gets put on the queues.  Either way,
  263  *    the result is that the page is on the queues at return time
  264  * => For pages which are on loan from a uvm_object (and thus are not owned
  265  *    by the anon): if successful, return with the owning object locked.
  266  *    The caller must unlock this object when it unlocks everything else.
  267  */
  268 
  269 int
  270 uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
  271     struct vm_anon *anon)
  272 {
  273         struct vm_page *pg;
  274         krw_t lock_type;
  275         int error;
  276 
  277         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
  278         KASSERT(rw_lock_held(anon->an_lock));
  279         KASSERT(anon->an_lock == amap->am_lock);
  280 
  281         /* Increment the counters.*/
  282         cpu_count(CPU_COUNT_FLTANGET, 1);
  283         if (anon->an_page) {
  284                 curlwp->l_ru.ru_minflt++;
  285         } else {
  286                 curlwp->l_ru.ru_majflt++;
  287         }
  288         error = 0;
  289 
  290         /*
  291          * Loop until we get the anon data, or fail.
  292          */
  293 
  294         for (;;) {
  295                 bool we_own, locked;
  296                 /*
  297                  * Note: 'we_own' will become true if we set PG_BUSY on a page.
  298                  */
  299                 we_own = false;
  300                 pg = anon->an_page;
  301 
  302                 /*
  303                  * If there is a resident page and it is loaned, then anon
  304                  * may not own it.  Call out to uvm_anon_lockloanpg() to
  305                  * identify and lock the real owner of the page.
  306                  */
  307 
  308                 if (pg && pg->loan_count)
  309                         pg = uvm_anon_lockloanpg(anon);
  310 
  311                 /*
  312                  * Is page resident?  Make sure it is not busy/released.
  313                  */
  314 
  315                 lock_type = rw_lock_op(anon->an_lock);
  316                 if (pg) {
  317 
  318                         /*
  319                          * at this point, if the page has a uobject [meaning
  320                          * we have it on loan], then that uobject is locked
  321                          * by us!   if the page is busy, we drop all the
  322                          * locks (including uobject) and try again.
  323                          */
  324 
  325                         if ((pg->flags & PG_BUSY) == 0) {
  326                                 UVMHIST_LOG(maphist, "<- OK",0,0,0,0);
  327                                 return 0;
  328                         }
  329                         cpu_count(CPU_COUNT_FLTPGWAIT, 1);
  330 
  331                         /*
  332                          * The last unlock must be an atomic unlock and wait
  333                          * on the owner of page.
  334                          */
  335 
  336                         if (pg->uobject) {
  337                                 /* Owner of page is UVM object. */
  338                                 uvmfault_unlockall(ufi, amap, NULL);
  339                                 UVMHIST_LOG(maphist, " unlock+wait on uobj",0,
  340                                     0,0,0);
  341                                 uvm_pagewait(pg, pg->uobject->vmobjlock, "anonget1");
  342                         } else {
  343                                 /* Owner of page is anon. */
  344                                 uvmfault_unlockall(ufi, NULL, NULL);
  345                                 UVMHIST_LOG(maphist, " unlock+wait on anon",0,
  346                                     0,0,0);
  347                                 uvm_pagewait(pg, anon->an_lock, "anonget2");
  348                         }
  349                 } else {
  350 #if defined(VMSWAP)
  351                         /*
  352                          * No page, therefore allocate one.  A write lock is
  353                          * required for this.  If the caller didn't supply
  354                          * one, fail now and have them retry.
  355                          */
  356 
  357                         if (lock_type == RW_READER) {
  358                                 return ENOLCK;
  359                         }
  360                         pg = uvm_pagealloc(NULL,
  361                             ufi != NULL ? ufi->orig_rvaddr : 0,
  362                             anon, ufi != NULL ? UVM_FLAG_COLORMATCH : 0);
  363                         if (pg == NULL) {
  364                                 /* Out of memory.  Wait a little. */
  365                                 uvmfault_unlockall(ufi, amap, NULL);
  366                                 cpu_count(CPU_COUNT_FLTNORAM, 1);
  367                                 UVMHIST_LOG(maphist, "  noram -- UVM_WAIT",0,
  368                                     0,0,0);
  369                                 if (!uvm_reclaimable()) {
  370                                         return ENOMEM;
  371                                 }
  372                                 uvm_wait("flt_noram1");
  373                         } else {
  374                                 /* PG_BUSY bit is set. */
  375                                 we_own = true;
  376                                 uvmfault_unlockall(ufi, amap, NULL);
  377 
  378                                 /*
  379                                  * Pass a PG_BUSY+PG_FAKE clean page into
  380                                  * the uvm_swap_get() function with all data
  381                                  * structures unlocked.  Note that it is OK
  382                                  * to read an_swslot here, because we hold
  383                                  * PG_BUSY on the page.
  384                                  */
  385                                 cpu_count(CPU_COUNT_PAGEINS, 1);
  386                                 error = uvm_swap_get(pg, anon->an_swslot,
  387                                     PGO_SYNCIO);
  388 
  389                                 /*
  390                                  * We clean up after the I/O below in the
  391                                  * 'we_own' case.
  392                                  */
  393                         }
  394 #else
  395                         panic("%s: no page", __func__);
  396 #endif /* defined(VMSWAP) */
  397                 }
  398 
  399                 /*
  400                  * Re-lock the map and anon.
  401                  */
  402 
  403                 locked = uvmfault_relock(ufi);
  404                 if (locked || we_own) {
  405                         rw_enter(anon->an_lock, lock_type);
  406                 }
  407 
  408                 /*
  409                  * If we own the page (i.e. we set PG_BUSY), then we need
  410                  * to clean up after the I/O.  There are three cases to
  411                  * consider:
  412                  *
  413                  * 1) Page was released during I/O: free anon and ReFault.
  414                  * 2) I/O not OK.  Free the page and cause the fault to fail.
  415                  * 3) I/O OK!  Activate the page and sync with the non-we_own
  416                  *    case (i.e. drop anon lock if not locked).
  417                  */
  418 
  419                 if (we_own) {
  420                         KASSERT(lock_type == RW_WRITER);
  421 #if defined(VMSWAP)
  422                         if (error) {
  423 
  424                                 /*
  425                                  * Remove the swap slot from the anon and
  426                                  * mark the anon as having no real slot.
  427                                  * Do not free the swap slot, thus preventing
  428                                  * it from being used again.
  429                                  */
  430 
  431                                 if (anon->an_swslot > 0) {
  432                                         uvm_swap_markbad(anon->an_swslot, 1);
  433                                 }
  434                                 anon->an_swslot = SWSLOT_BAD;
  435 
  436                                 if ((pg->flags & PG_RELEASED) != 0) {
  437                                         goto released;
  438                                 }
  439 
  440                                 /*
  441                                  * Note: page was never !PG_BUSY, so it
  442                                  * cannot be mapped and thus no need to
  443                                  * pmap_page_protect() it.
  444                                  */
  445 
  446                                 uvm_pagefree(pg);
  447 
  448                                 if (locked) {
  449                                         uvmfault_unlockall(ufi, NULL, NULL);
  450                                 }
  451                                 rw_exit(anon->an_lock);
  452                                 UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0);
  453                                 return error;
  454                         }
  455 
  456                         if ((pg->flags & PG_RELEASED) != 0) {
  457 released:
  458                                 KASSERT(anon->an_ref == 0);
  459 
  460                                 /*
  461                                  * Released while we had unlocked amap.
  462                                  */
  463 
  464                                 if (locked) {
  465                                         uvmfault_unlockall(ufi, NULL, NULL);
  466                                 }
  467                                 uvm_anon_release(anon);
  468 
  469                                 if (error) {
  470                                         UVMHIST_LOG(maphist,
  471                                             "<- ERROR/RELEASED", 0,0,0,0);
  472                                         return error;
  473                                 }
  474 
  475                                 UVMHIST_LOG(maphist, "<- RELEASED", 0,0,0,0);
  476                                 return ERESTART;
  477                         }
  478 
  479                         /*
  480                          * We have successfully read the page, activate it.
  481                          */
  482 
  483                         uvm_pagelock(pg);
  484                         uvm_pageactivate(pg);
  485                         uvm_pagewakeup(pg);
  486                         uvm_pageunlock(pg);
  487                         pg->flags &= ~(PG_BUSY|PG_FAKE);
  488                         uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
  489                         UVM_PAGE_OWN(pg, NULL);
  490 #else
  491                         panic("%s: we_own", __func__);
  492 #endif /* defined(VMSWAP) */
  493                 }
  494 
  495                 /*
  496                  * We were not able to re-lock the map - restart the fault.
  497                  */
  498 
  499                 if (!locked) {
  500                         if (we_own) {
  501                                 rw_exit(anon->an_lock);
  502                         }
  503                         UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
  504                         return ERESTART;
  505                 }
  506 
  507                 /*
  508                  * Verify that no one has touched the amap and moved
  509                  * the anon on us.
  510                  */
  511 
  512                 if (ufi != NULL && amap_lookup(&ufi->entry->aref,
  513                     ufi->orig_rvaddr - ufi->entry->start) != anon) {
  514 
  515                         uvmfault_unlockall(ufi, amap, NULL);
  516                         UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
  517                         return ERESTART;
  518                 }
  519 
  520                 /*
  521                  * Retry..
  522                  */
  523 
  524                 cpu_count(CPU_COUNT_FLTANRETRY, 1);
  525                 continue;
  526         }
  527         /*NOTREACHED*/
  528 }
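
uvmfault_anonget() drops every lock before sleeping or starting swap I/O, and once it relocks it re-verifies its earlier state (uvmfault_relock() plus the amap_lookup() comparison) before trusting it, returning ERESTART otherwise. The toy below, with entirely hypothetical names, shows the same "record state, unlock, do slow work, relock, revalidate or restart" shape using the version-number idea from the header comment (alternative 2).

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* hypothetical map-like object: a lock plus a modification counter */
struct toy_map {
        pthread_mutex_t lock;
        unsigned version;               /* bumped by every writer */
};

static void
slow_pagein(void)
{
        /* stands in for the uvm_swap_get()/uvm_pagewait() sleep above */
}

/* return true if the cached lookup is still usable, false => refault */
static bool
fault_with_io(struct toy_map *m)
{
        unsigned saved;
        bool still_valid;

        pthread_mutex_lock(&m->lock);
        saved = m->version;             /* remember the world we looked at */
        pthread_mutex_unlock(&m->lock);

        slow_pagein();                  /* everything unlocked while we sleep */

        pthread_mutex_lock(&m->lock);
        still_valid = (m->version == saved);
        pthread_mutex_unlock(&m->lock);
        return still_valid;
}

int
main(void)
{
        struct toy_map m = { PTHREAD_MUTEX_INITIALIZER, 0 };

        printf("%s\n", fault_with_io(&m) ? "resolved" : "refault");
        return 0;
}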
  529 
  530 /*
  531  * uvmfault_promote: promote data to a new anon.  used for 1B and 2B.
  532  *
  533  *      1. allocate an anon and a page.
  534  *      2. fill its contents.
  535  *      3. put it into amap.
  536  *
  537  * => if we fail (result != 0) we unlock everything.
  538  * => on success, return a new locked anon via 'nanon'.
  539  *    (*nanon)->an_page will be a resident, locked, dirty page.
  540  * => it's caller's responsibility to put the promoted nanon->an_page to the
  541  *    page queue.
  542  */
  543 
  544 static int
  545 uvmfault_promote(struct uvm_faultinfo *ufi,
  546     struct vm_anon *oanon,
  547     struct vm_page *uobjpage,
  548     struct vm_anon **nanon, /* OUT: allocated anon */
  549     struct vm_anon **spare)
  550 {
  551         struct vm_amap *amap = ufi->entry->aref.ar_amap;
  552         struct uvm_object *uobj;
  553         struct vm_anon *anon;
  554         struct vm_page *pg;
  555         struct vm_page *opg;
  556         int error;
  557         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
  558 
  559         if (oanon) {
  560                 /* anon COW */
  561                 opg = oanon->an_page;
  562                 KASSERT(opg != NULL);
  563                 KASSERT(opg->uobject == NULL || opg->loan_count > 0);
  564         } else if (uobjpage != PGO_DONTCARE) {
  565                 /* object-backed COW */
  566                 opg = uobjpage;
  567                 KASSERT(rw_lock_held(opg->uobject->vmobjlock));
  568         } else {
  569                 /* ZFOD */
  570                 opg = NULL;
  571         }
  572         if (opg != NULL) {
  573                 uobj = opg->uobject;
  574         } else {
  575                 uobj = NULL;
  576         }
  577 
  578         KASSERT(amap != NULL);
  579         KASSERT(uobjpage != NULL);
  580         KASSERT(rw_write_held(amap->am_lock));
  581         KASSERT(oanon == NULL || amap->am_lock == oanon->an_lock);
  582         KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
  583 
  584         if (*spare != NULL) {
  585                 anon = *spare;
  586                 *spare = NULL;
  587         } else {
  588                 anon = uvm_analloc();
  589         }
  590         if (anon) {
  591 
  592                 /*
  593                  * The new anon is locked.
  594                  *
  595                  * if opg == NULL, we want a zero'd, dirty page,
  596                  * so have uvm_pagealloc() do that for us.
  597                  */
  598 
  599                 KASSERT(anon->an_lock == NULL);
  600                 anon->an_lock = amap->am_lock;
  601                 pg = uvm_pagealloc(NULL, ufi->orig_rvaddr, anon,
  602                     UVM_FLAG_COLORMATCH | (opg == NULL ? UVM_PGA_ZERO : 0));
  603                 if (pg == NULL) {
  604                         anon->an_lock = NULL;
  605                 }
  606         } else {
  607                 pg = NULL;
  608         }
  609 
  610         /*
  611          * out of memory resources?
  612          */
  613 
  614         if (pg == NULL) {
  615                 /* save anon for the next try. */
  616                 if (anon != NULL) {
  617                         *spare = anon;
  618                 }
  619 
  620                 /* unlock and fail ... */
  621                 uvmfault_unlockall(ufi, amap, uobj);
  622                 if (!uvm_reclaimable()) {
  623                         UVMHIST_LOG(maphist, "out of VM", 0,0,0,0);
  624                         cpu_count(CPU_COUNT_FLTNOANON, 1);
  625                         error = ENOMEM;
  626                         goto done;
  627                 }
  628 
  629                 UVMHIST_LOG(maphist, "out of RAM, waiting for more", 0,0,0,0);
  630                 cpu_count(CPU_COUNT_FLTNORAM, 1);
  631                 uvm_wait("flt_noram5");
  632                 error = ERESTART;
  633                 goto done;
  634         }
  635 
  636         /* copy page [pg now dirty] */
  637         if (opg) {
  638                 uvm_pagecopy(opg, pg);
  639         }
  640         KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);
  641 
  642         amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon,
  643             oanon != NULL);
  644 
  645         /*
  646          * from this point on am_lock won't be dropped until the page is
  647          * entered, so it's safe to unbusy the page up front.
  648          *
  649          * uvm_fault_{upper,lower}_done will activate or enqueue the page.
  650          */
  651 
  652         pg = anon->an_page;
  653         pg->flags &= ~(PG_BUSY|PG_FAKE);
  654         UVM_PAGE_OWN(pg, NULL);
  655 
  656         *nanon = anon;
  657         error = 0;
  658 done:
  659         return error;
  660 }
  661 
  662 /*
  663  * Update statistics after fault resolution.
  664  * - maxrss
  665  */
  666 void
  667 uvmfault_update_stats(struct uvm_faultinfo *ufi)
  668 {
  669         struct vm_map           *map;
  670         struct vmspace          *vm;
  671         struct proc             *p;
  672         vsize_t                  res;
  673 
  674         map = ufi->orig_map;
  675 
  676         p = curproc;
  677         KASSERT(p != NULL);
  678         vm = p->p_vmspace;
  679 
  680         if (&vm->vm_map != map)
  681                 return;
  682 
  683         res = pmap_resident_count(map->pmap);
  684         if (vm->vm_rssmax < res)
  685                 vm->vm_rssmax = res;
  686 }
  687 
  688 /*
  689  *   F A U L T   -   m a i n   e n t r y   p o i n t
  690  */
  691 
  692 /*
  693  * uvm_fault: page fault handler
  694  *
  695  * => called from MD code to resolve a page fault
  696  * => VM data structures usually should be unlocked.   however, it is
  697  *      possible to call here with the main map locked if the caller
  698  *      gets a write lock, sets it recursive, and then calls us (c.f.
  699  *      uvm_map_pageable).   this should be avoided because it keeps
  700  *      the map locked off during I/O.
  701  * => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
  702  */
  703 
  704 #define MASK(entry)     (UVM_ET_ISCOPYONWRITE(entry) ? \
  705                          ~VM_PROT_WRITE : VM_PROT_ALL)
  706 
  707 /* fault_flag values passed from uvm_fault_wire to uvm_fault_internal */
  708 #define UVM_FAULT_WIRE          (1 << 0)
  709 #define UVM_FAULT_MAXPROT       (1 << 1)
  710 
  711 struct uvm_faultctx {
  712 
  713         /*
  714          * the following members are set up by uvm_fault_check() and
  715          * read-only after that.
  716          *
  717          * note that narrow is used by uvm_fault_check() to change
  718          * the behaviour after ERESTART.
  719          *
  720          * most of them might change after RESTART if the underlying
  721          * map entry has been changed behind us.  an exception is
  722          * wire_paging, which never changes.
  723          */
  724         vm_prot_t access_type;
  725         vaddr_t startva;
  726         int npages;
  727         int centeridx;
  728         bool narrow;            /* work on a single requested page only */
  729         bool wire_mapping;      /* request a PMAP_WIRED mapping
  730                                    (UVM_FAULT_WIRE or VM_MAPENT_ISWIRED) */
  731         bool wire_paging;       /* request uvm_pagewire
  732                                    (true for UVM_FAULT_WIRE) */
  733         bool cow_now;           /* VM_PROT_WRITE is actually requested
  734                                    (ie. should break COW and page loaning) */
  735 
  736         /*
  737          * enter_prot is set up by uvm_fault_check() and clamped
  738          * (ie. drop the VM_PROT_WRITE bit) in various places in case
  739          * of !cow_now.
  740          */
  741         vm_prot_t enter_prot;   /* prot at which we want to enter pages in */
  742 
  743         /*
  744          * the following member is for uvmfault_promote() and ERESTART.
  745          */
  746         struct vm_anon *anon_spare;
  747 
  748         /*
  749          * the following is actually a uvm_fault_lower() internal.
  750          * it's here merely for debugging.
  751          * (or due to the mechanical separation of the function?)
  752          */
  753         bool promote;
  754 
  755         /*
  756          * type of lock to acquire on objects in both layers.
  757          */
  758         krw_t lower_lock_type;
  759         krw_t upper_lock_type;
  760 };
  761 
  762 static inline int       uvm_fault_check(
  763                             struct uvm_faultinfo *, struct uvm_faultctx *,
  764                             struct vm_anon ***, bool);
  765 
  766 static int              uvm_fault_upper(
  767                             struct uvm_faultinfo *, struct uvm_faultctx *,
  768                             struct vm_anon **);
  769 static inline int       uvm_fault_upper_lookup(
  770                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  771                             struct vm_anon **, struct vm_page **);
  772 static inline void      uvm_fault_upper_neighbor(
  773                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  774                             vaddr_t, struct vm_page *, bool);
  775 static inline int       uvm_fault_upper_loan(
  776                             struct uvm_faultinfo *, struct uvm_faultctx *,
  777                             struct vm_anon *, struct uvm_object **);
  778 static inline int       uvm_fault_upper_promote(
  779                             struct uvm_faultinfo *, struct uvm_faultctx *,
  780                             struct uvm_object *, struct vm_anon *);
  781 static inline int       uvm_fault_upper_direct(
  782                             struct uvm_faultinfo *, struct uvm_faultctx *,
  783                             struct uvm_object *, struct vm_anon *);
  784 static int              uvm_fault_upper_enter(
  785                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  786                             struct uvm_object *, struct vm_anon *,
  787                             struct vm_page *, struct vm_anon *);
  788 static inline void      uvm_fault_upper_done(
  789                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  790                             struct vm_anon *, struct vm_page *);
  791 
  792 static int              uvm_fault_lower(
  793                             struct uvm_faultinfo *, struct uvm_faultctx *,
  794                             struct vm_page **);
  795 static inline void      uvm_fault_lower_lookup(
  796                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  797                             struct vm_page **);
  798 static inline void      uvm_fault_lower_neighbor(
  799                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  800                             vaddr_t, struct vm_page *);
  801 static inline int       uvm_fault_lower_io(
  802                             struct uvm_faultinfo *, struct uvm_faultctx *,
  803                             struct uvm_object **, struct vm_page **);
  804 static inline int       uvm_fault_lower_direct(
  805                             struct uvm_faultinfo *, struct uvm_faultctx *,
  806                             struct uvm_object *, struct vm_page *);
  807 static inline int       uvm_fault_lower_direct_loan(
  808                             struct uvm_faultinfo *, struct uvm_faultctx *,
  809                             struct uvm_object *, struct vm_page **,
  810                             struct vm_page **);
  811 static inline int       uvm_fault_lower_promote(
  812                             struct uvm_faultinfo *, struct uvm_faultctx *,
  813                             struct uvm_object *, struct vm_page *);
  814 static int              uvm_fault_lower_enter(
  815                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  816                             struct uvm_object *,
  817                             struct vm_anon *, struct vm_page *);
  818 static inline void      uvm_fault_lower_done(
  819                             struct uvm_faultinfo *, const struct uvm_faultctx *,
  820                             struct uvm_object *, struct vm_page *);
  821 
  822 int
  823 uvm_fault_internal(struct vm_map *orig_map, vaddr_t vaddr,
  824     vm_prot_t access_type, int fault_flag)
  825 {
  826         struct uvm_faultinfo ufi;
  827         struct uvm_faultctx flt = {
  828                 .access_type = access_type,
  829 
  830                 /* don't look for neighborhood pages on "wire" fault */
  831                 .narrow = (fault_flag & UVM_FAULT_WIRE) != 0,
  832 
  833                 /* "wire" fault causes wiring of both mapping and paging */
  834                 .wire_mapping = (fault_flag & UVM_FAULT_WIRE) != 0,
  835                 .wire_paging = (fault_flag & UVM_FAULT_WIRE) != 0,
  836 
  837                 /*
  838                  * default lock type to acquire on upper & lower layer
  839                  * objects: reader.  this can be upgraded at any point
  840                  * during the fault from read -> write and uvm_faultctx
  841                  * changed to match, but is never downgraded write -> read.
  842                  */
  843 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
  844                 .upper_lock_type = RW_WRITER,
  845                 .lower_lock_type = RW_WRITER,
  846 #else
  847                 .upper_lock_type = RW_READER,
  848                 .lower_lock_type = RW_READER,
  849 #endif
  850         };
  851         const bool maxprot = (fault_flag & UVM_FAULT_MAXPROT) != 0;
  852         struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
  853         struct vm_page *pages_store[UVM_MAXRANGE], **pages;
  854         int error;
  855 
  856         UVMHIST_FUNC(__func__);
  857         UVMHIST_CALLARGS(maphist, "(map=%#jx, vaddr=%#jx, at=%jd, ff=%jd)",
  858               (uintptr_t)orig_map, vaddr, access_type, fault_flag);
  859 
  860         /* Don't count anything until user interaction is possible */
  861         kpreempt_disable();
  862         if (__predict_true(start_init_exec)) {
  863                 struct cpu_info *ci = curcpu();
  864                 CPU_COUNT(CPU_COUNT_NFAULT, 1);
  865                 /* Don't flood RNG subsystem with samples. */
  866                 if (++(ci->ci_faultrng) == 503) {
  867                         ci->ci_faultrng = 0;
  868                         rnd_add_uint32(&curcpu()->ci_data.cpu_uvm->rs,
  869                             sizeof(vaddr_t) == sizeof(uint32_t) ?
  870                             (uint32_t)vaddr : sizeof(vaddr_t) ==
  871                             sizeof(uint64_t) ?
  872                             (uint32_t)vaddr :
  873                             (uint32_t)ci->ci_counts[CPU_COUNT_NFAULT]);
  874                 }
  875         }
  876         kpreempt_enable();
  877 
  878         /*
  879          * init the IN parameters in the ufi
  880          */
  881 
  882         ufi.orig_map = orig_map;
  883         ufi.orig_rvaddr = trunc_page(vaddr);
  884         ufi.orig_size = PAGE_SIZE;      /* can't get any smaller than this */
  885 
  886         error = ERESTART;
  887         while (error == ERESTART) { /* ReFault: */
  888                 anons = anons_store;
  889                 pages = pages_store;
  890 
  891                 error = uvm_fault_check(&ufi, &flt, &anons, maxprot);
  892                 if (error != 0)
  893                         continue;
  894 
  895                 error = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
  896                 if (error != 0)
  897                         continue;
  898 
  899                 if (pages[flt.centeridx] == PGO_DONTCARE)
  900                         error = uvm_fault_upper(&ufi, &flt, anons);
  901                 else {
  902                         struct uvm_object * const uobj =
  903                             ufi.entry->object.uvm_obj;
  904 
  905                         if (uobj && uobj->pgops->pgo_fault != NULL) {
  906                                 /*
  907                                  * invoke "special" fault routine.
  908                                  */
  909                                 rw_enter(uobj->vmobjlock, RW_WRITER);
  910                                 /* locked: maps(read), amap(if there), uobj */
  911                                 error = uobj->pgops->pgo_fault(&ufi,
  912                                     flt.startva, pages, flt.npages,
  913                                     flt.centeridx, flt.access_type,
  914                                     PGO_LOCKED|PGO_SYNCIO);
  915 
  916                                 /*
  917                                  * locked: nothing, pgo_fault has unlocked
  918                                  * everything
  919                                  */
  920 
  921                                 /*
  922                                  * object fault routine responsible for
  923                                  * pmap_update().
  924                                  */
  925 
  926                                 /*
  927                                  * Wake up the pagedaemon if the fault method
  928                                  * failed for lack of memory but some can be
  929                                  * reclaimed.
  930                                  */
  931                                 if (error == ENOMEM && uvm_reclaimable()) {
  932                                         uvm_wait("pgo_fault");
  933                                         error = ERESTART;
  934                                 }
  935                         } else {
  936                                 error = uvm_fault_lower(&ufi, &flt, pages);
  937                         }
  938                 }
  939         }
  940 
  941         if (flt.anon_spare != NULL) {
  942                 flt.anon_spare->an_ref--;
  943                 KASSERT(flt.anon_spare->an_ref == 0);
  944                 KASSERT(flt.anon_spare->an_lock == NULL);
  945                 uvm_anfree(flt.anon_spare);
  946         }
  947         return error;
  948 }
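
The while (error == ERESTART) loop above is the "ReFault:" label from the header comment turned into a driver loop: every stage either resolves the fault, fails it, or backs all the way out and asks for another pass. A minimal standalone analogue (hypothetical names; ERESTART is defined locally because not every userland errno.h provides it):

#include <errno.h>
#include <stdio.h>

#ifndef ERESTART
#define ERESTART 85             /* local stand-in, value arbitrary here */
#endif

static int passes;

/* one attempt at "the fault": pretend the first pass loses a race */
static int
fault_attempt(void)
{
        if (passes++ == 0)
                return ERESTART;        /* e.g. a failed lock upgrade */
        return 0;                       /* resolved */
}

int
main(void)
{
        int error = ERESTART;

        while (error == ERESTART)       /* ReFault: */
                error = fault_attempt();
        printf("resolved after %d pass(es), error=%d\n", passes, error);
        return error;
}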
  949 
  950 /*
  951  * uvm_fault_check: check prot, handle needs-copy, etc.
  952  *
  953  *      1. lookup entry.
  954  *      2. check protection.
  955  *      3. adjust fault condition (mainly for simulated fault).
  956  *      4. handle needs-copy (lazy amap copy).
  957  *      5. establish range of interest for neighbor fault (aka pre-fault).
  958  *      6. look up anons (if amap exists).
  959  *      7. flush pages (if MADV_SEQUENTIAL)
  960  *
  961  * => called with nothing locked.
  962  * => if we fail (result != 0) we unlock everything.
  963  * => initialize/adjust many members of flt.
  964  */
  965 
  966 static int
  967 uvm_fault_check(
  968         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
  969         struct vm_anon ***ranons, bool maxprot)
  970 {
  971         struct vm_amap *amap;
  972         struct uvm_object *uobj;
  973         vm_prot_t check_prot;
  974         int nback, nforw;
  975         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
  976 
  977         /*
  978          * lookup and lock the maps
  979          */
  980 
  981         if (uvmfault_lookup(ufi, false) == false) {
  982                 UVMHIST_LOG(maphist, "<- no mapping @ %#jx", ufi->orig_rvaddr,
  983                     0,0,0);
  984                 return EFAULT;
  985         }
  986         /* locked: maps(read) */
  987 
  988 #ifdef DIAGNOSTIC
  989         if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0) {
  990                 printf("Page fault on non-pageable map:\n");
  991                 printf("ufi->map = %p\n", ufi->map);
  992                 printf("ufi->orig_map = %p\n", ufi->orig_map);
  993                 printf("ufi->orig_rvaddr = %#lx\n", (u_long) ufi->orig_rvaddr);
  994                 panic("uvm_fault: (ufi->map->flags & VM_MAP_PAGEABLE) == 0");
  995         }
  996 #endif
  997 
  998         /*
  999          * check protection
 1000          */
 1001 
 1002         check_prot = maxprot ?
 1003             ufi->entry->max_protection : ufi->entry->protection;
 1004         if ((check_prot & flt->access_type) != flt->access_type) {
 1005                 UVMHIST_LOG(maphist,
 1006                     "<- protection failure (prot=%#jx, access=%#jx)",
 1007                     ufi->entry->protection, flt->access_type, 0, 0);
 1008                 uvmfault_unlockmaps(ufi, false);
 1009                 return EFAULT;
 1010         }
 1011 
 1012         /*
 1013          * "enter_prot" is the protection we want to enter the page in at.
 1014          * for certain pages (e.g. copy-on-write pages) this protection can
 1015          * be more strict than ufi->entry->protection.  "wired" means either
 1016          * the entry is wired or we are fault-wiring the pg.
 1017          */
 1018 
 1019         flt->enter_prot = ufi->entry->protection;
 1020         if (VM_MAPENT_ISWIRED(ufi->entry)) {
 1021                 flt->wire_mapping = true;
 1022                 flt->wire_paging = true;
 1023                 flt->narrow = true;
 1024         }
 1025 
 1026         if (flt->wire_mapping) {
 1027                 flt->access_type = flt->enter_prot; /* full access for wired */
 1028                 flt->cow_now = (check_prot & VM_PROT_WRITE) != 0;
 1029         } else {
 1030                 flt->cow_now = (flt->access_type & VM_PROT_WRITE) != 0;
 1031         }
 1032 
 1033         if (flt->wire_paging) {
 1034                 /* wiring pages requires a write lock. */
 1035                 flt->upper_lock_type = RW_WRITER;
 1036                 flt->lower_lock_type = RW_WRITER;
 1037         }
 1038 
 1039         flt->promote = false;
 1040 
 1041         /*
 1042          * handle "needs_copy" case.   if we need to copy the amap we will
 1043          * have to drop our readlock and relock it with a write lock.  (we
 1044          * need a write lock to change anything in a map entry [e.g.
 1045          * needs_copy]).
 1046          */
 1047 
 1048         if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
 1049                 if (flt->cow_now || (ufi->entry->object.uvm_obj == NULL)) {
 1050                         KASSERT(!maxprot);
 1051                         /* need to clear */
 1052                         UVMHIST_LOG(maphist,
 1053                             "  need to clear needs_copy and refault",0,0,0,0);
 1054                         uvmfault_unlockmaps(ufi, false);
 1055                         uvmfault_amapcopy(ufi);
 1056                         cpu_count(CPU_COUNT_FLTAMCOPY, 1);
 1057                         return ERESTART;
 1058 
 1059                 } else {
 1060 
 1061                         /*
 1062                          * ensure that we pmap_enter page R/O since
 1063                          * needs_copy is still true
 1064                          */
 1065 
 1066                         flt->enter_prot &= ~VM_PROT_WRITE;
 1067                 }
 1068         }
 1069 
 1070         /*
 1071          * identify the players
 1072          */
 1073 
 1074         amap = ufi->entry->aref.ar_amap;        /* upper layer */
 1075         uobj = ufi->entry->object.uvm_obj;      /* lower layer */
 1076 
 1077         /*
 1078          * check for a case 0 fault.  if nothing backing the entry then
 1079          * error now.
 1080          */
 1081 
 1082         if (amap == NULL && uobj == NULL) {
 1083                 uvmfault_unlockmaps(ufi, false);
 1084                 UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0);
 1085                 return EFAULT;
 1086         }
 1087 
 1088         /*
 1089          * for a case 2B fault waste no time on adjacent pages because
 1090          * they are likely already entered.
 1091          */
 1092 
 1093         if (uobj != NULL && amap != NULL &&
 1094             (flt->access_type & VM_PROT_WRITE) != 0) {
 1095                 /* wide fault (!narrow) */
 1096                 flt->narrow = true;
 1097         }
 1098 
 1099         /*
 1100          * establish range of interest based on advice from mapper
 1101          * and then clip to fit map entry.   note that we only want
 1102          * to do this the first time through the fault.   if we
 1103          * ReFault we will disable this by setting "narrow" to true.
 1104          */
 1105 
 1106         if (flt->narrow == false) {
 1107 
 1108                 /* wide fault (!narrow) */
 1109                 KASSERT(uvmadvice[ufi->entry->advice].advice ==
 1110                          ufi->entry->advice);
 1111                 nback = MIN(uvmadvice[ufi->entry->advice].nback,
 1112                     (ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
 1113                 flt->startva = ufi->orig_rvaddr - (nback << PAGE_SHIFT);
 1114                 /*
 1115                  * note: "-1" because we don't want to count the
 1116                  * faulting page as forw
 1117                  */
 1118                 nforw = MIN(uvmadvice[ufi->entry->advice].nforw,
 1119                             ((ufi->entry->end - ufi->orig_rvaddr) >>
 1120                              PAGE_SHIFT) - 1);
 1121                 flt->npages = nback + nforw + 1;
 1122                 flt->centeridx = nback;
 1123 
 1124                 flt->narrow = true;     /* ensure only once per-fault */
 1125 
 1126         } else {
 1127 
 1128                 /* narrow fault! */
 1129                 nback = nforw = 0;
 1130                 flt->startva = ufi->orig_rvaddr;
 1131                 flt->npages = 1;
 1132                 flt->centeridx = 0;
 1133 
 1134         }
 1135         /* offset from entry's start to pgs' start */
 1136         const voff_t eoff = flt->startva - ufi->entry->start;
 1137 
 1138         /* locked: maps(read) */
 1139         UVMHIST_LOG(maphist, "  narrow=%jd, back=%jd, forw=%jd, startva=%#jx",
 1140                     flt->narrow, nback, nforw, flt->startva);
 1141         UVMHIST_LOG(maphist, "  entry=%#jx, amap=%#jx, obj=%#jx",
 1142             (uintptr_t)ufi->entry, (uintptr_t)amap, (uintptr_t)uobj, 0);
 1143 
 1144         /*
 1145          * guess at the most suitable lock types to acquire.
 1146          * if we've got an amap then lock it and extract current anons.
 1147          */
 1148 
 1149         if (amap) {
 1150                 if ((amap_flags(amap) & AMAP_SHARED) == 0) {
 1151                         /*
 1152                          * the amap isn't shared.  get a writer lock to
 1153                          * avoid the cost of upgrading the lock later if
 1154                          * needed.
 1155                          *
 1156                          * XXX nice for PostgreSQL, but consider threads.
 1157                          */
 1158                         flt->upper_lock_type = RW_WRITER;
 1159                 } else if ((flt->access_type & VM_PROT_WRITE) != 0) {
 1160                         /*
 1161                          * assume we're about to COW.
 1162                          */
 1163                         flt->upper_lock_type = RW_WRITER;
 1164                 }
 1165                 amap_lock(amap, flt->upper_lock_type);
 1166                 amap_lookups(&ufi->entry->aref, eoff, *ranons, flt->npages);
 1167         } else {
 1168                 if ((flt->access_type & VM_PROT_WRITE) != 0) {
 1169                         /*
 1170                          * we are about to dirty the object and that
 1171                          * requires a write lock.
 1172                          */
 1173                         flt->lower_lock_type = RW_WRITER;
 1174                 }
 1175                 *ranons = NULL; /* to be safe */
 1176         }
 1177 
 1178         /* locked: maps(read), amap(if there) */
 1179         KASSERT(amap == NULL ||
 1180             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1181 
 1182         /*
 1183          * for MADV_SEQUENTIAL mappings we want to deactivate the back pages
 1184          * now and then forget about them (for the rest of the fault).
 1185          */
 1186 
 1187         if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
 1188 
 1189                 UVMHIST_LOG(maphist, "  MADV_SEQUENTIAL: flushing backpages",
 1190                     0,0,0,0);
 1191                 /* flush back-page anons? */
 1192                 if (amap)
 1193                         uvmfault_anonflush(*ranons, nback);
 1194 
 1195                 /*
 1196                  * flush object?  change lock type to RW_WRITER, to avoid
 1197                  * excessive competition between read/write locks if many
 1198                  * threads doing "sequential access".
 1199                  */
 1200                 if (uobj) {
 1201                         voff_t uoff;
 1202 
 1203                         flt->lower_lock_type = RW_WRITER;
 1204                         uoff = ufi->entry->offset + eoff;
 1205                         rw_enter(uobj->vmobjlock, RW_WRITER);
 1206                         (void) (uobj->pgops->pgo_put)(uobj, uoff, uoff +
 1207                                     (nback << PAGE_SHIFT), PGO_DEACTIVATE);
 1208                 }
 1209 
 1210                 /* now forget about the backpages */
 1211                 if (amap)
 1212                         *ranons += nback;
 1213                 flt->startva += (nback << PAGE_SHIFT);
 1214                 flt->npages -= nback;
 1215                 flt->centeridx = 0;
 1216         }
 1217         /*
 1218          * => startva is fixed
 1219          * => npages is fixed
 1220          */
 1221         KASSERT(flt->startva <= ufi->orig_rvaddr);
 1222         KASSERT(ufi->orig_rvaddr + ufi->orig_size <=
 1223             flt->startva + (flt->npages << PAGE_SHIFT));
 1224         return 0;
 1225 }
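
The neighbour-range arithmetic in uvm_fault_check() can be followed with concrete numbers. The standalone sketch below (hypothetical addresses, 4 KB pages assumed) applies the MADV_NORMAL advice values from uvmadvice[] (nback = 3, nforw = 4) to a fault that lands only two pages past the start of the map entry, so nback gets clipped to 2.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12      /* assume 4 KB pages for the example */
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        /* hypothetical entry and faulting address, two pages past the start */
        uint64_t entry_start = 0x200000, entry_end = 0x220000;
        uint64_t orig_rvaddr = 0x202000;
        int adv_nback = 3, adv_nforw = 4;       /* uvmadvice[UVM_ADV_NORMAL] */

        /* clip the advised window so it stays inside the map entry */
        int nback = MIN(adv_nback,
            (int)((orig_rvaddr - entry_start) >> PAGE_SHIFT));
        int nforw = MIN(adv_nforw,
            (int)(((entry_end - orig_rvaddr) >> PAGE_SHIFT) - 1));
        uint64_t startva = orig_rvaddr - ((uint64_t)nback << PAGE_SHIFT);
        int npages = nback + nforw + 1;
        int centeridx = nback;

        printf("startva=%#jx npages=%d centeridx=%d\n",
            (uintmax_t)startva, npages, centeridx);

        /* only 2 pages exist behind the fault, so nback is clipped to 2 */
        assert(nback == 2 && nforw == 4 && npages == 7 && centeridx == 2);
        return 0;
}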
 1226 
 1227 /*
 1228  * uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
 1229  */
 1230 
 1231 static inline int
 1232 uvm_fault_upper_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1233     struct vm_amap *amap, struct uvm_object *uobj)
 1234 {
 1235         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1236 
 1237         KASSERT(amap != NULL);
 1238         KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
 1239 
 1240         /*
 1241          * fast path.
 1242          */
 1243 
 1244         if (__predict_true(flt->upper_lock_type == RW_WRITER)) {
 1245                 return 0;
 1246         }
 1247 
 1248         /*
 1249          * otherwise try for the upgrade.  if we don't get it, unlock
 1250          * everything, restart the fault and next time around get a writer
 1251          * lock.
 1252          */
 1253 
 1254         flt->upper_lock_type = RW_WRITER;
 1255         if (__predict_false(!rw_tryupgrade(amap->am_lock))) {
 1256                 uvmfault_unlockall(ufi, amap, uobj);
 1257                 cpu_count(CPU_COUNT_FLTNOUP, 1);
 1258                 UVMHIST_LOG(maphist, "  !upgrade upper", 0, 0,0,0);
 1259                 return ERESTART;
 1260         }
 1261         cpu_count(CPU_COUNT_FLTUP, 1);
 1262         KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
 1263         return 0;
 1264 }
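
When rw_tryupgrade() fails above, the code drops every lock and returns ERESTART with upper_lock_type already set to RW_WRITER, so the next pass of the fault takes the amap lock as a writer from the start. POSIX rwlocks cannot be upgraded in place at all, so a userland sketch of the same idea falls straight back to that "unlock, redo under a write lock" path (all names below are hypothetical):

#include <pthread.h>
#include <stdbool.h>

struct toy {
        pthread_rwlock_t lock;
        int value;
};

/*
 * read-mostly lookup that occasionally needs to modify.  on the slow
 * path we drop the read lock and redo the work under a write lock --
 * the same shape as the ERESTART fallback above.
 */
static void
lookup_and_maybe_modify(struct toy *t, bool need_write)
{
        pthread_rwlock_rdlock(&t->lock);
        if (!need_write) {
                /* fast path: read only */
                pthread_rwlock_unlock(&t->lock);
                return;
        }
        pthread_rwlock_unlock(&t->lock);        /* "unlock everything" */

        pthread_rwlock_wrlock(&t->lock);        /* redo under a write lock */
        t->value++;                             /* the modification we needed */
        pthread_rwlock_unlock(&t->lock);
}

int
main(void)
{
        struct toy t = { PTHREAD_RWLOCK_INITIALIZER, 0 };

        lookup_and_maybe_modify(&t, true);
        return t.value == 1 ? 0 : 1;
}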
 1265 
 1266 /*
 1267  * uvm_fault_upper_lookup: look up existing h/w mapping and amap.
 1268  *
 1269  * iterate range of interest:
 1270  *      1. check if h/w mapping exists.  if yes, we don't care
 1271  *      2. check if anon exists.  if not, page is lower.
 1272  *      3. if anon exists, enter h/w mapping for neighbors.
 1273  *
 1274  * => called with amap locked (if exists).
 1275  */
 1276 
 1277 static int
 1278 uvm_fault_upper_lookup(
 1279         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 1280         struct vm_anon **anons, struct vm_page **pages)
 1281 {
 1282         struct vm_amap *amap = ufi->entry->aref.ar_amap;
 1283         int lcv;
 1284         vaddr_t currva;
 1285         bool shadowed __unused;
 1286         bool entered;
 1287         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1288 
 1289         /* locked: maps(read), amap(if there) */
 1290         KASSERT(amap == NULL ||
 1291             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1292 
 1293         /*
 1294          * map in the backpages and frontpages we found in the amap in hopes
 1295          * of preventing future faults.    we also init the pages[] array as
 1296          * we go.
 1297          */
 1298 
 1299         currva = flt->startva;
 1300         shadowed = false;
 1301         entered = false;
 1302         for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
 1303                 /*
 1304                  * unmapped or center page.   check if any anon at this level.
 1305                  */
 1306                 if (amap == NULL || anons[lcv] == NULL) {
 1307                         pages[lcv] = NULL;
 1308                         continue;
 1309                 }
 1310 
 1311                 /*
 1312                  * check for present page and map if possible.
 1313                  */
 1314 
 1315                 pages[lcv] = PGO_DONTCARE;
 1316                 if (lcv == flt->centeridx) {    /* save center for later! */
 1317                         shadowed = true;
 1318                         continue;
 1319                 }
 1320 
 1321                 struct vm_anon *anon = anons[lcv];
 1322                 struct vm_page *pg = anon->an_page;
 1323 
 1324                 KASSERT(anon->an_lock == amap->am_lock);
 1325 
 1326                 /*
 1327                  * ignore loaned and busy pages.
 1328                  * don't play with VAs that are already mapped.
 1329                  */
 1330 
 1331                 if (pg && pg->loan_count == 0 && (pg->flags & PG_BUSY) == 0 &&
 1332                     !pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
 1333                         uvm_fault_upper_neighbor(ufi, flt, currva,
 1334                             pg, anon->an_ref > 1);
 1335                         entered = true;
 1336                 }
 1337         }
 1338         if (entered) {
 1339                 pmap_update(ufi->orig_map->pmap);
 1340         }
 1341 
 1342         /* locked: maps(read), amap(if there) */
 1343         KASSERT(amap == NULL ||
 1344             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1345         /* (shadowed == true) if there is an anon at the faulting address */
 1346         UVMHIST_LOG(maphist, "  shadowed=%jd, will_get=%jd", shadowed,
 1347             (ufi->entry->object.uvm_obj && shadowed != false),0,0);
 1348 
 1349         return 0;
 1350 }
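
uvm_fault_upper_lookup only bothers with neighbor anons whose pages are already resident, not busy or loaned, and not yet mapped. A loose user-space analogue of the "which neighbors are already resident?" question is mincore(2); the sketch below probes the pages around a "center" page of an anonymous mapping. It demonstrates only the residency check, not the pmap_enter() of neighbors, and is written against the Linux prototype of mincore (the vector element type differs slightly between systems).

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        const long pagesz = sysconf(_SC_PAGESIZE);
        enum { NPAGES = 8 };
        unsigned char vec[NPAGES];      /* Linux mincore() vector; NetBSD uses char */

        char *base = mmap(NULL, NPAGES * pagesz, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED)
                return 1;

        /* fault in the "center" page and one forward neighbor */
        base[3 * pagesz] = 1;
        base[4 * pagesz] = 1;

        if (mincore(base, NPAGES * pagesz, vec) != 0)
                return 1;

        for (int i = 0; i < NPAGES; i++)
                printf("page %d: %s\n", i,
                    (vec[i] & 1) ? "resident" : "not resident");

        munmap(base, NPAGES * pagesz);
        return 0;
}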
 1351 
 1352 /*
 1353  * uvm_fault_upper_neighbor: enter single upper neighbor page.
 1354  *
 1355  * => called with amap and anon locked.
 1356  */
 1357 
 1358 static void
 1359 uvm_fault_upper_neighbor(
 1360         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 1361         vaddr_t currva, struct vm_page *pg, bool readonly)
 1362 {
 1363         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1364 
 1365         /* locked: amap, anon */
 1366 
 1367         KASSERT(pg->uobject == NULL);
 1368         KASSERT(pg->uanon != NULL);
 1369         KASSERT(rw_lock_op(pg->uanon->an_lock) == flt->upper_lock_type);
 1370         KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
 1371 
 1372         /*
 1373          * there wasn't a direct fault on the page, so avoid the cost of
 1374          * activating it.
 1375          */
 1376 
 1377         if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
 1378                 uvm_pagelock(pg);
 1379                 uvm_pageenqueue(pg);
 1380                 uvm_pageunlock(pg);
 1381         }
 1382 
 1383         UVMHIST_LOG(maphist,
 1384             "  MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx",
 1385             (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
 1386         cpu_count(CPU_COUNT_FLTNAMAP, 1);
 1387 
 1388         /*
 1389          * Since this page isn't the page that's actually faulting,
 1390          * ignore pmap_enter() failures; it's not critical that we
 1391          * enter these right now.
 1392          */
 1393 
 1394         (void) pmap_enter(ufi->orig_map->pmap, currva,
 1395             VM_PAGE_TO_PHYS(pg),
 1396             readonly ? (flt->enter_prot & ~VM_PROT_WRITE) :
 1397             flt->enter_prot,
 1398             PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0));
 1399 }
 1400 
 1401 /*
 1402  * uvm_fault_upper: handle upper fault.
 1403  *
 1404  *      1. acquire anon lock.
 1405  *      2. get anon.  let uvmfault_anonget do the dirty work.
 1406  *      3. handle loan.
 1407  *      4. dispatch direct or promote handlers.
 1408  */
 1409 
 1410 static int
 1411 uvm_fault_upper(
 1412         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1413         struct vm_anon **anons)
 1414 {
 1415         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 1416         struct vm_anon * const anon = anons[flt->centeridx];
 1417         struct uvm_object *uobj;
 1418         int error;
 1419         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1420 
 1421         /* locked: maps(read), amap, anon */
 1422         KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1423         KASSERT(anon->an_lock == amap->am_lock);
 1424 
 1425         /*
 1426          * handle case 1: fault on an anon in our amap
 1427          */
 1428 
 1429         UVMHIST_LOG(maphist, "  case 1 fault: anon=%#jx",
 1430             (uintptr_t)anon, 0, 0, 0);
 1431 
 1432         /*
 1433          * whether we have case 1A or case 1B, we are going to need the
 1434          * anon's memory resident.   ensure that now.
 1435          */
 1436 
 1437         /*
 1438          * let uvmfault_anonget do the dirty work.
 1439          * if it fails (!OK) it will unlock everything for us.
 1440          * if it succeeds, locks are still valid and locked.
 1441          * also, if it is OK, then the anon's page is on the queues.
 1442          * if the page is on loan from a uvm_object, then anonget will
 1443          * lock that object for us if it does not fail.
 1444          */
 1445  retry:
 1446         error = uvmfault_anonget(ufi, amap, anon);
 1447         switch (error) {
 1448         case 0:
 1449                 break;
 1450 
 1451         case ERESTART:
 1452                 return ERESTART;
 1453 
 1454         case EAGAIN:
 1455                 kpause("fltagain1", false, hz/2, NULL);
 1456                 return ERESTART;
 1457 
 1458         case ENOLCK:
 1459                 /* it needs a write lock: retry */
 1460                 error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
 1461                 if (error != 0) {
 1462                         return error;
 1463                 }
 1464                 KASSERT(rw_write_held(amap->am_lock));
 1465                 goto retry;
 1466 
 1467         default:
 1468                 return error;
 1469         }
 1470 
 1471         /*
 1472          * uobj is non null if the page is on loan from an object (i.e. uobj)
 1473          */
 1474 
 1475         uobj = anon->an_page->uobject;  /* locked by anonget if !NULL */
 1476 
 1477         /* locked: maps(read), amap, anon, uobj(if one) */
 1478         KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1479         KASSERT(anon->an_lock == amap->am_lock);
 1480         KASSERT(uobj == NULL ||
 1481             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 1482 
 1483         /*
 1484          * special handling for loaned pages
 1485          */
 1486 
 1487         if (anon->an_page->loan_count) {
 1488                 error = uvm_fault_upper_loan(ufi, flt, anon, &uobj);
 1489                 if (error != 0)
 1490                         return error;
 1491         }
 1492 
 1493         /*
 1494          * if we are case 1B then we will need to allocate a new blank
 1495          * anon to transfer the data into.   note that we have a lock
 1496          * on anon, so no one can busy or release the page until we are done.
 1497          * also note that the ref count can't drop to zero here because
 1498          * it is > 1 and we are only dropping one ref.
 1499          *
 1500          * in the (hopefully very rare) case that we are out of RAM we
 1501          * will unlock, wait for more RAM, and refault.
 1502          *
 1503          * if we are out of anon VM we kill the process (XXX: could wait?).
 1504          */
 1505 
 1506         if (flt->cow_now && anon->an_ref > 1) {
 1507                 flt->promote = true;
 1508                 error = uvm_fault_upper_promote(ufi, flt, uobj, anon);
 1509         } else {
 1510                 error = uvm_fault_upper_direct(ufi, flt, uobj, anon);
 1511         }
 1512         return error;
 1513 }
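
Case 1B handled above, a write fault on an anon whose reference count is greater than one, is what a process sees after fork(): parent and child share anonymous pages copy-on-write, and the first write by either side is promoted into a fresh private anon. The short program below observes that behaviour from user space; it shows the semantics only and does not inspect the amap/anon machinery.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
        /* anonymous, private memory is inherited copy-on-write across fork() */
        int *p = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        *p = 42;

        pid_t pid = fork();
        if (pid == 0) {
                /*
                 * write fault in the child: the shared anon page is copied
                 * into a private one, leaving the parent's data untouched.
                 */
                *p = 1000;
                printf("child  sees %d\n", *p);
                _exit(0);
        }
        waitpid(pid, NULL, 0);
        printf("parent sees %d (unchanged by the child's write)\n", *p);
        munmap(p, sizeof(int));
        return 0;
}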
 1514 
 1515 /*
 1516  * uvm_fault_upper_loan: handle loaned upper page.
 1517  *
 1518  *      1. if not cow'ing now, simply adjust flt->enter_prot.
 1519  *      2. if cow'ing now, and if ref count is 1, break loan.
 1520  */
 1521 
 1522 static int
 1523 uvm_fault_upper_loan(
 1524         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1525         struct vm_anon *anon, struct uvm_object **ruobj)
 1526 {
 1527         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 1528         int error = 0;
 1529         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1530 
 1531         if (!flt->cow_now) {
 1532 
 1533                 /*
 1534                  * for read faults on loaned pages we just cap the
 1535                  * protection at read-only.
 1536                  */
 1537 
 1538                 flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
 1539 
 1540         } else {
 1541                 /*
 1542                  * note that we can't allow writes into a loaned page!
 1543                  *
 1544                  * if we have a write fault on a loaned page in an
 1545                  * anon then we need to look at the anon's ref count.
 1546                  * if it is greater than one then we are going to do
 1547                  * a normal copy-on-write fault into a new anon (this
 1548                  * is not a problem).  however, if the reference count
 1549                  * is one (a case where we would normally allow a
 1550                  * write directly to the page) then we need to kill
 1551                  * the loan before we continue.
 1552                  */
 1553 
 1554                 /* >1 case is already ok */
 1555                 if (anon->an_ref == 1) {
 1556                         /* breaking loan requires a write lock. */
 1557                         error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
 1558                         if (error != 0) {
 1559                                 return error;
 1560                         }
 1561                         KASSERT(rw_write_held(amap->am_lock));
 1562 
 1563                         error = uvm_loanbreak_anon(anon, *ruobj);
 1564                         if (error != 0) {
 1565                                 uvmfault_unlockall(ufi, amap, *ruobj);
 1566                                 uvm_wait("flt_noram2");
 1567                                 return ERESTART;
 1568                         }
 1569                         /* if we were a loan receiver, uobj is gone */
 1570                         if (*ruobj)
 1571                                 *ruobj = NULL;
 1572                 }
 1573         }
 1574         return error;
 1575 }
 1576 
 1577 /*
 1578  * uvm_fault_upper_promote: promote upper page.
 1579  *
 1580  *      1. call uvmfault_promote.
 1581  *      2. enqueue page.
 1582  *      3. deref.
 1583  *      4. pass page to uvm_fault_upper_enter.
 1584  */
 1585 
 1586 static int
 1587 uvm_fault_upper_promote(
 1588         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1589         struct uvm_object *uobj, struct vm_anon *anon)
 1590 {
 1591         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 1592         struct vm_anon * const oanon = anon;
 1593         struct vm_page *pg;
 1594         int error;
 1595         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1596 
 1597         UVMHIST_LOG(maphist, "  case 1B: COW fault",0,0,0,0);
 1598         cpu_count(CPU_COUNT_FLT_ACOW, 1);
 1599 
 1600         /* promoting requires a write lock. */
 1601         error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
 1602         if (error != 0) {
 1603                 return error;
 1604         }
 1605         KASSERT(rw_write_held(amap->am_lock));
 1606 
 1607         error = uvmfault_promote(ufi, oanon, PGO_DONTCARE, &anon,
 1608             &flt->anon_spare);
 1609         switch (error) {
 1610         case 0:
 1611                 break;
 1612         case ERESTART:
 1613                 return ERESTART;
 1614         default:
 1615                 return error;
 1616         }
 1617         pg = anon->an_page;
 1618 
 1619         KASSERT(anon->an_lock == oanon->an_lock);
 1620         KASSERT((pg->flags & (PG_BUSY | PG_FAKE)) == 0);
 1621 
 1622         /* deref: can not drop to zero here by defn! */
 1623         KASSERT(oanon->an_ref > 1);
 1624         oanon->an_ref--;
 1625 
 1626         /*
 1627          * note: oanon is still locked, as is the new anon.  we
 1628          * need to check for this later when we unlock oanon; if
 1629          * oanon != anon, we'll have to unlock anon, too.
 1630          */
 1631 
 1632         return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
 1633 }
 1634 
 1635 /*
 1636  * uvm_fault_upper_direct: handle direct fault.
 1637  */
 1638 
 1639 static int
 1640 uvm_fault_upper_direct(
 1641         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1642         struct uvm_object *uobj, struct vm_anon *anon)
 1643 {
 1644         struct vm_anon * const oanon = anon;
 1645         struct vm_page *pg;
 1646         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1647 
 1648         cpu_count(CPU_COUNT_FLT_ANON, 1);
 1649         pg = anon->an_page;
 1650         if (anon->an_ref > 1)     /* disallow writes to ref > 1 anons */
 1651                 flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
 1652 
 1653         return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
 1654 }
 1655 
 1656 /*
 1657  * uvm_fault_upper_enter: enter h/w mapping of upper page.
 1658  */
 1659 
 1660 static int
 1661 uvm_fault_upper_enter(
 1662         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 1663         struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg,
 1664         struct vm_anon *oanon)
 1665 {
 1666         struct pmap *pmap = ufi->orig_map->pmap;
 1667         vaddr_t va = ufi->orig_rvaddr;
 1668         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 1669         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1670 
 1671         /* locked: maps(read), amap, oanon, anon(if different from oanon) */
 1672         KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1673         KASSERT(anon->an_lock == amap->am_lock);
 1674         KASSERT(oanon->an_lock == amap->am_lock);
 1675         KASSERT(uobj == NULL ||
 1676             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 1677         KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
 1678 
 1679         /*
 1680          * now map the page in.
 1681          */
 1682 
 1683         UVMHIST_LOG(maphist,
 1684             "  MAPPING: anon: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
 1685             (uintptr_t)pmap, va, (uintptr_t)pg, flt->promote);
 1686         if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
 1687             flt->enter_prot, flt->access_type | PMAP_CANFAIL |
 1688             (flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
 1689 
 1690                 /*
 1691                  * If pmap_enter() fails, it must not leave behind an existing
 1692                  * pmap entry.  In particular, a now-stale entry for a different
 1693                  * page would leave the pmap inconsistent with the vm_map.
 1694                  * This is not to imply that pmap_enter() should remove an
 1695                  * existing mapping in such a situation (since that could create
 1696                  * different problems, eg. if the existing mapping is wired),
 1697                  * but rather that the pmap should be designed such that it
 1698                  * never needs to fail when the new mapping is replacing an
 1699                  * existing mapping and the new page has no existing mappings.
 1700                  *
 1701                  * XXX This can't be asserted safely any more because many
 1702                  * LWPs and/or many processes could simultaneously fault on
 1703                  * the same VA and some might succeed.
 1704                  */
 1705 
 1706                 /* KASSERT(!pmap_extract(pmap, va, NULL)); */
 1707 
 1708                 /*
 1709                  * ensure that the page is queued in the case that
 1710                  * we just promoted.
 1711                  */
 1712 
 1713                 uvm_pagelock(pg);
 1714                 uvm_pageenqueue(pg);
 1715                 uvm_pageunlock(pg);
 1716 
 1717                 /*
 1718                  * No need to undo what we did; we can simply think of
 1719                  * this as the pmap throwing away the mapping information.
 1720                  *
 1721                  * We do, however, have to go through the ReFault path,
 1722                  * as the map may change while we're asleep.
 1723                  */
 1724 
 1725                 uvmfault_unlockall(ufi, amap, uobj);
 1726                 if (!uvm_reclaimable()) {
 1727                         UVMHIST_LOG(maphist,
 1728                             "<- failed.  out of VM",0,0,0,0);
 1729                         /* XXX instrumentation */
 1730                         return ENOMEM;
 1731                 }
 1732                 /* XXX instrumentation */
 1733                 uvm_wait("flt_pmfail1");
 1734                 return ERESTART;
 1735         }
 1736 
 1737         uvm_fault_upper_done(ufi, flt, anon, pg);
 1738 
 1739         /*
 1740          * done case 1!  finish up by unlocking everything and returning success
 1741          */
 1742 
 1743         pmap_update(pmap);
 1744         uvmfault_unlockall(ufi, amap, uobj);
 1745         return 0;
 1746 }
 1747 
 1748 /*
 1749  * uvm_fault_upper_done: queue upper center page.
 1750  */
 1751 
 1752 static void
 1753 uvm_fault_upper_done(
 1754         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 1755         struct vm_anon *anon, struct vm_page *pg)
 1756 {
 1757         const bool wire_paging = flt->wire_paging;
 1758 
 1759         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1760 
 1761         /*
 1762          * ... update the page queues.
 1763          */
 1764 
 1765         if (wire_paging) {
 1766                 uvm_pagelock(pg);
 1767                 uvm_pagewire(pg);
 1768                 uvm_pageunlock(pg);
 1769 
 1770                 /*
 1771                  * since the now-wired page cannot be paged out,
 1772                  * release its swap resources for others to use.
 1773                  * and since an anon with no swap cannot be clean,
 1774                  * mark it dirty now.
 1775                  */
 1776 
 1777                 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
 1778                 uvm_anon_dropswap(anon);
 1779         } else if (uvmpdpol_pageactivate_p(pg)) {
 1780                 /*
 1781                  * avoid re-activating the page unless needed,
 1782                  * to avoid false sharing on multiprocessor.
 1783                  */
 1784 
 1785                 uvm_pagelock(pg);
 1786                 uvm_pageactivate(pg);
 1787                 uvm_pageunlock(pg);
 1788         }
 1789 }
 1790 
 1791 /*
 1792  * uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
 1793  */
 1794 
 1795 static inline int
 1796 uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1797     struct vm_amap *amap, struct uvm_object *uobj, struct vm_page *uobjpage)
 1798 {
 1799 
 1800         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1801 
 1802         KASSERT(uobj != NULL);
 1803         KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
 1804 
 1805         /*
 1806          * fast path.
 1807          */
 1808 
 1809         if (__predict_true(flt->lower_lock_type == RW_WRITER)) {
 1810                 return 0;
 1811         }
 1812 
 1813         /*
 1814          * otherwise try for the upgrade.  if we don't get it, unlock
 1815          * everything, restart the fault and next time around get a writer
 1816          * lock.
 1817          */
 1818 
 1819         flt->lower_lock_type = RW_WRITER;
 1820         if (__predict_false(!rw_tryupgrade(uobj->vmobjlock))) {
 1821                 uvmfault_unlockall(ufi, amap, uobj);
 1822                 cpu_count(CPU_COUNT_FLTNOUP, 1);
 1823                 UVMHIST_LOG(maphist, "  !upgrade lower", 0, 0,0,0);
 1824                 return ERESTART;
 1825         }
 1826         cpu_count(CPU_COUNT_FLTUP, 1);
 1827         KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
 1828         return 0;
 1829 }
 1830 
 1831 /*
 1832  * uvm_fault_lower: handle lower fault.
 1833  *
 1834  *      1. check uobj
 1835  *      1.1. if null, ZFOD.
 1836  *      1.2. if not null, look up unmapped neighbor pages.
 1837  *      2. for center page, check if promote.
 1838  *      2.1. ZFOD always needs promotion.
 1839  *      2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode).
 1840  *      3. if uobj is not ZFOD and page is not found, do i/o.
 1841  *      4. dispatch either direct / promote fault.
 1842  */
 1843 
 1844 static int
 1845 uvm_fault_lower(
 1846         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 1847         struct vm_page **pages)
 1848 {
 1849         struct vm_amap *amap __diagused = ufi->entry->aref.ar_amap;
 1850         struct uvm_object *uobj = ufi->entry->object.uvm_obj;
 1851         struct vm_page *uobjpage;
 1852         int error;
 1853         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1854 
 1855         /*
 1856          * now, if the desired page is not shadowed by the amap and we have
 1857          * a backing object that does not have a special fault routine, then
 1858          * we ask (with pgo_get) the object for resident pages that we care
 1859          * about and attempt to map them in.  we do not let pgo_get block
 1860          * (PGO_LOCKED).
 1861          */
 1862 
 1863         if (uobj == NULL) {
 1864                 /* zero fill; we don't care about neighbor pages */
 1865                 uobjpage = NULL;
 1866         } else {
 1867                 uvm_fault_lower_lookup(ufi, flt, pages);
 1868                 uobjpage = pages[flt->centeridx];
 1869         }
 1870 
 1871         /*
 1872          * note that at this point we are done with any front or back pages.
 1873          * we are now going to focus on the center page (i.e. the one we've
 1874          * faulted on).  if we have faulted on the upper (anon) layer
 1875          * [i.e. case 1], then the anon we want is anons[centeridx] (we have
 1876          * not touched it yet).  if we have faulted on the bottom (uobj)
 1877          * layer [i.e. case 2] and the page was both present and available,
 1878          * then we've got a pointer to it as "uobjpage" and we've already
 1879          * made it BUSY.
 1880          */
 1881 
 1882         /*
 1883          * locked:
 1884          * maps(read), amap(if there), uobj(if !null), uobjpage(if !null)
 1885          */
 1886         KASSERT(amap == NULL ||
 1887             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1888         KASSERT(uobj == NULL ||
 1889             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 1890 
 1891         /*
 1892          * note that uobjpage can not be PGO_DONTCARE at this point.  we now
 1893          * set uobjpage to PGO_DONTCARE if we are doing a zero fill.  if we
 1894          * have a backing object, check and see if we are going to promote
 1895          * the data up to an anon during the fault.
 1896          */
 1897 
 1898         if (uobj == NULL) {
 1899                 uobjpage = PGO_DONTCARE;
 1900                 flt->promote = true;            /* always need anon here */
 1901         } else {
 1902                 KASSERT(uobjpage != PGO_DONTCARE);
 1903                 flt->promote = flt->cow_now && UVM_ET_ISCOPYONWRITE(ufi->entry);
 1904         }
 1905         UVMHIST_LOG(maphist, "  case 2 fault: promote=%jd, zfill=%jd",
 1906             flt->promote, (uobj == NULL), 0,0);
 1907 
 1908         /*
 1909          * if uobjpage is not null then we do not need to do I/O to get the
 1910          * uobjpage.
 1911          *
 1912          * if uobjpage is null, then we need to unlock and ask the pager to
 1913          * get the data for us.   once we have the data, we need to re-verify
 1914          * the state of the world.   we are currently not holding any resources.
 1915          */
 1916 
 1917         if (uobjpage) {
 1918                 /* update rusage counters */
 1919                 curlwp->l_ru.ru_minflt++;
 1920         } else {
 1921                 error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
 1922                 if (error != 0)
 1923                         return error;
 1924         }
 1925 
 1926         /*
 1927          * locked:
 1928          * maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj)
 1929          */
 1930         KASSERT(amap == NULL ||
 1931             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 1932         KASSERT(uobj == NULL ||
 1933             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 1934 
 1935         /*
 1936          * notes:
 1937          *  - at this point uobjpage can not be NULL
 1938          *  - at this point uobjpage can not be PG_RELEASED (since we checked
 1939          *  for it above)
 1940          *  - at this point uobjpage could be waited on (handle later)
 1941          *  - uobjpage can be from a different object if tmpfs (vnode vs UAO)
 1942          */
 1943 
 1944         KASSERT(uobjpage != NULL);
 1945         KASSERT(uobj == NULL ||
 1946             uobjpage->uobject->vmobjlock == uobj->vmobjlock);
 1947         KASSERT(uobj == NULL || !UVM_OBJ_IS_CLEAN(uobjpage->uobject) ||
 1948             uvm_pagegetdirty(uobjpage) == UVM_PAGE_STATUS_CLEAN);
 1949 
 1950         if (!flt->promote) {
 1951                 error = uvm_fault_lower_direct(ufi, flt, uobj, uobjpage);
 1952         } else {
 1953                 error = uvm_fault_lower_promote(ufi, flt, uobj, uobjpage);
 1954         }
 1955         return error;
 1956 }
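
The flt->promote computation above reduces to a small predicate: with no backing object the fault is zero-fill and always needs an anon; with a backing object, promotion happens only for a copy-on-write write (cow_now) into an entry marked copy-on-write. A plain-C restatement, with an invented function name, for reference:

#include <stdbool.h>
#include <stdio.h>

/* restatement of the flt->promote computation in uvm_fault_lower() */
static bool
lower_fault_promotes(bool has_uobj, bool cow_now, bool entry_is_cow)
{
        if (!has_uobj)
                return true;            /* zero-fill: always promote to an anon */
        return cow_now && entry_is_cow; /* COW write into a backed mapping */
}

int
main(void)
{
        printf("ZFOD read           -> %d\n",
            lower_fault_promotes(false, false, false));
        printf("vnode read, COW     -> %d\n",
            lower_fault_promotes(true, false, true));
        printf("vnode write, COW    -> %d\n",
            lower_fault_promotes(true, true, true));
        printf("vnode write, shared -> %d\n",
            lower_fault_promotes(true, true, false));
        return 0;
}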
 1957 
 1958 /*
 1959  * uvm_fault_lower_lookup: look up on-memory uobj pages.
 1960  *
 1961  *      1. get on-memory pages.
 1962  *      2. if failed, give up (get only center page later).
 1963  *      3. if succeeded, enter h/w mapping of neighbor pages.
 1964  */
 1965 
 1966 static void
 1967 uvm_fault_lower_lookup(
 1968         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 1969         struct vm_page **pages)
 1970 {
 1971         struct uvm_object *uobj = ufi->entry->object.uvm_obj;
 1972         int lcv, gotpages;
 1973         vaddr_t currva;
 1974         bool entered;
 1975         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 1976 
 1977         rw_enter(uobj->vmobjlock, flt->lower_lock_type);
 1978 
 1979         /*
 1980          * Locked: maps(read), amap(if there), uobj
 1981          */
 1982 
 1983         cpu_count(CPU_COUNT_FLTLGET, 1);
 1984         gotpages = flt->npages;
 1985         (void) uobj->pgops->pgo_get(uobj,
 1986             ufi->entry->offset + flt->startva - ufi->entry->start,
 1987             pages, &gotpages, flt->centeridx,
 1988             flt->access_type & MASK(ufi->entry), ufi->entry->advice,
 1989             PGO_LOCKED);
 1990 
 1991         KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 1992 
 1993         /*
 1994          * check for pages to map, if we got any
 1995          */
 1996 
 1997         if (gotpages == 0) {
 1998                 pages[flt->centeridx] = NULL;
 1999                 return;
 2000         }
 2001 
 2002         entered = false;
 2003         currva = flt->startva;
 2004         for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
 2005                 struct vm_page *curpg;
 2006 
 2007                 curpg = pages[lcv];
 2008                 if (curpg == NULL || curpg == PGO_DONTCARE) {
 2009                         continue;
 2010                 }
 2011 
 2012                 /*
 2013                  * in the case of tmpfs, the pages might be from a different
 2014                  * uvm_object.  just make sure that they have the same lock.
 2015                  */
 2016 
 2017                 KASSERT(curpg->uobject->vmobjlock == uobj->vmobjlock);
 2018                 KASSERT((curpg->flags & PG_BUSY) == 0);
 2019 
 2020                 /*
 2021                  * leave the center page for later.  don't screw with
 2022                  * existing mappings (needless & expensive).
 2023                  */
 2024 
 2025                 if (lcv == flt->centeridx) {
 2026                         UVMHIST_LOG(maphist, "  got uobjpage (%#jx) "
 2027                             "with locked get", (uintptr_t)curpg, 0, 0, 0);
 2028                 } else if (!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
 2029                         uvm_fault_lower_neighbor(ufi, flt, currva, curpg);
 2030                         entered = true;
 2031                 }
 2032         }
 2033         if (entered) {
 2034                 pmap_update(ufi->orig_map->pmap);
 2035         }
 2036 }
 2037 
 2038 /*
 2039  * uvm_fault_lower_neighbor: enter h/w mapping of lower neighbor page.
 2040  */
 2041 
 2042 static void
 2043 uvm_fault_lower_neighbor(
 2044         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 2045         vaddr_t currva, struct vm_page *pg)
 2046 {
 2047         const bool readonly = uvm_pagereadonly_p(pg) || pg->loan_count > 0;
 2048         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2049 
 2050         /* locked: maps(read), amap(if there), uobj */
 2051 
 2052         /*
 2053          * calling pgo_get with PGO_LOCKED returns us pages which
 2054          * are neither busy nor released, so we don't need to check
 2055          * for this.  we can just directly enter the pages.
 2056          *
 2057          * there wasn't a direct fault on the page, so avoid the cost of
 2058          * activating it.
 2059          */
 2060 
 2061         if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
 2062                 uvm_pagelock(pg);
 2063                 uvm_pageenqueue(pg);
 2064                 uvm_pageunlock(pg);
 2065         }
 2066 
 2067         UVMHIST_LOG(maphist,
 2068             "  MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx",
 2069             (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
 2070         cpu_count(CPU_COUNT_FLTNOMAP, 1);
 2071 
 2072         /*
 2073          * Since this page isn't the page that's actually faulting,
 2074          * ignore pmap_enter() failures; it's not critical that we
 2075          * enter these right now.
 2076          * NOTE: page can't be waited on or PG_RELEASED because we've
 2077          * held the lock the whole time we've had the handle.
 2078          */
 2079         KASSERT((pg->flags & PG_PAGEOUT) == 0);
 2080         KASSERT((pg->flags & PG_RELEASED) == 0);
 2081         KASSERT(!UVM_OBJ_IS_CLEAN(pg->uobject) ||
 2082             uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN);
 2083         KASSERT((pg->flags & PG_BUSY) == 0);
 2084         KASSERT(rw_lock_op(pg->uobject->vmobjlock) == flt->lower_lock_type);
 2085 
 2086         const vm_prot_t mapprot =
 2087             readonly ? (flt->enter_prot & ~VM_PROT_WRITE) :
 2088             flt->enter_prot & MASK(ufi->entry);
 2089         const u_int mapflags =
 2090             PMAP_CANFAIL | (flt->wire_mapping ? (mapprot | PMAP_WIRED) : 0);
 2091         (void) pmap_enter(ufi->orig_map->pmap, currva,
 2092             VM_PAGE_TO_PHYS(pg), mapprot, mapflags);
 2093 }
 2094 
 2095 /*
 2096  * uvm_fault_lower_io: get lower page from backing store.
 2097  *
 2098  *      1. unlock everything, because i/o will block.
 2099  *      2. call pgo_get.
 2100  *      3. if failed, recover.
 2101  *      4. if succeeded, relock everything and verify things.
 2102  */
 2103 
 2104 static int
 2105 uvm_fault_lower_io(
 2106         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 2107         struct uvm_object **ruobj, struct vm_page **ruobjpage)
 2108 {
 2109         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 2110         struct uvm_object *uobj = *ruobj;
 2111         struct vm_page *pg;
 2112         bool locked;
 2113         int gotpages;
 2114         int error;
 2115         voff_t uoff;
 2116         vm_prot_t access_type;
 2117         int advice;
 2118         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2119 
 2120         /* update rusage counters */
 2121         curlwp->l_ru.ru_majflt++;
 2122 
 2123         /* grab everything we need from the entry before we unlock */
 2124         uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
 2125         access_type = flt->access_type & MASK(ufi->entry);
 2126         advice = ufi->entry->advice;
 2127 
 2128         /* Locked: maps(read), amap(if there), uobj */
 2129         KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 2130 
 2131         /* Upgrade to a write lock if needed. */
 2132         error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, NULL);
 2133         if (error != 0) {
 2134                 return error;
 2135         }
 2136         uvmfault_unlockall(ufi, amap, NULL);
 2137 
 2138         /* Locked: uobj(write) */
 2139         KASSERT(rw_write_held(uobj->vmobjlock));
 2140 
 2141         cpu_count(CPU_COUNT_FLTGET, 1);
 2142         gotpages = 1;
 2143         pg = NULL;
 2144         error = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages,
 2145             0, access_type, advice, PGO_SYNCIO);
 2146         /* locked: pg(if no error) */
 2147 
 2148         /*
 2149          * recover from I/O
 2150          */
 2151 
 2152         if (error) {
 2153                 if (error == EAGAIN) {
 2154                         UVMHIST_LOG(maphist,
 2155                             "  pgo_get says TRY AGAIN!",0,0,0,0);
 2156                         kpause("fltagain2", false, hz/2, NULL);
 2157                         return ERESTART;
 2158                 }
 2159 
 2160 #if 0
 2161                 KASSERT(error != ERESTART);
 2162 #else
 2163                 /* XXXUEBS don't re-fault? */
 2164                 if (error == ERESTART)
 2165                         error = EIO;
 2166 #endif
 2167 
 2168                 UVMHIST_LOG(maphist, "<- pgo_get failed (code %jd)",
 2169                     error, 0,0,0);
 2170                 return error;
 2171         }
 2172 
 2173         /*
 2174          * re-verify the state of the world by first trying to relock
 2175          * the maps.  always relock the object.
 2176          */
 2177 
 2178         locked = uvmfault_relock(ufi);
 2179         if (locked && amap)
 2180                 amap_lock(amap, flt->upper_lock_type);
 2181 
 2182         /* might be changed */
 2183         uobj = pg->uobject;
 2184 
 2185         rw_enter(uobj->vmobjlock, flt->lower_lock_type);
 2186         KASSERT((pg->flags & PG_BUSY) != 0);
 2187         KASSERT(flt->lower_lock_type == RW_WRITER);
 2188 
 2189         uvm_pagelock(pg);
 2190         uvm_pageactivate(pg);
 2191         uvm_pageunlock(pg);
 2192 
 2193         /* locked(locked): maps(read), amap(if !null), uobj, pg */
 2194         /* locked(!locked): uobj, pg */
 2195 
 2196         /*
 2197          * verify that the page has not been released and re-verify
 2198          * that the amap slot is still free.   if there is a problem,
 2199          * we unlock and clean up.
 2200          */
 2201 
 2202         if ((pg->flags & PG_RELEASED) != 0 ||
 2203             (locked && amap && amap_lookup(&ufi->entry->aref,
 2204               ufi->orig_rvaddr - ufi->entry->start))) {
 2205                 if (locked)
 2206                         uvmfault_unlockall(ufi, amap, NULL);
 2207                 locked = false;
 2208         }
 2209 
 2210         /*
 2211          * unbusy/release the page.
 2212          */
 2213 
 2214         if ((pg->flags & PG_RELEASED) == 0) {
 2215                 pg->flags &= ~PG_BUSY;
 2216                 uvm_pagelock(pg);
 2217                 uvm_pagewakeup(pg);
 2218                 uvm_pageunlock(pg);
 2219                 UVM_PAGE_OWN(pg, NULL);
 2220         } else {
 2221                 cpu_count(CPU_COUNT_FLTPGRELE, 1);
 2222                 uvm_pagefree(pg);
 2223         }
 2224 
 2225         /*
 2226          * didn't get the lock?   retry.
 2227          */
 2228 
 2229         if (locked == false) {
 2230                 UVMHIST_LOG(maphist,
 2231                     "  wasn't able to relock after fault: retry",
 2232                     0,0,0,0);
 2233                 rw_exit(uobj->vmobjlock);
 2234                 return ERESTART;
 2235         }
 2236 
 2237         /*
 2238          * we have the data in pg.  we are holding the object lock (so the page
 2239          * can't be released on us).
 2240          */
 2241 
 2242         /* locked: maps(read), amap(if !null), uobj */
 2243 
 2244         *ruobj = uobj;
 2245         *ruobjpage = pg;
 2246         return 0;
 2247 }
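
uvm_fault_lower_io follows the usual "unlock, do the blocking work, relock, re-verify the world" discipline: after the synchronous pgo_get the maps may have changed, the amap slot may have been filled, and the page may have been released, so everything is rechecked before the result is used. Below is a minimal user-space sketch of that discipline using a pthread mutex and a generation counter standing in for "has the world changed?"; the names world_lock, world_generation and do_cached_lookup_with_io are invented for the illustration.

#define _DEFAULT_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t world_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long world_generation;  /* bumped whenever the "world" changes */

static int
do_cached_lookup_with_io(void)
{
        for (;;) {
                pthread_mutex_lock(&world_lock);
                unsigned long gen = world_generation;
                pthread_mutex_unlock(&world_lock);

                usleep(1000);           /* blocking "I/O", done with no locks held */
                int result = 42;

                pthread_mutex_lock(&world_lock);
                if (gen != world_generation) {
                        /* the world changed while we slept: retry (ERESTART) */
                        pthread_mutex_unlock(&world_lock);
                        continue;
                }
                /* still consistent with what the I/O was based on: use it */
                pthread_mutex_unlock(&world_lock);
                return result;
        }
}

int
main(void)
{
        printf("lookup -> %d\n", do_cached_lookup_with_io());
        return 0;
}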
 2248 
 2249 /*
 2250  * uvm_fault_lower_direct: fault lower center page
 2251  *
 2252  *      1. adjust flt->enter_prot.
 2253  *      2. if page is loaned, resolve.
 2254  */
 2255 
 2256 int
 2257 uvm_fault_lower_direct(
 2258         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 2259         struct uvm_object *uobj, struct vm_page *uobjpage)
 2260 {
 2261         struct vm_page *pg;
 2262         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2263 
 2264         /*
 2265          * we are not promoting.   if the mapping is COW ensure that we
 2266          * don't give more access than we should (e.g. when doing a read
 2267          * fault on a COPYONWRITE mapping we want to map the COW page in
 2268          * R/O even though the entry protection could be R/W).
 2269          *
 2270          * set "pg" to the page we want to map in (uobjpage, usually)
 2271          */
 2272 
 2273         cpu_count(CPU_COUNT_FLT_OBJ, 1);
 2274         if (UVM_ET_ISCOPYONWRITE(ufi->entry) ||
 2275             UVM_OBJ_NEEDS_WRITEFAULT(uobjpage->uobject))
 2276                 flt->enter_prot &= ~VM_PROT_WRITE;
 2277         pg = uobjpage;          /* map in the actual object */
 2278 
 2279         KASSERT(uobjpage != PGO_DONTCARE);
 2280 
 2281         /*
 2282          * we are faulting directly on the page.   be careful
 2283          * about writing to loaned pages...
 2284          */
 2285 
 2286         if (uobjpage->loan_count) {
 2287                 uvm_fault_lower_direct_loan(ufi, flt, uobj, &pg, &uobjpage);
 2288         }
 2289         KASSERT(pg == uobjpage);
 2290         KASSERT((pg->flags & PG_BUSY) == 0);
 2291         return uvm_fault_lower_enter(ufi, flt, uobj, NULL, pg);
 2292 }
 2293 
 2294 /*
 2295  * uvm_fault_lower_direct_loan: resolve loaned page.
 2296  *
 2297  *      1. if not cow'ing, adjust flt->enter_prot.
 2298  *      2. if cow'ing, break loan.
 2299  */
 2300 
 2301 static int
 2302 uvm_fault_lower_direct_loan(
 2303         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 2304         struct uvm_object *uobj, struct vm_page **rpg,
 2305         struct vm_page **ruobjpage)
 2306 {
 2307         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 2308         struct vm_page *pg;
 2309         struct vm_page *uobjpage = *ruobjpage;
 2310         int error;
 2311         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2312 
 2313         if (!flt->cow_now) {
 2314                 /* read fault: cap the protection at readonly */
 2315                 /* cap! */
 2316                 flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
 2317         } else {
 2318                 /*
 2319                  * write fault: must break the loan here.  to do this
 2320                  * we need a write lock on the object.
 2321                  */
 2322 
 2323                 error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, uobjpage);
 2324                 if (error != 0) {
 2325                         return error;
 2326                 }
 2327                 KASSERT(rw_write_held(uobj->vmobjlock));
 2328 
 2329                 pg = uvm_loanbreak(uobjpage);
 2330                 if (pg == NULL) {
 2331 
 2332                         uvmfault_unlockall(ufi, amap, uobj);
 2333                         UVMHIST_LOG(maphist,
 2334                           "  out of RAM breaking loan, waiting",
 2335                           0,0,0,0);
 2336                         cpu_count(CPU_COUNT_FLTNORAM, 1);
 2337                         uvm_wait("flt_noram4");
 2338                         return ERESTART;
 2339                 }
 2340                 *rpg = pg;
 2341                 *ruobjpage = pg;
 2342 
 2343                 /*
 2344                  * drop ownership of page while still holding object lock,
 2345                  * which won't be dropped until the page is entered.
 2346                  */
 2347 
 2348                 uvm_pagelock(pg);
 2349                 uvm_pagewakeup(pg);
 2350                 uvm_pageunlock(pg);
 2351                 pg->flags &= ~PG_BUSY;
 2352                 UVM_PAGE_OWN(pg, NULL);
 2353         }
 2354         return 0;
 2355 }
 2356 
 2357 /*
 2358  * uvm_fault_lower_promote: promote lower page.
 2359  *
 2360  *      1. call uvmfault_promote.
 2361  *      2. fill in data.
 2362  *      3. if not ZFOD, dispose old page.
 2363  */
 2364 
 2365 int
 2366 uvm_fault_lower_promote(
 2367         struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
 2368         struct uvm_object *uobj, struct vm_page *uobjpage)
 2369 {
 2370         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 2371         struct vm_anon *anon;
 2372         struct vm_page *pg;
 2373         int error;
 2374         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2375 
 2376         KASSERT(amap != NULL);
 2377 
 2378         /* promoting requires a write lock. */
 2379         error = uvm_fault_upper_upgrade(ufi, flt, amap, uobj);
 2380         if (error != 0) {
 2381                 return error;
 2382         }
 2383         KASSERT(rw_write_held(amap->am_lock));
 2384         KASSERT(uobj == NULL ||
 2385             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 2386 
 2387         /*
 2388          * If we are going to promote the data to an anon we
 2389          * allocate a blank anon here and plug it into our amap.
 2390          */
 2391         error = uvmfault_promote(ufi, NULL, uobjpage, &anon, &flt->anon_spare);
 2392         switch (error) {
 2393         case 0:
 2394                 break;
 2395         case ERESTART:
 2396                 return ERESTART;
 2397         default:
 2398                 return error;
 2399         }
 2400 
 2401         pg = anon->an_page;
 2402 
 2403         /*
 2404          * Fill in the data.
 2405          */
 2406 
 2407         if (uobjpage != PGO_DONTCARE) {
 2408                 cpu_count(CPU_COUNT_FLT_PRCOPY, 1);
 2409 
 2410                 /*
 2411                  * promote to shared amap?  make sure all sharing
 2412                  * procs see it
 2413                  */
 2414 
 2415                 if ((amap_flags(amap) & AMAP_SHARED) != 0) {
 2416                         pmap_page_protect(uobjpage, VM_PROT_NONE);
 2417                         /*
 2418                          * XXX: PAGE MIGHT BE WIRED!
 2419                          */
 2420                 }
 2421 
 2422                 UVMHIST_LOG(maphist,
 2423                     "  promote uobjpage %#jx to anon/page %#jx/%#jx",
 2424                     (uintptr_t)uobjpage, (uintptr_t)anon, (uintptr_t)pg, 0);
 2425 
 2426         } else {
 2427                 cpu_count(CPU_COUNT_FLT_PRZERO, 1);
 2428 
 2429                 /*
 2430                  * Page is zero'd and marked dirty by
 2431                  * uvmfault_promote().
 2432                  */
 2433 
 2434                 UVMHIST_LOG(maphist,"  zero fill anon/page %#jx/%#jx",
 2435                     (uintptr_t)anon, (uintptr_t)pg, 0, 0);
 2436         }
 2437 
 2438         return uvm_fault_lower_enter(ufi, flt, uobj, anon, pg);
 2439 }
 2440 
 2441 /*
 2442  * uvm_fault_lower_enter: enter h/w mapping of lower page or anon page promoted
 2443  * from the lower page.
 2444  */
 2445 
 2446 int
 2447 uvm_fault_lower_enter(
 2448         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 2449         struct uvm_object *uobj,
 2450         struct vm_anon *anon, struct vm_page *pg)
 2451 {
 2452         struct vm_amap * const amap = ufi->entry->aref.ar_amap;
 2453         const bool readonly = uvm_pagereadonly_p(pg);
 2454         int error;
 2455         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2456 
 2457         /*
 2458          * Locked:
 2459          *
 2460          *      maps(read), amap(if !null), uobj(if !null),
 2461          *      anon(if !null), pg(if anon), unlock_uobj(if !null)
 2462          *
 2463          * anon must be write locked (promotion).  uobj can be either.
 2464          *
 2465          * Note: pg is either the uobjpage or the new page in the new anon.
 2466          */
 2467 
 2468         KASSERT(amap == NULL ||
 2469             rw_lock_op(amap->am_lock) == flt->upper_lock_type);
 2470         KASSERT(uobj == NULL ||
 2471             rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
 2472         KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
 2473 
 2474         /*
 2475          * note that pg can't be PG_RELEASED or PG_BUSY since we did
 2476          * not drop the object lock since the last time we checked.
 2477          */
 2478 
 2479         KASSERT((pg->flags & PG_RELEASED) == 0);
 2480         KASSERT((pg->flags & PG_BUSY) == 0);
 2481 
 2482         /*
 2483          * all resources are present.   we can now map it in and free our
 2484          * resources.
 2485          */
 2486 
 2487         UVMHIST_LOG(maphist,
 2488             "  MAPPING: case2: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
 2489             (uintptr_t)ufi->orig_map->pmap, ufi->orig_rvaddr,
 2490             (uintptr_t)pg, flt->promote);
 2491         KASSERTMSG((flt->access_type & VM_PROT_WRITE) == 0 || !readonly,
 2492             "promote=%u cow_now=%u access_type=%x enter_prot=%x cow=%u "
 2493             "entry=%p map=%p orig_rvaddr=%p pg=%p",
 2494             flt->promote, flt->cow_now, flt->access_type, flt->enter_prot,
 2495             UVM_ET_ISCOPYONWRITE(ufi->entry), ufi->entry, ufi->orig_map,
 2496             (void *)ufi->orig_rvaddr, pg);
 2497         KASSERT((flt->access_type & VM_PROT_WRITE) == 0 || !readonly);
 2498         if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
 2499             VM_PAGE_TO_PHYS(pg),
 2500             readonly ? flt->enter_prot & ~VM_PROT_WRITE : flt->enter_prot,
 2501             flt->access_type | PMAP_CANFAIL |
 2502             (flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
 2503 
 2504                 /*
 2505                  * No need to undo what we did; we can simply think of
 2506                  * this as the pmap throwing away the mapping information.
 2507                  *
 2508                  * We do, however, have to go through the ReFault path,
 2509                  * as the map may change while we're asleep.
 2510                  */
 2511 
 2512                 /*
 2513                  * ensure that the page is queued in the case that
 2514                  * we just promoted the page.
 2515                  */
 2516 
 2517                 if (anon != NULL) {
 2518                         uvm_pagelock(pg);
 2519                         uvm_pageenqueue(pg);
 2520                         uvm_pagewakeup(pg);
 2521                         uvm_pageunlock(pg);
 2522                 }
 2523 
 2524                 uvmfault_unlockall(ufi, amap, uobj);
 2525                 if (!uvm_reclaimable()) {
 2526                         UVMHIST_LOG(maphist,
 2527                             "<- failed.  out of VM",0,0,0,0);
 2528                         /* XXX instrumentation */
 2529                         error = ENOMEM;
 2530                         return error;
 2531                 }
 2532                 /* XXX instrumentation */
 2533                 uvm_wait("flt_pmfail2");
 2534                 return ERESTART;
 2535         }
 2536 
 2537         uvm_fault_lower_done(ufi, flt, uobj, pg);
 2538         pmap_update(ufi->orig_map->pmap);
 2539         uvmfault_unlockall(ufi, amap, uobj);
 2540 
 2541         UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0);
 2542         return 0;
 2543 }
 2544 
 2545 /*
 2546  * uvm_fault_lower_done: queue lower center page.
 2547  */
 2548 
 2549 void
 2550 uvm_fault_lower_done(
 2551         struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
 2552         struct uvm_object *uobj, struct vm_page *pg)
 2553 {
 2554 
 2555         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2556 
 2557         if (flt->wire_paging) {
 2558                 uvm_pagelock(pg);
 2559                 uvm_pagewire(pg);
 2560                 uvm_pageunlock(pg);
 2561                 if (pg->flags & PG_AOBJ) {
 2562 
 2563                         /*
 2564                          * since the now-wired page cannot be paged out,
 2565                          * release its swap resources for others to use.
 2566                          * since an aobj page with no swap cannot be clean,
 2567                          * mark it dirty now.
 2568                          *
 2569                          * use pg->uobject here.  if the page is from a
 2570                          * tmpfs vnode, the pages are backed by its UAO and
 2571                          * not the vnode.
 2572                          */
 2573 
 2574                         KASSERT(uobj != NULL);
 2575                         KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
 2576                         uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
 2577                         uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
 2578                 }
 2579         } else if (uvmpdpol_pageactivate_p(pg)) {
 2580                 /*
 2581                  * avoid re-activating the page unless needed,
 2582                  * to avoid false sharing on multiprocessor.
 2583                  */
 2584 
 2585                 uvm_pagelock(pg);
 2586                 uvm_pageactivate(pg);
 2587                 uvm_pageunlock(pg);
 2588         }
 2589 }
 2590 
 2591 
 2592 /*
 2593  * uvm_fault_wire: wire down a range of virtual addresses in a map.
 2594  *
 2595  * => map may be read-locked by caller, but MUST NOT be write-locked.
 2596  * => if map is read-locked, any operations which may cause map to
 2597  *      be write-locked in uvm_fault() must be taken care of by
 2598  *      the caller.  See uvm_map_pageable().
 2599  */
 2600 
 2601 int
 2602 uvm_fault_wire(struct vm_map *map, vaddr_t start, vaddr_t end,
 2603     vm_prot_t access_type, int maxprot)
 2604 {
 2605         vaddr_t va;
 2606         int error;
 2607 
 2608         /*
 2609          * now fault it in a page at a time.   if the fault fails then we have
 2610          * to undo what we have done.   note that in uvm_fault VM_PROT_NONE
 2611          * is replaced with the max protection if fault_type is VM_FAULT_WIRE.
 2612          */
 2613 
 2614         /*
 2615          * XXX work around overflowing a vaddr_t.  this prevents us from
 2616          * wiring the last page in the address space, though.
 2617          */
 2618         if (start > end) {
 2619                 return EFAULT;
 2620         }
 2621 
 2622         for (va = start; va < end; va += PAGE_SIZE) {
 2623                 error = uvm_fault_internal(map, va, access_type,
 2624                     (maxprot ? UVM_FAULT_MAXPROT : 0) | UVM_FAULT_WIRE);
 2625                 if (error) {
 2626                         if (va != start) {
 2627                                 uvm_fault_unwire(map, start, va);
 2628                         }
 2629                         return error;
 2630                 }
 2631         }
 2632         return 0;
 2633 }
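
uvm_fault_wire above faults and wires one page at a time and, if a page fails, unwinds the part of the range it already wired. The same shape shows up in user space with mlock(2); the sketch below mirrors the page-at-a-time loop and the partial unwind (in practice a single mlock over the whole range is the normal approach; wire_range is an invented helper that exists only to mirror the kernel routine).

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* lock [start, end) a page at a time; undo on failure, like uvm_fault_wire */
static int
wire_range(char *start, char *end, long pagesz)
{
        for (char *va = start; va < end; va += pagesz) {
                if (mlock(va, pagesz) != 0) {
                        if (va != start)
                                munlock(start, va - start);
                        return -1;
                }
        }
        return 0;
}

int
main(void)
{
        const long pagesz = sysconf(_SC_PAGESIZE);
        const size_t len = 4 * pagesz;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        if (wire_range(p, p + len, pagesz) == 0) {
                printf("wired %zu bytes\n", len);
                munlock(p, len);
        } else {
                perror("mlock");
        }
        munmap(p, len);
        return 0;
}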
 2634 
 2635 /*
 2636  * uvm_fault_unwire(): unwire range of virtual space.
 2637  */
 2638 
 2639 void
 2640 uvm_fault_unwire(struct vm_map *map, vaddr_t start, vaddr_t end)
 2641 {
 2642         vm_map_lock_read(map);
 2643         uvm_fault_unwire_locked(map, start, end);
 2644         vm_map_unlock_read(map);
 2645 }
 2646 
 2647 /*
 2648  * uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
 2649  *
 2650  * => map must be at least read-locked.
 2651  */
 2652 
 2653 void
 2654 uvm_fault_unwire_locked(struct vm_map *map, vaddr_t start, vaddr_t end)
 2655 {
 2656         struct vm_map_entry *entry, *oentry;
 2657         pmap_t pmap = vm_map_pmap(map);
 2658         vaddr_t va;
 2659         paddr_t pa;
 2660         struct vm_page *pg;
 2661 
 2662         /*
 2663          * we assume that the area we are unwiring has actually been wired
 2664          * in the first place.   this means that we should be able to extract
 2665          * the PAs from the pmap.   we also lock out the page daemon so that
 2666          * we can call uvm_pageunwire.
 2667          */
 2668 
 2669         /*
 2670          * find the beginning map entry for the region.
 2671          */
 2672 
 2673         KASSERT(start >= vm_map_min(map) && end <= vm_map_max(map));
 2674         if (uvm_map_lookup_entry(map, start, &entry) == false)
 2675                 panic("uvm_fault_unwire_locked: address not in map");
 2676 
 2677         oentry = NULL;
 2678         for (va = start; va < end; va += PAGE_SIZE) {
 2679 
 2680                 /*
 2681                  * find the map entry for the current address.
 2682                  */
 2683 
 2684                 KASSERT(va >= entry->start);
 2685                 while (va >= entry->end) {
 2686                         KASSERT(entry->next != &map->header &&
 2687                                 entry->next->start <= entry->end);
 2688                         entry = entry->next;
 2689                 }
 2690 
 2691                 /*
 2692                  * lock it.
 2693                  */
 2694 
 2695                 if (entry != oentry) {
 2696                         if (oentry != NULL) {
 2697                                 uvm_map_unlock_entry(oentry);
 2698                         }
 2699                         uvm_map_lock_entry(entry, RW_WRITER);
 2700                         oentry = entry;
 2701                 }
 2702 
 2703                 /*
 2704                  * if the entry is no longer wired, tell the pmap.
 2705                  */
 2706 
 2707                 if (!pmap_extract(pmap, va, &pa))
 2708                         continue;
 2709 
 2710                 if (VM_MAPENT_ISWIRED(entry) == 0)
 2711                         pmap_unwire(pmap, va);
 2712 
 2713                 pg = PHYS_TO_VM_PAGE(pa);
 2714                 if (pg) {
 2715                         uvm_pagelock(pg);
 2716                         uvm_pageunwire(pg);
 2717                         uvm_pageunlock(pg);
 2718                 }
 2719         }
 2720 
 2721         if (oentry != NULL) {
 2722                 uvm_map_unlock_entry(entry);
 2723         }
 2724 }
