FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_fault.c

    1 /*
    2  * (MPSAFE)
    3  *
    4  * Copyright (c) 1991, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * Copyright (c) 1994 John S. Dyson
    7  * All rights reserved.
    8  * Copyright (c) 1994 David Greenman
    9  * All rights reserved.
   10  *
   11  *
   12  * This code is derived from software contributed to Berkeley by
   13  * The Mach Operating System project at Carnegie-Mellon University.
   14  *
   15  * Redistribution and use in source and binary forms, with or without
   16  * modification, are permitted provided that the following conditions
   17  * are met:
   18  * 1. Redistributions of source code must retain the above copyright
   19  *    notice, this list of conditions and the following disclaimer.
   20  * 2. Redistributions in binary form must reproduce the above copyright
   21  *    notice, this list of conditions and the following disclaimer in the
   22  *    documentation and/or other materials provided with the distribution.
   23  * 3. Neither the name of the University nor the names of its contributors
   24  *    may be used to endorse or promote products derived from this software
   25  *    without specific prior written permission.
   26  *
   27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   37  * SUCH DAMAGE.
   38  *
   39  *      from: @(#)vm_fault.c    8.4 (Berkeley) 1/12/94
   40  *
   41  *
   42  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   43  * All rights reserved.
   44  *
   45  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   46  *
   47  * Permission to use, copy, modify and distribute this software and
   48  * its documentation is hereby granted, provided that both the copyright
   49  * notice and this permission notice appear in all copies of the
   50  * software, derivative works or modified versions, and any portions
   51  * thereof, and that both notices appear in supporting documentation.
   52  *
   53  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   54  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   55  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   56  *
   57  * Carnegie Mellon requests users of this software to return to
   58  *
   59  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   60  *  School of Computer Science
   61  *  Carnegie Mellon University
   62  *  Pittsburgh PA 15213-3890
   63  *
   64  * any improvements or extensions that they make and grant Carnegie the
   65  * rights to redistribute these changes.
   66  *
   67  * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $
   68  * $DragonFly: src/sys/vm/vm_fault.c,v 1.47 2008/07/01 02:02:56 dillon Exp $
   69  */
   70 
   71 /*
   72  *      Page fault handling module.
   73  */
   74 
   75 #include <sys/param.h>
   76 #include <sys/systm.h>
   77 #include <sys/kernel.h>
   78 #include <sys/proc.h>
   79 #include <sys/vnode.h>
   80 #include <sys/resourcevar.h>
   81 #include <sys/vmmeter.h>
   82 #include <sys/vkernel.h>
   83 #include <sys/lock.h>
   84 #include <sys/sysctl.h>
   85 
   86 #include <cpu/lwbuf.h>
   87 
   88 #include <vm/vm.h>
   89 #include <vm/vm_param.h>
   90 #include <vm/pmap.h>
   91 #include <vm/vm_map.h>
   92 #include <vm/vm_object.h>
   93 #include <vm/vm_page.h>
   94 #include <vm/vm_pageout.h>
   95 #include <vm/vm_kern.h>
   96 #include <vm/vm_pager.h>
   97 #include <vm/vnode_pager.h>
   98 #include <vm/vm_extern.h>
   99 
  100 #include <sys/thread2.h>
  101 #include <vm/vm_page2.h>
  102 
  103 struct faultstate {
  104         vm_page_t m;
  105         vm_object_t object;
  106         vm_pindex_t pindex;
  107         vm_prot_t prot;
  108         vm_page_t first_m;
  109         vm_object_t first_object;
  110         vm_prot_t first_prot;
  111         vm_map_t map;
  112         vm_map_entry_t entry;
  113         int lookup_still_valid;
  114         int hardfault;
  115         int fault_flags;
  116         int map_generation;
  117         int shared;
  118         int first_shared;
  119         boolean_t wired;
  120         struct vnode *vp;
  121 };
  122 
  123 static int debug_cluster = 0;
  124 SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
  125 int vm_shared_fault = 1;
  126 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
  127 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW, &vm_shared_fault, 0,
  128            "Allow shared token on vm_object");
  129 static long vm_shared_hit = 0;
  130 SYSCTL_LONG(_vm, OID_AUTO, shared_hit, CTLFLAG_RW, &vm_shared_hit, 0,
  131            "Successful shared faults");
  132 static long vm_shared_count = 0;
  133 SYSCTL_LONG(_vm, OID_AUTO, shared_count, CTLFLAG_RW, &vm_shared_count, 0,
  134            "Shared fault attempts");
  135 static long vm_shared_miss = 0;
  136 SYSCTL_LONG(_vm, OID_AUTO, shared_miss, CTLFLAG_RW, &vm_shared_miss, 0,
  137            "Unsuccessful shared faults");
  138 
  139 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t, int);
  140 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *,
  141                         vpte_t, int, int);
  142 #if 0
  143 static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
  144 #endif
  145 static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
  146 static void vm_prefault(pmap_t pmap, vm_offset_t addra,
  147                         vm_map_entry_t entry, int prot, int fault_flags);
  148 static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
  149                         vm_map_entry_t entry, int prot, int fault_flags);
  150 
  151 static __inline void
  152 release_page(struct faultstate *fs)
  153 {
  154         vm_page_deactivate(fs->m);
  155         vm_page_wakeup(fs->m);
  156         fs->m = NULL;
  157 }
  158 
  159 /*
   160  * NOTE: Once unlocked any cached fs->entry becomes invalid; any reuse
  161  *       requires relocking and then checking the timestamp.
  162  *
  163  * NOTE: vm_map_lock_read() does not bump fs->map->timestamp so we do
  164  *       not have to update fs->map_generation here.
  165  *
  166  * NOTE: This function can fail due to a deadlock against the caller's
  167  *       holding of a vm_page BUSY.
  168  */
  169 static __inline int
  170 relock_map(struct faultstate *fs)
  171 {
  172         int error;
  173 
  174         if (fs->lookup_still_valid == FALSE && fs->map) {
  175                 error = vm_map_lock_read_to(fs->map);
  176                 if (error == 0)
  177                         fs->lookup_still_valid = TRUE;
  178         } else {
  179                 error = 0;
  180         }
  181         return error;
  182 }
  183 
  184 static __inline void
  185 unlock_map(struct faultstate *fs)
  186 {
  187         if (fs->lookup_still_valid && fs->map) {
  188                 vm_map_lookup_done(fs->map, fs->entry, 0);
  189                 fs->lookup_still_valid = FALSE;
  190         }
  191 }
  192 
  193 /*
  194  * Clean up after a successful call to vm_fault_object() so another call
  195  * to vm_fault_object() can be made.
  196  */
  197 static void
  198 _cleanup_successful_fault(struct faultstate *fs, int relock)
  199 {
  200         /*
  201          * We allocated a junk page for a COW operation that did
  202          * not occur, the page must be freed.
  203          */
  204         if (fs->object != fs->first_object) {
  205                 KKASSERT(fs->first_shared == 0);
  206                 vm_page_free(fs->first_m);
  207                 vm_object_pip_wakeup(fs->object);
  208                 fs->first_m = NULL;
  209         }
  210 
  211         /*
  212          * Reset fs->object.
  213          */
  214         fs->object = fs->first_object;
  215         if (relock && fs->lookup_still_valid == FALSE) {
  216                 if (fs->map)
  217                         vm_map_lock_read(fs->map);
  218                 fs->lookup_still_valid = TRUE;
  219         }
  220 }
  221 
  222 static void
  223 _unlock_things(struct faultstate *fs, int dealloc)
  224 {
  225         _cleanup_successful_fault(fs, 0);
  226         if (dealloc) {
  227                 /*vm_object_deallocate(fs->first_object);*/
  228                 /*fs->first_object = NULL; drop used later on */
  229         }
  230         unlock_map(fs); 
  231         if (fs->vp != NULL) { 
  232                 vput(fs->vp);
  233                 fs->vp = NULL;
  234         }
  235 }
  236 
  237 #define unlock_things(fs) _unlock_things(fs, 0)
  238 #define unlock_and_deallocate(fs) _unlock_things(fs, 1)
  239 #define cleanup_successful_fault(fs) _cleanup_successful_fault(fs, 1)
  240 
  241 /*
  242  * TRYPAGER 
  243  *
  244  * Determine if the pager for the current object *might* contain the page.
  245  *
  246  * We only need to try the pager if this is not a default object (default
  247  * objects are zero-fill and have no real pager), and if we are not taking
  248  * a wiring fault or if the FS entry is wired.
  249  */
  250 #define TRYPAGER(fs)    \
  251                 (fs->object->type != OBJT_DEFAULT && \
  252                 (((fs->fault_flags & VM_FAULT_WIRE_MASK) == 0) || fs->wired))
  253 
  254 /*
  255  * vm_fault:
  256  *
   257  * Handle a page fault occurring at the given address, requiring the given
  258  * permissions, in the map specified.  If successful, the page is inserted
  259  * into the associated physical map.
  260  *
  261  * NOTE: The given address should be truncated to the proper page address.
  262  *
  263  * KERN_SUCCESS is returned if the page fault is handled; otherwise,
  264  * a standard error specifying why the fault is fatal is returned.
  265  *
  266  * The map in question must be referenced, and remains so.
  267  * The caller may hold no locks.
  268  * No other requirements.
  269  */
  270 int
  271 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
  272 {
  273         int result;
  274         vm_pindex_t first_pindex;
  275         struct faultstate fs;
  276         struct lwp *lp;
  277         int growstack;
  278         int retry = 0;
  279 
  280         vm_page_pcpu_cache();
  281         fs.hardfault = 0;
  282         fs.fault_flags = fault_flags;
  283         fs.vp = NULL;
  284         fs.shared = vm_shared_fault;
  285         fs.first_shared = vm_shared_fault;
  286         growstack = 1;
  287         if (vm_shared_fault)
  288                 ++vm_shared_count;
  289 
  290         /*
  291          * vm_map interactions
  292          */
  293         if ((lp = curthread->td_lwp) != NULL)
  294                 lp->lwp_flags |= LWP_PAGING;
  295         lwkt_gettoken(&map->token);
  296 
  297 RetryFault:
  298         /*
  299          * Find the vm_map_entry representing the backing store and resolve
  300          * the top level object and page index.  This may have the side
  301          * effect of executing a copy-on-write on the map entry and/or
  302          * creating a shadow object, but will not COW any actual VM pages.
  303          *
  304          * On success fs.map is left read-locked and various other fields 
  305          * are initialized but not otherwise referenced or locked.
  306          *
  307          * NOTE!  vm_map_lookup will try to upgrade the fault_type to
  308          * VM_FAULT_WRITE if the map entry is a virtual page table and also
   309  * writable, so we can set the 'A'ccessed bit in the virtual page
  310          * table entry.
  311          */
  312         fs.map = map;
  313         result = vm_map_lookup(&fs.map, vaddr, fault_type,
  314                                &fs.entry, &fs.first_object,
  315                                &first_pindex, &fs.first_prot, &fs.wired);
  316 
  317         /*
  318          * If the lookup failed or the map protections are incompatible,
  319          * the fault generally fails.  However, if the caller is trying
  320          * to do a user wiring we have more work to do.
  321          */
  322         if (result != KERN_SUCCESS) {
  323                 if (result != KERN_PROTECTION_FAILURE ||
  324                     (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
  325                 {
  326                         if (result == KERN_INVALID_ADDRESS && growstack &&
  327                             map != &kernel_map && curproc != NULL) {
  328                                 result = vm_map_growstack(curproc, vaddr);
  329                                 if (result == KERN_SUCCESS) {
  330                                         growstack = 0;
  331                                         ++retry;
  332                                         goto RetryFault;
  333                                 }
  334                                 result = KERN_FAILURE;
  335                         }
  336                         goto done;
  337                 }
  338 
  339                 /*
  340                  * If we are user-wiring a r/w segment, and it is COW, then
  341                  * we need to do the COW operation.  Note that we don't
   342  * currently COW RO sections, because it is NOT desirable
  343                  * to COW .text.  We simply keep .text from ever being COW'ed
  344                  * and take the heat that one cannot debug wired .text sections.
  345                  */
  346                 result = vm_map_lookup(&fs.map, vaddr,
  347                                        VM_PROT_READ|VM_PROT_WRITE|
  348                                         VM_PROT_OVERRIDE_WRITE,
  349                                        &fs.entry, &fs.first_object,
  350                                        &first_pindex, &fs.first_prot,
  351                                        &fs.wired);
  352                 if (result != KERN_SUCCESS) {
  353                         result = KERN_FAILURE;
  354                         goto done;
  355                 }
  356 
  357                 /*
  358                  * If we don't COW now, on a user wire, the user will never
  359                  * be able to write to the mapping.  If we don't make this
  360                  * restriction, the bookkeeping would be nearly impossible.
  361                  *
  362                  * XXX We have a shared lock, this will have a MP race but
  363                  * I don't see how it can hurt anything.
  364                  */
  365                 if ((fs.entry->protection & VM_PROT_WRITE) == 0)
  366                         fs.entry->max_protection &= ~VM_PROT_WRITE;
  367         }
  368 
  369         /*
  370          * fs.map is read-locked
  371          *
  372          * Misc checks.  Save the map generation number to detect races.
  373          */
  374         fs.map_generation = fs.map->timestamp;
  375         fs.lookup_still_valid = TRUE;
  376         fs.first_m = NULL;
  377         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
  378 
  379         if (fs.entry->eflags & (MAP_ENTRY_NOFAULT | MAP_ENTRY_KSTACK)) {
  380                 if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
  381                         panic("vm_fault: fault on nofault entry, addr: %p",
  382                               (void *)vaddr);
  383                 }
  384                 if ((fs.entry->eflags & MAP_ENTRY_KSTACK) &&
  385                     vaddr >= fs.entry->start &&
  386                     vaddr < fs.entry->start + PAGE_SIZE) {
  387                         panic("vm_fault: fault on stack guard, addr: %p",
  388                               (void *)vaddr);
  389                 }
  390         }
  391 
  392         /*
  393          * A system map entry may return a NULL object.  No object means
  394          * no pager means an unrecoverable kernel fault.
  395          */
  396         if (fs.first_object == NULL) {
  397                 panic("vm_fault: unrecoverable fault at %p in entry %p",
  398                         (void *)vaddr, fs.entry);
  399         }
  400 
  401         /*
  402          * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
  403          * is set.
  404          */
  405         if ((curthread->td_flags & TDF_NOFAULT) &&
  406             (retry ||
  407              fs.first_object->type == OBJT_VNODE ||
  408              fs.first_object->backing_object)) {
  409                 result = KERN_FAILURE;
  410                 unlock_things(&fs);
  411                 goto done2;
  412         }
  413 
  414         /*
  415          * If the entry is wired we cannot change the page protection.
  416          */
  417         if (fs.wired)
  418                 fault_type = fs.first_prot;
  419 
  420         /*
  421          * We generally want to avoid unnecessary exclusive modes on backing
  422          * and terminal objects because this can seriously interfere with
  423          * heavily fork()'d processes (particularly /bin/sh scripts).
  424          *
  425          * However, we also want to avoid unnecessary retries due to needed
  426          * shared->exclusive promotion for common faults.  Exclusive mode is
  427          * always needed if any page insertion, rename, or free occurs in an
  428          * object (and also indirectly if any I/O is done).
  429          *
  430          * The main issue here is going to be fs.first_shared.  If the
  431          * first_object has a backing object which isn't shadowed and the
  432          * process is single-threaded we might as well use an exclusive
  433          * lock/chain right off the bat.
  434          */
  435         if (fs.first_shared && fs.first_object->backing_object &&
  436             LIST_EMPTY(&fs.first_object->shadow_head) &&
  437             curthread->td_proc && curthread->td_proc->p_nthreads == 1) {
  438                 fs.first_shared = 0;
  439         }
  440 
  441         /*
  442          * swap_pager_unswapped() needs an exclusive object
  443          */
  444         if (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY)) {
  445                 fs.first_shared = 0;
  446         }
  447 
  448         /*
  449          * Obtain a top-level object lock, shared or exclusive depending
  450          * on fs.first_shared.  If a shared lock winds up being insufficient
  451          * we will retry with an exclusive lock.
  452          *
  453          * The vnode pager lock is always shared.
  454          */
  455         if (fs.first_shared)
  456                 vm_object_hold_shared(fs.first_object);
  457         else
  458                 vm_object_hold(fs.first_object);
  459         if (fs.vp == NULL)
  460                 fs.vp = vnode_pager_lock(fs.first_object);
  461 
  462         /*
  463          * The page we want is at (first_object, first_pindex), but if the
  464          * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
  465          * page table to figure out the actual pindex.
  466          *
  467          * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
  468          * ONLY
  469          */
  470         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
  471                 result = vm_fault_vpagetable(&fs, &first_pindex,
  472                                              fs.entry->aux.master_pde,
  473                                              fault_type, 1);
  474                 if (result == KERN_TRY_AGAIN) {
  475                         vm_object_drop(fs.first_object);
  476                         ++retry;
  477                         goto RetryFault;
  478                 }
  479                 if (result != KERN_SUCCESS)
  480                         goto done;
  481         }
  482 
  483         /*
  484          * Now we have the actual (object, pindex), fault in the page.  If
  485          * vm_fault_object() fails it will unlock and deallocate the FS
  486          * data.   If it succeeds everything remains locked and fs->object
  487          * will have an additional PIP count if it is not equal to
  488          * fs->first_object
  489          *
  490          * vm_fault_object will set fs->prot for the pmap operation.  It is
   491          * allowed to set VM_PROT_WRITE even if fault_type == VM_PROT_READ,
   492          * provided the page can be safely written.  However, it will force
   493          * a read-only mapping for a read fault if the memory is managed
   494          * by a virtual page table.
  495          *
  496          * If the fault code uses the shared object lock shortcut
  497          * we must not try to burst (we can't allocate VM pages).
  498          */
  499         result = vm_fault_object(&fs, first_pindex, fault_type, 1);
  500         if (result == KERN_TRY_AGAIN) {
  501                 vm_object_drop(fs.first_object);
  502                 ++retry;
  503                 goto RetryFault;
  504         }
  505         if (result != KERN_SUCCESS)
  506                 goto done;
  507 
  508         /*
  509          * On success vm_fault_object() does not unlock or deallocate, and fs.m
  510          * will contain a busied page.
  511          *
  512          * Enter the page into the pmap and do pmap-related adjustments.
  513          */
  514         KKASSERT(fs.lookup_still_valid == TRUE);
  515         vm_page_flag_set(fs.m, PG_REFERENCED);
  516         pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, fs.entry);
  517         mycpu->gd_cnt.v_vm_faults++;
  518         if (curthread->td_lwp)
  519                 ++curthread->td_lwp->lwp_ru.ru_minflt;
  520 
  521         /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */
  522         KKASSERT(fs.m->flags & PG_BUSY);
  523 
  524         /*
  525          * If the page is not wired down, then put it where the pageout daemon
  526          * can find it.
  527          */
  528         if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
  529                 if (fs.wired)
  530                         vm_page_wire(fs.m);
  531                 else
  532                         vm_page_unwire(fs.m, 1);
  533         } else {
  534                 vm_page_activate(fs.m);
  535         }
  536         vm_page_wakeup(fs.m);
  537 
  538         /*
  539          * Burst in a few more pages if possible.  The fs.map should still
  540          * be locked.  To avoid interlocking against a vnode->getblk
  541          * operation we had to be sure to unbusy our primary vm_page above
  542          * first.
  543          *
   544          * A normal burst can continue down backing store, but only executes
  545          * if we are holding an exclusive lock, otherwise the exclusive
  546          * locks the burst code gets might cause excessive SMP collisions.
  547          *
  548          * A quick burst can be utilized when there is no backing object
  549          * (i.e. a shared file mmap).
  550          */
  551         if ((fault_flags & VM_FAULT_BURST) &&
  552             (fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
  553             fs.wired == 0) {
  554                 if (fs.first_shared == 0 && fs.shared == 0) {
  555                         vm_prefault(fs.map->pmap, vaddr,
  556                                     fs.entry, fs.prot, fault_flags);
  557                 } else {
  558                         vm_prefault_quick(fs.map->pmap, vaddr,
  559                                           fs.entry, fs.prot, fault_flags);
  560                 }
  561         }
  562 
  563         /*
  564          * Unlock everything, and return
  565          */
  566         unlock_things(&fs);
  567 
  568         if (curthread->td_lwp) {
  569                 if (fs.hardfault) {
  570                         curthread->td_lwp->lwp_ru.ru_majflt++;
  571                 } else {
  572                         curthread->td_lwp->lwp_ru.ru_minflt++;
  573                 }
  574         }
  575 
  576         /*vm_object_deallocate(fs.first_object);*/
  577         /*fs.m = NULL; */
  578         /*fs.first_object = NULL; must still drop later */
  579 
  580         result = KERN_SUCCESS;
  581 done:
  582         if (fs.first_object)
  583                 vm_object_drop(fs.first_object);
  584 done2:
  585         lwkt_reltoken(&map->token);
  586         if (lp)
  587                 lp->lwp_flags &= ~LWP_PAGING;
  588         if (vm_shared_fault && fs.shared == 0)
  589                 ++vm_shared_miss;
  590         return (result);
  591 }
  592 
  593 /*
  594  * Fault in the specified virtual address in the current process map, 
  595  * returning a held VM page or NULL.  See vm_fault_page() for more 
  596  * information.
  597  *
  598  * No requirements.
  599  */
  600 vm_page_t
  601 vm_fault_page_quick(vm_offset_t va, vm_prot_t fault_type, int *errorp)
  602 {
  603         struct lwp *lp = curthread->td_lwp;
  604         vm_page_t m;
  605 
  606         m = vm_fault_page(&lp->lwp_vmspace->vm_map, va, 
  607                           fault_type, VM_FAULT_NORMAL, errorp);
  608         return(m);
  609 }
  610 
  611 /*
  612  * Fault in the specified virtual address in the specified map, doing all
  613  * necessary manipulation of the object store and all necessary I/O.  Return
  614  * a held VM page or NULL, and set *errorp.  The related pmap is not
  615  * updated.
  616  *
  617  * The returned page will be properly dirtied if VM_PROT_WRITE was specified,
  618  * and marked PG_REFERENCED as well.
  619  *
  620  * If the page cannot be faulted writable and VM_PROT_WRITE was specified, an
  621  * error will be returned.
  622  *
  623  * No requirements.
  624  */
  625 vm_page_t
  626 vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
  627               int fault_flags, int *errorp)
  628 {
  629         vm_pindex_t first_pindex;
  630         struct faultstate fs;
  631         int result;
  632         int retry = 0;
  633         vm_prot_t orig_fault_type = fault_type;
  634 
  635         fs.hardfault = 0;
  636         fs.fault_flags = fault_flags;
  637         KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
  638 
  639         /*
  640          * Dive the pmap (concurrency possible).  If we find the
  641          * appropriate page we can terminate early and quickly.
  642          */
  643         fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type);
  644         if (fs.m) {
  645                 *errorp = 0;
  646                 return(fs.m);
  647         }
  648 
  649         /*
  650          * Otherwise take a concurrency hit and do a formal page
  651          * fault.
  652          */
  653         fs.shared = vm_shared_fault;
  654         fs.first_shared = vm_shared_fault;
  655         fs.vp = NULL;
  656         lwkt_gettoken(&map->token);
  657 
  658         /*
  659          * swap_pager_unswapped() needs an exclusive object
  660          */
  661         if (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY)) {
  662                 fs.first_shared = 0;
  663         }
  664 
  665 RetryFault:
  666         /*
  667          * Find the vm_map_entry representing the backing store and resolve
  668          * the top level object and page index.  This may have the side
  669          * effect of executing a copy-on-write on the map entry and/or
  670          * creating a shadow object, but will not COW any actual VM pages.
  671          *
  672          * On success fs.map is left read-locked and various other fields 
  673          * are initialized but not otherwise referenced or locked.
  674          *
  675          * NOTE!  vm_map_lookup will upgrade the fault_type to VM_FAULT_WRITE
  676          * if the map entry is a virtual page table and also writable,
   677  * so we can set the 'A'ccessed bit in the virtual page table entry.
  678          */
  679         fs.map = map;
  680         result = vm_map_lookup(&fs.map, vaddr, fault_type,
  681                                &fs.entry, &fs.first_object,
  682                                &first_pindex, &fs.first_prot, &fs.wired);
  683 
  684         if (result != KERN_SUCCESS) {
  685                 *errorp = result;
  686                 fs.m = NULL;
  687                 goto done;
  688         }
  689 
  690         /*
  691          * fs.map is read-locked
  692          *
  693          * Misc checks.  Save the map generation number to detect races.
  694          */
  695         fs.map_generation = fs.map->timestamp;
  696         fs.lookup_still_valid = TRUE;
  697         fs.first_m = NULL;
  698         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
  699 
  700         if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
  701                 panic("vm_fault: fault on nofault entry, addr: %lx",
  702                     (u_long)vaddr);
  703         }
  704 
  705         /*
  706          * A system map entry may return a NULL object.  No object means
  707          * no pager means an unrecoverable kernel fault.
  708          */
  709         if (fs.first_object == NULL) {
  710                 panic("vm_fault: unrecoverable fault at %p in entry %p",
  711                         (void *)vaddr, fs.entry);
  712         }
  713 
  714         /*
  715          * Fail here if not a trivial anonymous page fault and TDF_NOFAULT
  716          * is set.
  717          */
  718         if ((curthread->td_flags & TDF_NOFAULT) &&
  719             (retry ||
  720              fs.first_object->type == OBJT_VNODE ||
  721              fs.first_object->backing_object)) {
  722                 *errorp = KERN_FAILURE;
  723                 unlock_things(&fs);
  724                 goto done2;
  725         }
  726 
  727         /*
  728          * If the entry is wired we cannot change the page protection.
  729          */
  730         if (fs.wired)
  731                 fault_type = fs.first_prot;
  732 
  733         /*
  734          * Make a reference to this object to prevent its disposal while we
  735          * are messing with it.  Once we have the reference, the map is free
  736          * to be diddled.  Since objects reference their shadows (and copies),
  737          * they will stay around as well.
  738          *
  739          * The reference should also prevent an unexpected collapse of the
  740          * parent that might move pages from the current object into the
  741          * parent unexpectedly, resulting in corruption.
  742          *
  743          * Bump the paging-in-progress count to prevent size changes (e.g.
  744          * truncation operations) during I/O.  This must be done after
  745          * obtaining the vnode lock in order to avoid possible deadlocks.
  746          */
  747         if (fs.first_shared)
  748                 vm_object_hold_shared(fs.first_object);
  749         else
  750                 vm_object_hold(fs.first_object);
  751         if (fs.vp == NULL)
  752                 fs.vp = vnode_pager_lock(fs.first_object);      /* shared */
  753 
  754         /*
  755          * The page we want is at (first_object, first_pindex), but if the
  756          * vm_map_entry is VM_MAPTYPE_VPAGETABLE we have to traverse the
  757          * page table to figure out the actual pindex.
  758          *
  759          * NOTE!  DEVELOPMENT IN PROGRESS, THIS IS AN INITIAL IMPLEMENTATION
  760          * ONLY
  761          */
  762         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
  763                 result = vm_fault_vpagetable(&fs, &first_pindex,
  764                                              fs.entry->aux.master_pde,
  765                                              fault_type, 1);
  766                 if (result == KERN_TRY_AGAIN) {
  767                         vm_object_drop(fs.first_object);
  768                         ++retry;
  769                         goto RetryFault;
  770                 }
  771                 if (result != KERN_SUCCESS) {
  772                         *errorp = result;
  773                         fs.m = NULL;
  774                         goto done;
  775                 }
  776         }
  777 
  778         /*
  779          * Now we have the actual (object, pindex), fault in the page.  If
  780          * vm_fault_object() fails it will unlock and deallocate the FS
  781          * data.   If it succeeds everything remains locked and fs->object
   782          * will have an additional PIP count if it is not equal to
  783          * fs->first_object
  784          */
  785         fs.m = NULL;
  786         result = vm_fault_object(&fs, first_pindex, fault_type, 1);
  787 
  788         if (result == KERN_TRY_AGAIN) {
  789                 vm_object_drop(fs.first_object);
  790                 ++retry;
  791                 goto RetryFault;
  792         }
  793         if (result != KERN_SUCCESS) {
  794                 *errorp = result;
  795                 fs.m = NULL;
  796                 goto done;
  797         }
  798 
  799         if ((orig_fault_type & VM_PROT_WRITE) &&
  800             (fs.prot & VM_PROT_WRITE) == 0) {
  801                 *errorp = KERN_PROTECTION_FAILURE;
  802                 unlock_and_deallocate(&fs);
  803                 fs.m = NULL;
  804                 goto done;
  805         }
  806 
  807         /*
  808          * DO NOT UPDATE THE PMAP!!!  This function may be called for
  809          * a pmap unrelated to the current process pmap, in which case
  810          * the current cpu core will not be listed in the pmap's pm_active
  811          * mask.  Thus invalidation interlocks will fail to work properly.
  812          *
  813          * (for example, 'ps' uses procfs to read program arguments from
  814          * each process's stack).
  815          *
  816          * In addition to the above this function will be called to acquire
  817          * a page that might already be faulted in, re-faulting it
  818          * continuously is a waste of time.
  819          *
  820          * XXX could this have been the cause of our random seg-fault
  821          *     issues?  procfs accesses user stacks.
  822          */
  823         vm_page_flag_set(fs.m, PG_REFERENCED);
  824 #if 0
  825         pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, NULL);
  826         mycpu->gd_cnt.v_vm_faults++;
  827         if (curthread->td_lwp)
  828                 ++curthread->td_lwp->lwp_ru.ru_minflt;
  829 #endif
  830 
  831         /*
  832          * On success vm_fault_object() does not unlock or deallocate, and fs.m
  833          * will contain a busied page.  So we must unlock here after having
  834          * messed with the pmap.
  835          */
  836         unlock_things(&fs);
  837 
  838         /*
  839          * Return a held page.  We are not doing any pmap manipulation so do
  840          * not set PG_MAPPED.  However, adjust the page flags according to
  841          * the fault type because the caller may not use a managed pmapping
  842          * (so we don't want to lose the fact that the page will be dirtied
  843          * if a write fault was specified).
  844          */
  845         vm_page_hold(fs.m);
  846         vm_page_activate(fs.m);
  847         if (fault_type & VM_PROT_WRITE)
  848                 vm_page_dirty(fs.m);
  849 
  850         if (curthread->td_lwp) {
  851                 if (fs.hardfault) {
  852                         curthread->td_lwp->lwp_ru.ru_majflt++;
  853                 } else {
  854                         curthread->td_lwp->lwp_ru.ru_minflt++;
  855                 }
  856         }
  857 
  858         /*
  859          * Unlock everything, and return the held page.
  860          */
  861         vm_page_wakeup(fs.m);
  862         /*vm_object_deallocate(fs.first_object);*/
  863         /*fs.first_object = NULL; */
  864         *errorp = 0;
  865 
  866 done:
  867         if (fs.first_object)
  868                 vm_object_drop(fs.first_object);
  869 done2:
  870         lwkt_reltoken(&map->token);
  871         return(fs.m);
  872 }
  873 
  874 /*
  875  * Fault in the specified (object,offset), dirty the returned page as
  876  * needed.  If the requested fault_type cannot be done NULL and an
  877  * error is returned.
  878  *
  879  * A held (but not busied) page is returned.
  880  *
  881  * The passed in object must be held as specified by the shared
  882  * argument.
  883  */
  884 vm_page_t
  885 vm_fault_object_page(vm_object_t object, vm_ooffset_t offset,
  886                      vm_prot_t fault_type, int fault_flags,
  887                      int *sharedp, int *errorp)
  888 {
  889         int result;
  890         vm_pindex_t first_pindex;
  891         struct faultstate fs;
  892         struct vm_map_entry entry;
  893 
  894         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
  895         bzero(&entry, sizeof(entry));
  896         entry.object.vm_object = object;
  897         entry.maptype = VM_MAPTYPE_NORMAL;
  898         entry.protection = entry.max_protection = fault_type;
  899 
  900         fs.hardfault = 0;
  901         fs.fault_flags = fault_flags;
  902         fs.map = NULL;
  903         fs.shared = vm_shared_fault;
  904         fs.first_shared = *sharedp;
  905         fs.vp = NULL;
  906         KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
  907 
  908         /*
  909          * Might require swap block adjustments
  910          */
  911         if (fs.first_shared && (fault_flags & (VM_FAULT_UNSWAP | VM_FAULT_DIRTY))) {
  912                 fs.first_shared = 0;
  913                 vm_object_upgrade(object);
  914         }
  915 
  916         /*
  917          * Retry loop as needed (typically for shared->exclusive transitions)
  918          */
  919 RetryFault:
  920         *sharedp = fs.first_shared;
  921         first_pindex = OFF_TO_IDX(offset);
  922         fs.first_object = object;
  923         fs.entry = &entry;
  924         fs.first_prot = fault_type;
  925         fs.wired = 0;
  926         /*fs.map_generation = 0; unused */
  927 
  928         /*
  929          * Make a reference to this object to prevent its disposal while we
  930          * are messing with it.  Once we have the reference, the map is free
  931          * to be diddled.  Since objects reference their shadows (and copies),
  932          * they will stay around as well.
  933          *
  934          * The reference should also prevent an unexpected collapse of the
  935          * parent that might move pages from the current object into the
  936          * parent unexpectedly, resulting in corruption.
  937          *
  938          * Bump the paging-in-progress count to prevent size changes (e.g.
  939          * truncation operations) during I/O.  This must be done after
  940          * obtaining the vnode lock in order to avoid possible deadlocks.
  941          */
  942         if (fs.vp == NULL)
  943                 fs.vp = vnode_pager_lock(fs.first_object);
  944 
  945         fs.lookup_still_valid = TRUE;
  946         fs.first_m = NULL;
  947         fs.object = fs.first_object;    /* so unlock_and_deallocate works */
  948 
  949 #if 0
  950         /* XXX future - ability to operate on VM object using vpagetable */
  951         if (fs.entry->maptype == VM_MAPTYPE_VPAGETABLE) {
  952                 result = vm_fault_vpagetable(&fs, &first_pindex,
  953                                              fs.entry->aux.master_pde,
  954                                              fault_type, 0);
  955                 if (result == KERN_TRY_AGAIN) {
  956                         if (fs.first_shared == 0 && *sharedp)
  957                                 vm_object_upgrade(object);
  958                         goto RetryFault;
  959                 }
  960                 if (result != KERN_SUCCESS) {
  961                         *errorp = result;
  962                         return (NULL);
  963                 }
  964         }
  965 #endif
  966 
  967         /*
  968          * Now we have the actual (object, pindex), fault in the page.  If
  969          * vm_fault_object() fails it will unlock and deallocate the FS
  970          * data.   If it succeeds everything remains locked and fs->object
   971          * will have an additional PIP count if it is not equal to
  972          * fs->first_object
  973          *
  974          * On KERN_TRY_AGAIN vm_fault_object() leaves fs.first_object intact.
  975          * We may have to upgrade its lock to handle the requested fault.
  976          */
  977         result = vm_fault_object(&fs, first_pindex, fault_type, 0);
  978 
  979         if (result == KERN_TRY_AGAIN) {
  980                 if (fs.first_shared == 0 && *sharedp)
  981                         vm_object_upgrade(object);
  982                 goto RetryFault;
  983         }
  984         if (result != KERN_SUCCESS) {
  985                 *errorp = result;
  986                 return(NULL);
  987         }
  988 
  989         if ((fault_type & VM_PROT_WRITE) && (fs.prot & VM_PROT_WRITE) == 0) {
  990                 *errorp = KERN_PROTECTION_FAILURE;
  991                 unlock_and_deallocate(&fs);
  992                 return(NULL);
  993         }
  994 
  995         /*
  996          * On success vm_fault_object() does not unlock or deallocate, so we
  997          * do it here.  Note that the returned fs.m will be busied.
  998          */
  999         unlock_things(&fs);
 1000 
 1001         /*
 1002          * Return a held page.  We are not doing any pmap manipulation so do
 1003          * not set PG_MAPPED.  However, adjust the page flags according to
 1004          * the fault type because the caller may not use a managed pmapping
 1005          * (so we don't want to lose the fact that the page will be dirtied
 1006          * if a write fault was specified).
 1007          */
 1008         vm_page_hold(fs.m);
 1009         vm_page_activate(fs.m);
 1010         if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY))
 1011                 vm_page_dirty(fs.m);
 1012         if (fault_flags & VM_FAULT_UNSWAP)
 1013                 swap_pager_unswapped(fs.m);
 1014 
 1015         /*
 1016          * Indicate that the page was accessed.
 1017          */
 1018         vm_page_flag_set(fs.m, PG_REFERENCED);
 1019 
 1020         if (curthread->td_lwp) {
 1021                 if (fs.hardfault) {
 1022                         curthread->td_lwp->lwp_ru.ru_majflt++;
 1023                 } else {
 1024                         curthread->td_lwp->lwp_ru.ru_minflt++;
 1025                 }
 1026         }
 1027 
 1028         /*
 1029          * Unlock everything, and return the held page.
 1030          */
 1031         vm_page_wakeup(fs.m);
 1032         /*vm_object_deallocate(fs.first_object);*/
 1033         /*fs.first_object = NULL; */
 1034 
 1035         *errorp = 0;
 1036         return(fs.m);
 1037 }
 1038 
 1039 /*
 1040  * Translate the virtual page number (first_pindex) that is relative
 1041  * to the address space into a logical page number that is relative to the
 1042  * backing object.  Use the virtual page table pointed to by (vpte).
 1043  *
 1044  * This implements an N-level page table.  Any level can terminate the
 1045  * scan by setting VPTE_PS.   A linear mapping is accomplished by setting
 1046  * VPTE_PS in the master page directory entry set via mcontrol(MADV_SETMAP).
 1047  */
 1048 static
 1049 int
 1050 vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
 1051                     vpte_t vpte, int fault_type, int allow_nofault)
 1052 {
 1053         struct lwbuf *lwb;
 1054         struct lwbuf lwb_cache;
 1055         int vshift = VPTE_FRAME_END - PAGE_SHIFT; /* index bits remaining */
 1056         int result = KERN_SUCCESS;
 1057         vpte_t *ptep;
 1058 
 1059         ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
 1060         for (;;) {
 1061                 /*
 1062                  * We cannot proceed if the vpte is not valid, not readable
 1063                  * for a read fault, or not writable for a write fault.
 1064                  */
 1065                 if ((vpte & VPTE_V) == 0) {
 1066                         unlock_and_deallocate(fs);
 1067                         return (KERN_FAILURE);
 1068                 }
 1069                 if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW) == 0) {
 1070                         unlock_and_deallocate(fs);
 1071                         return (KERN_FAILURE);
 1072                 }
 1073                 if ((vpte & VPTE_PS) || vshift == 0)
 1074                         break;
 1075                 KKASSERT(vshift >= VPTE_PAGE_BITS);
 1076 
 1077                 /*
 1078                  * Get the page table page.  Nominally we only read the page
 1079                  * table, but since we are actively setting VPTE_M and VPTE_A,
 1080                  * tell vm_fault_object() that we are writing it. 
 1081                  *
 1082                  * There is currently no real need to optimize this.
 1083                  */
 1084                 result = vm_fault_object(fs, (vpte & VPTE_FRAME) >> PAGE_SHIFT,
 1085                                          VM_PROT_READ|VM_PROT_WRITE,
 1086                                          allow_nofault);
 1087                 if (result != KERN_SUCCESS)
 1088                         return (result);
 1089 
 1090                 /*
 1091                  * Process the returned fs.m and look up the page table
 1092                  * entry in the page table page.
 1093                  */
 1094                 vshift -= VPTE_PAGE_BITS;
 1095                 lwb = lwbuf_alloc(fs->m, &lwb_cache);
 1096                 ptep = ((vpte_t *)lwbuf_kva(lwb) +
 1097                         ((*pindex >> vshift) & VPTE_PAGE_MASK));
 1098                 vpte = *ptep;
 1099 
 1100                 /*
 1101                  * Page table write-back.  If the vpte is valid for the
 1102                  * requested operation, do a write-back to the page table.
 1103                  *
 1104                  * XXX VPTE_M is not set properly for page directory pages.
 1105                  * It doesn't get set in the page directory if the page table
 1106                  * is modified during a read access.
 1107                  */
 1108                 vm_page_activate(fs->m);
 1109                 if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
 1110                     (vpte & VPTE_RW)) {
 1111                         if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) {
 1112                                 atomic_set_long(ptep, VPTE_M | VPTE_A);
 1113                                 vm_page_dirty(fs->m);
 1114                         }
 1115                 }
 1116                 if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) {
 1117                         if ((vpte & VPTE_A) == 0) {
 1118                                 atomic_set_long(ptep, VPTE_A);
 1119                                 vm_page_dirty(fs->m);
 1120                         }
 1121                 }
 1122                 lwbuf_free(lwb);
 1123                 vm_page_flag_set(fs->m, PG_REFERENCED);
 1124                 vm_page_wakeup(fs->m);
 1125                 fs->m = NULL;
 1126                 cleanup_successful_fault(fs);
 1127         }
 1128         /*
 1129          * Combine remaining address bits with the vpte.
 1130          */
 1131         /* JG how many bits from each? */
 1132         *pindex = ((vpte & VPTE_FRAME) >> PAGE_SHIFT) +
 1133                   (*pindex & ((1L << vshift) - 1));
 1134         return (KERN_SUCCESS);
 1135 }
 1136 
 1137 
 1138 /*
 1139  * This is the core of the vm_fault code.
 1140  *
 1141  * Do all operations required to fault-in (fs.first_object, pindex).  Run
 1142  * through the shadow chain as necessary and do required COW or virtual
 1143  * copy operations.  The caller has already fully resolved the vm_map_entry
 1144  * and, if appropriate, has created a copy-on-write layer.  All we need to
 1145  * do is iterate the object chain.
 1146  *
 1147  * On failure (fs) is unlocked and deallocated and the caller may return or
 1148  * retry depending on the failure code.  On success (fs) is NOT unlocked or
 1149  * deallocated, fs.m will contained a resolved, busied page, and fs.object
 1150  * will have an additional PIP count if it is not equal to fs.first_object.
 1151  *
 1152  * If locks based on fs->first_shared or fs->shared are insufficient,
 1153  * clear the appropriate field(s) and return RETRY.  COWs require that
 1154  * first_shared be 0, while page allocations (or frees) require that
 1155  * shared be 0.  Renames require that both be 0.
 1156  *
 1157  * fs->first_object must be held on call.
 1158  */
 1159 static
 1160 int
 1161 vm_fault_object(struct faultstate *fs, vm_pindex_t first_pindex,
 1162                 vm_prot_t fault_type, int allow_nofault)
 1163 {
 1164         vm_object_t next_object;
 1165         vm_pindex_t pindex;
 1166         int error;
 1167 
 1168         ASSERT_LWKT_TOKEN_HELD(vm_object_token(fs->first_object));
 1169         fs->prot = fs->first_prot;
 1170         fs->object = fs->first_object;
 1171         pindex = first_pindex;
 1172 
 1173         vm_object_chain_acquire(fs->first_object, fs->shared);
 1174         vm_object_pip_add(fs->first_object, 1);
 1175 
 1176         /* 
 1177          * If a read fault occurs we try to make the page writable if
 1178          * possible.  There are three cases where we cannot make the
 1179          * page mapping writable:
 1180          *
 1181          * (1) The mapping is read-only or the VM object is read-only,
 1182          *     fs->prot above will simply not have VM_PROT_WRITE set.
 1183          *
 1184          * (2) If the mapping is a virtual page table we need to be able
 1185          *     to detect writes so we can set VPTE_M in the virtual page
 1186          *     table.
 1187          *
 1188          * (3) If the VM page is read-only or copy-on-write, upgrading would
 1189          *     just result in an unnecessary COW fault.
 1190          *
 1191          * VM_PROT_VPAGED is set if faulting via a virtual page table and
 1192          * causes adjustments to the 'M'odify bit to also turn off write
 1193          * access to force a re-fault.
 1194          */
 1195         if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
 1196                 if ((fault_type & VM_PROT_WRITE) == 0)
 1197                         fs->prot &= ~VM_PROT_WRITE;
 1198         }
 1199 
 1200         if (curthread->td_lwp && curthread->td_lwp->lwp_vmspace &&
 1201             pmap_emulate_ad_bits(&curthread->td_lwp->lwp_vmspace->vm_pmap)) {
 1202                 if ((fault_type & VM_PROT_WRITE) == 0)
 1203                         fs->prot &= ~VM_PROT_WRITE;
 1204         }
 1205 
 1206         /* vm_object_hold(fs->object); implied b/c object == first_object */
 1207 
 1208         for (;;) {
 1209                 /*
 1210                  * The entire backing chain from first_object to object
 1211                  * inclusive is chainlocked.
 1212                  *
 1213                  * If the object is dead, we stop here
 1214                  */
 1215                 if (fs->object->flags & OBJ_DEAD) {
 1216                         vm_object_pip_wakeup(fs->first_object);
 1217                         vm_object_chain_release_all(fs->first_object,
 1218                                                     fs->object);
 1219                         if (fs->object != fs->first_object)
 1220                                 vm_object_drop(fs->object);
 1221                         unlock_and_deallocate(fs);
 1222                         return (KERN_PROTECTION_FAILURE);
 1223                 }
 1224 
 1225                 /*
 1226                  * See if the page is resident.  Wait/Retry if the page is
 1227                  * busy (lots of stuff may have changed so we can't continue
 1228                  * in that case).
 1229                  *
 1230                  * We can theoretically allow the soft-busy case on a read
 1231                  * fault if the page is marked valid, but since such
 1232                  * pages are typically already pmap'd, putting that
 1233                  * special case in might be more effort than it is
 1234                  * worth.  We cannot under any circumstances mess
 1235                  * around with a vm_page_t->busy page except, perhaps,
 1236                  * to pmap it.
 1237                  */
 1238                 fs->m = vm_page_lookup_busy_try(fs->object, pindex,
 1239                                                 TRUE, &error);
 1240                 if (error) {
 1241                         vm_object_pip_wakeup(fs->first_object);
 1242                         vm_object_chain_release_all(fs->first_object,
 1243                                                     fs->object);
 1244                         if (fs->object != fs->first_object)
 1245                                 vm_object_drop(fs->object);
 1246                         unlock_things(fs);
 1247                         vm_page_sleep_busy(fs->m, TRUE, "vmpfw");
 1248                         mycpu->gd_cnt.v_intrans++;
 1249                         /*vm_object_deallocate(fs->first_object);*/
 1250                         /*fs->first_object = NULL;*/
 1251                         fs->m = NULL;
 1252                         return (KERN_TRY_AGAIN);
 1253                 }
 1254                 if (fs->m) {
 1255                         /*
 1256                          * The page is busied for us.
 1257                          *
 1258                          * If reactivating a page from PQ_CACHE we may have
 1259                          * to rate-limit.
 1260                          */
 1261                         int queue = fs->m->queue;
 1262                         vm_page_unqueue_nowakeup(fs->m);
 1263 
 1264                         if ((queue - fs->m->pc) == PQ_CACHE && 
 1265                             vm_page_count_severe()) {
 1266                                 vm_page_activate(fs->m);
 1267                                 vm_page_wakeup(fs->m);
 1268                                 fs->m = NULL;
 1269                                 vm_object_pip_wakeup(fs->first_object);
 1270                                 vm_object_chain_release_all(fs->first_object,
 1271                                                             fs->object);
 1272                                 if (fs->object != fs->first_object)
 1273                                         vm_object_drop(fs->object);
 1274                                 unlock_and_deallocate(fs);
 1275                                 if (allow_nofault == 0 ||
 1276                                     (curthread->td_flags & TDF_NOFAULT) == 0) {
 1277                                         vm_wait_pfault();
 1278                                 }
 1279                                 return (KERN_TRY_AGAIN);
 1280                         }
 1281 
 1282                         /*
 1283                          * If it still isn't completely valid (readable),
 1284                          * or if a read-ahead-mark is set on the VM page,
 1285                          * jump to readrest, else we found the page and
 1286                          * can return.
 1287                          *
 1288                          * We can release the spl once we have marked the
 1289                          * page busy.
 1290                          */
 1291                         if (fs->m->object != &kernel_object) {
 1292                                 if ((fs->m->valid & VM_PAGE_BITS_ALL) !=
 1293                                     VM_PAGE_BITS_ALL) {
 1294                                         goto readrest;
 1295                                 }
 1296                                 if (fs->m->flags & PG_RAM) {
 1297                                         if (debug_cluster)
 1298                                                 kprintf("R");
 1299                                         vm_page_flag_clear(fs->m, PG_RAM);
 1300                                         goto readrest;
 1301                                 }
 1302                         }
 1303                         break; /* break to PAGE HAS BEEN FOUND */
 1304                 }
 1305 
 1306                 /*
 1307                  * Page is not resident.  If this is the search termination
 1308                  * or the pager might contain the page, allocate a new page.
 1309                  */
 1310                 if (TRYPAGER(fs) || fs->object == fs->first_object) {
 1311                         /*
 1312                          * Allocating, must be exclusive.
 1313                          */
 1314                         if (fs->object == fs->first_object &&
 1315                             fs->first_shared) {
 1316                                 fs->first_shared = 0;
 1317                                 vm_object_pip_wakeup(fs->first_object);
 1318                                 vm_object_chain_release_all(fs->first_object,
 1319                                                             fs->object);
 1320                                 if (fs->object != fs->first_object)
 1321                                         vm_object_drop(fs->object);
 1322                                 unlock_and_deallocate(fs);
 1323                                 return (KERN_TRY_AGAIN);
 1324                         }
 1325                         if (fs->object != fs->first_object &&
 1326                             fs->shared) {
 1327                                 fs->first_shared = 0;
 1328                                 fs->shared = 0;
 1329                                 vm_object_pip_wakeup(fs->first_object);
 1330                                 vm_object_chain_release_all(fs->first_object,
 1331                                                             fs->object);
 1332                                 if (fs->object != fs->first_object)
 1333                                         vm_object_drop(fs->object);
 1334                                 unlock_and_deallocate(fs);
 1335                                 return (KERN_TRY_AGAIN);
 1336                         }
 1337 
 1338                         /*
 1339                          * If the page is beyond the object size we fail
 1340                          */
 1341                         if (pindex >= fs->object->size) {
 1342                                 vm_object_pip_wakeup(fs->first_object);
 1343                                 vm_object_chain_release_all(fs->first_object,
 1344                                                             fs->object);
 1345                                 if (fs->object != fs->first_object)
 1346                                         vm_object_drop(fs->object);
 1347                                 unlock_and_deallocate(fs);
 1348                                 return (KERN_PROTECTION_FAILURE);
 1349                         }
 1350 
 1351                         /*
 1352                          * Allocate a new page for this object/offset pair.
 1353                          *
 1354                          * It is possible for the allocation to race, so
 1355                          * handle the case.
 1356                          */
 1357                         fs->m = NULL;
 1358                         if (!vm_page_count_severe()) {
 1359                                 fs->m = vm_page_alloc(fs->object, pindex,
 1360                                     ((fs->vp || fs->object->backing_object) ?
 1361                                         VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL :
 1362                                         VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
 1363                                         VM_ALLOC_USE_GD | VM_ALLOC_ZERO));
 1364                         }
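                              /*
                               * Note on the allocation flags above:
                               * VM_ALLOC_ZERO is only requested when there
                               * is no vnode and no backing object, i.e. when
                               * the page will be zero-filled rather than
                               * filled by a pager or a deeper object, so a
                               * pre-zeroed page is not wasted.
                               * VM_ALLOC_NULL_OK lets the allocator return
                               * NULL instead of blocking; that case is
                               * handled just below with vm_wait_pfault()
                               * (when allowed) and KERN_TRY_AGAIN.
                               */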
 1365                         if (fs->m == NULL) {
 1366                                 vm_object_pip_wakeup(fs->first_object);
 1367                                 vm_object_chain_release_all(fs->first_object,
 1368                                                             fs->object);
 1369                                 if (fs->object != fs->first_object)
 1370                                         vm_object_drop(fs->object);
 1371                                 unlock_and_deallocate(fs);
 1372                                 if (allow_nofault == 0 ||
 1373                                     (curthread->td_flags & TDF_NOFAULT) == 0) {
 1374                                         vm_wait_pfault();
 1375                                 }
 1376                                 return (KERN_TRY_AGAIN);
 1377                         }
 1378 
 1379                         /*
 1380                          * Fall through to readrest.  We have a new page which
 1381                          * will have to be paged (since m->valid will be 0).
 1382                          */
 1383                 }
 1384 
 1385 readrest:
 1386                 /*
 1387                  * We have found an invalid or partially valid page, a
 1388                  * page with a read-ahead mark which might be partially or
 1389                  * fully valid (and maybe dirty too), or we have allocated
 1390                  * a new page.
 1391                  *
 1392                  * Attempt to fault-in the page if there is a chance that the
 1393                  * pager has it, and potentially fault in additional pages
 1394                  * at the same time.
 1395                  *
 1396                  * If TRYPAGER is true then fs.m will be non-NULL and busied
 1397                  * for us.
 1398                  */
 1399                 if (TRYPAGER(fs)) {
 1400                         int rv;
 1401                         int seqaccess;
 1402                         u_char behavior = vm_map_entry_behavior(fs->entry);
 1403 
 1404                         if (behavior == MAP_ENTRY_BEHAV_RANDOM)
 1405                                 seqaccess = 0;
 1406                         else
 1407                                 seqaccess = -1;
 1408 
 1409                         /*
 1410                          * Doing I/O may synchronously insert additional
 1411                          * pages so we can't be shared at this point either.
 1412                          *
 1413                          * NOTE: We can't free fs->m here in the allocated
 1414                          *       case (fs->object != fs->first_object) as
 1415                          *       this would require an exclusively locked
 1416                          *       VM object.
 1417                          */
 1418                         if (fs->object == fs->first_object &&
 1419                             fs->first_shared) {
 1420                                 vm_page_deactivate(fs->m);
 1421                                 vm_page_wakeup(fs->m);
 1422                                 fs->m = NULL;
 1423                                 fs->first_shared = 0;
 1424                                 vm_object_pip_wakeup(fs->first_object);
 1425                                 vm_object_chain_release_all(fs->first_object,
 1426                                                             fs->object);
 1427                                 if (fs->object != fs->first_object)
 1428                                         vm_object_drop(fs->object);
 1429                                 unlock_and_deallocate(fs);
 1430                                 return (KERN_TRY_AGAIN);
 1431                         }
 1432                         if (fs->object != fs->first_object &&
 1433                             fs->shared) {
 1434                                 vm_page_deactivate(fs->m);
 1435                                 vm_page_wakeup(fs->m);
 1436                                 fs->m = NULL;
 1437                                 fs->first_shared = 0;
 1438                                 fs->shared = 0;
 1439                                 vm_object_pip_wakeup(fs->first_object);
 1440                                 vm_object_chain_release_all(fs->first_object,
 1441                                                             fs->object);
 1442                                 if (fs->object != fs->first_object)
 1443                                         vm_object_drop(fs->object);
 1444                                 unlock_and_deallocate(fs);
 1445                                 return (KERN_TRY_AGAIN);
 1446                         }
 1447 
 1448                         /*
 1449                          * Avoid deadlocking against the map when doing I/O.
 1450                          * We still hold a ref on fs.object and the page is PG_BUSY'd.
 1451                          *
 1452                          * NOTE: Once unlocked, fs->entry can become stale
 1453                          *       so this will NULL it out.
 1454                          *
 1455                          * NOTE: fs->entry is invalid until we relock the
 1456                          *       map and verify that the timestamp has not
 1457                          *       changed.
 1458                          */
 1459                         unlock_map(fs);
 1460 
 1461                         /*
 1462                          * Acquire the page data.  We still hold a ref on
 1463                          * fs.object and the page has been PG_BUSY'd.
 1464                          *
 1465                          * The pager may replace the page (for example, in
 1466                          * order to enter a fictitious page into the
 1467                          * object).  If it does so it is responsible for
 1468                          * cleaning up the passed page and properly setting
 1469                          * the new page PG_BUSY.
 1470                          *
 1471                          * If we got here through a PG_RAM read-ahead
 1472                          * mark, the page may be partially dirty and thus
 1473                          * not freeable.  Don't bother checking to see
 1474                          * if the pager has the page because we can't free
 1475                          * it anyway.  We have to depend on the get_page
 1476                          * operation filling in any gaps whether there is
 1477                          * backing store or not.
 1478                          */
 1479                         rv = vm_pager_get_page(fs->object, &fs->m, seqaccess);
 1480 
 1481                         if (rv == VM_PAGER_OK) {
 1482                                 /*
 1483                                  * Relookup in case pager changed page. Pager
 1484                                  * is responsible for disposition of old page
 1485                                  * if moved.
 1486                                  *
 1487                                  * XXX other code segments do relookups too.
 1488                                  * It's a bad abstraction that needs to be
 1489                                  * fixed/removed.
 1490                                  */
 1491                                 fs->m = vm_page_lookup(fs->object, pindex);
 1492                                 if (fs->m == NULL) {
 1493                                         vm_object_pip_wakeup(fs->first_object);
 1494                                         vm_object_chain_release_all(
 1495                                                 fs->first_object, fs->object);
 1496                                         if (fs->object != fs->first_object)
 1497                                                 vm_object_drop(fs->object);
 1498                                         unlock_and_deallocate(fs);
 1499                                         return (KERN_TRY_AGAIN);
 1500                                 }
 1501                                 ++fs->hardfault;
 1502                                 break; /* break to PAGE HAS BEEN FOUND */
 1503                         }
 1504 
 1505                         /*
 1506                          * Remove the bogus page (which does not exist at this
 1507                          * object/offset); before doing so, we must get back
 1508                          * our object lock to preserve our invariant.
 1509                          *
 1510                          * Also wake up any other process that may want to bring
 1511                          * in this page.
 1512                          *
 1513                          * If this is the top-level object, we must leave the
 1514                          * busy page to prevent another process from rushing
 1515                          * past us, and inserting the page in that object at
 1516                          * the same time that we are.
 1517                          */
 1518                         if (rv == VM_PAGER_ERROR) {
 1519                                 if (curproc) {
 1520                                         kprintf("vm_fault: pager read error, "
 1521                                                 "pid %d (%s)\n",
 1522                                                 curproc->p_pid,
 1523                                                 curproc->p_comm);
 1524                                 } else {
 1525                                         kprintf("vm_fault: pager read error, "
 1526                                                 "thread %p (%s)\n",
 1527                                                 curthread,
 1528                                                 curthread->td_comm);
 1529                                 }
 1530                         }
 1531 
 1532                         /*
 1533                          * Data outside the range of the pager or an I/O error
 1534                          *
 1535                          * The page may have been wired during the pagein,
 1536                          * e.g. by the buffer cache, and cannot simply be
 1537                          * freed.  Call vnode_pager_freepage() to deal with it.
 1538                          *
 1539                          * Also note that we cannot free the page if we are
 1540                          * holding the related object shared. XXX not sure
 1541                          * what to do in that case.
 1542                          */
 1543                         if (fs->object != fs->first_object) {
 1544                                 vnode_pager_freepage(fs->m);
 1545                                 fs->m = NULL;
 1546                                 /*
 1547                                  * XXX - we cannot just fall out at this
 1548                                  * point, m has been freed and is invalid!
 1549                                  */
 1550                         }
 1551                         /*
 1552                          * XXX - the check for kernel_map is a kludge to work
 1553                          * around having the machine panic on a kernel space
 1554                          * fault w/ I/O error.
 1555                          */
 1556                         if (((fs->map != &kernel_map) &&
 1557                             (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) {
 1558                                 if (fs->m) {
 1559                                         if (fs->first_shared) {
 1560                                                 vm_page_deactivate(fs->m);
 1561                                                 vm_page_wakeup(fs->m);
 1562                                         } else {
 1563                                                 vnode_pager_freepage(fs->m);
 1564                                         }
 1565                                         fs->m = NULL;
 1566                                 }
 1567                                 vm_object_pip_wakeup(fs->first_object);
 1568                                 vm_object_chain_release_all(fs->first_object,
 1569                                                             fs->object);
 1570                                 if (fs->object != fs->first_object)
 1571                                         vm_object_drop(fs->object);
 1572                                 unlock_and_deallocate(fs);
 1573                                 if (rv == VM_PAGER_ERROR)
 1574                                         return (KERN_FAILURE);
 1575                                 else
 1576                                         return (KERN_PROTECTION_FAILURE);
 1577                                 /* NOT REACHED */
 1578                         }
 1579                 }
 1580 
 1581                 /*
 1582                  * We get here if the object has a default pager (or unwiring) 
 1583                  * or the pager doesn't have the page.
 1584                  *
 1585                  * fs->first_m will be used for the COW unless we find a
 1586                  * deeper page to be mapped read-only, in which case the
 1587                  * unlock*(fs) will free first_m.
 1588                  */
 1589                 if (fs->object == fs->first_object)
 1590                         fs->first_m = fs->m;
 1591 
 1592                 /*
 1593                  * Move on to the next object.  The chain lock should prevent
 1594                  * the backing_object from getting ripped out from under us.
 1595                  *
 1596                  * The object lock for the next object is governed by
 1597                  * fs->shared.
 1598                  */
 1599                 if ((next_object = fs->object->backing_object) != NULL) {
 1600                         if (fs->shared)
 1601                                 vm_object_hold_shared(next_object);
 1602                         else
 1603                                 vm_object_hold(next_object);
 1604                         vm_object_chain_acquire(next_object, fs->shared);
 1605                         KKASSERT(next_object == fs->object->backing_object);
 1606                         pindex += OFF_TO_IDX(fs->object->backing_object_offset);
 1607                 }
 1608 
 1609                 if (next_object == NULL) {
 1610                         /*
 1611                          * If there's no object left, fill the page in the top
 1612                          * object with zeros.
 1613                          */
 1614                         if (fs->object != fs->first_object) {
 1615 #if 0
 1616                                 if (fs->first_object->backing_object !=
 1617                                     fs->object) {
 1618                                         vm_object_hold(fs->first_object->backing_object);
 1619                                 }
 1620 #endif
 1621                                 vm_object_chain_release_all(
 1622                                         fs->first_object->backing_object,
 1623                                         fs->object);
 1624 #if 0
 1625                                 if (fs->first_object->backing_object !=
 1626                                     fs->object) {
 1627                                         vm_object_drop(fs->first_object->backing_object);
 1628                                 }
 1629 #endif
 1630                                 vm_object_pip_wakeup(fs->object);
 1631                                 vm_object_drop(fs->object);
 1632                                 fs->object = fs->first_object;
 1633                                 pindex = first_pindex;
 1634                                 fs->m = fs->first_m;
 1635                         }
 1636                         fs->first_m = NULL;
 1637 
 1638                         /*
 1639                          * Zero the page if necessary and mark it valid.
 1640                          */
 1641                         if ((fs->m->flags & PG_ZERO) == 0) {
 1642                                 vm_page_zero_fill(fs->m);
 1643                         } else {
 1644 #ifdef PMAP_DEBUG
 1645                                 pmap_page_assertzero(VM_PAGE_TO_PHYS(fs->m));
 1646 #endif
 1647                                 vm_page_flag_clear(fs->m, PG_ZERO);
 1648                                 mycpu->gd_cnt.v_ozfod++;
 1649                         }
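                              /*
                               * Accounting note: v_zfod counts every
                               * zero-fill-on-demand fault, while v_ozfod
                               * (bumped above) counts the subset satisfied
                               * by an already pre-zeroed PG_ZERO page that
                               * skipped the vm_page_zero_fill() pass.
                               */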
 1650                         mycpu->gd_cnt.v_zfod++;
 1651                         fs->m->valid = VM_PAGE_BITS_ALL;
 1652                         break;  /* break to PAGE HAS BEEN FOUND */
 1653                 }
 1654                 if (fs->object != fs->first_object) {
 1655                         vm_object_pip_wakeup(fs->object);
 1656                         vm_object_lock_swap();
 1657                         vm_object_drop(fs->object);
 1658                 }
 1659                 KASSERT(fs->object != next_object,
 1660                         ("object loop %p", next_object));
 1661                 fs->object = next_object;
 1662                 vm_object_pip_add(fs->object, 1);
 1663         }
 1664 
 1665         /*
 1666          * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
 1667          * is held.]
 1668          *
 1669          * object still held.
 1670          *
 1671          * local shared variable may be different from fs->shared.
 1672          *
 1673          * If the page is being written, but isn't already owned by the
 1674          * top-level object, we have to copy it into a new page owned by the
 1675          * top-level object.
 1676          */
 1677         KASSERT((fs->m->flags & PG_BUSY) != 0,
 1678                 ("vm_fault: not busy after main loop"));
 1679 
 1680         if (fs->object != fs->first_object) {
 1681                 /*
 1682                  * We only really need to copy if we want to write it.
 1683                  */
 1684                 if (fault_type & VM_PROT_WRITE) {
 1685                         /*
 1686                          * This allows pages to be virtually copied from a 
 1687                          * backing_object into the first_object, where the 
 1688                          * backing object has no other refs to it, and cannot
 1689                          * gain any more refs.  Instead of a bcopy, we just 
 1690                          * move the page from the backing object to the 
 1691                          * first object.  Note that we must mark the page 
 1692                          * dirty in the first object so that it will go out 
 1693                          * to swap when needed.
 1694                          */
 1695                         if (
 1696                                 /*
 1697                                  * Must be holding exclusive locks
 1698                                  */
 1699                                 fs->first_shared == 0 &&
 1700                                 fs->shared == 0 &&
 1701                                 /*
 1702                                  * Map, if present, has not changed
 1703                                  */
 1704                                 (fs->map == NULL ||
 1705                                 fs->map_generation == fs->map->timestamp) &&
 1706                                 /*
 1707                                  * Only one shadow object
 1708                                  */
 1709                                 (fs->object->shadow_count == 1) &&
 1710                                 /*
 1711                                  * No COW refs, except us
 1712                                  */
 1713                                 (fs->object->ref_count == 1) &&
 1714                                 /*
 1715                                  * No one else can look this object up
 1716                                  */
 1717                                 (fs->object->handle == NULL) &&
 1718                                 /*
 1719                                  * No other ways to look the object up
 1720                                  */
 1721                                 ((fs->object->type == OBJT_DEFAULT) ||
 1722                                  (fs->object->type == OBJT_SWAP)) &&
 1723                                 /*
 1724                                  * We don't chase down the shadow chain
 1725                                  */
 1726                                 (fs->object == fs->first_object->backing_object) &&
 1727 
 1728                                 /*
 1729                                  * grab the lock if we need to
 1730                                  */
 1731                                 (fs->lookup_still_valid ||
 1732                                  fs->map == NULL ||
 1733                                  lockmgr(&fs->map->lock, LK_EXCLUSIVE|LK_NOWAIT) == 0)
 1734                             ) {
 1735                                 /*
 1736                                  * (first_m) and (m) are both busied.  We have to
 1737                                  * move (m) into (first_m)'s object/pindex
 1738                                  * in an atomic fashion, then free (first_m).
 1739                                  *
 1740                                  * first_object is held so second remove
 1741                                  * followed by the rename should wind
 1742                                  * up being atomic.  vm_page_free() might
 1743                                  * block so we don't do it until after the
 1744                                  * rename.
 1745                                  */
 1746                                 fs->lookup_still_valid = 1;
 1747                                 vm_page_protect(fs->first_m, VM_PROT_NONE);
 1748                                 vm_page_remove(fs->first_m);
 1749                                 vm_page_rename(fs->m, fs->first_object,
 1750                                                first_pindex);
 1751                                 vm_page_free(fs->first_m);
 1752                                 fs->first_m = fs->m;
 1753                                 fs->m = NULL;
 1754                                 mycpu->gd_cnt.v_cow_optim++;
 1755                         } else {
 1756                                 /*
 1757                                  * Oh, well, let's copy it.
 1758                                  *
 1759                                  * Why are we unmapping the original page
 1760                                  * here?  Well, in short, not all accessors
 1761                                  * of user memory go through the pmap.  The
 1762                                  * procfs code doesn't access user memory
 1763                                  * via a local pmap, so vm_fault_page*()
 1764                                  * can't call pmap_enter().  And the umtx*()
 1765                                  * code may modify the COW'd page via a DMAP
 1766                                  * or kernel mapping and not via the pmap,
 1767                                  * leaving the original page still mapped
 1768                                  * read-only into the pmap.
 1769                                  *
 1770                                  * So we have to remove the page from at
 1771                                  * least the current pmap if it is in it.
 1772                                  * Just remove it from all pmaps.
 1773                                  */
 1774                                 KKASSERT(fs->first_shared == 0);
 1775                                 vm_page_copy(fs->m, fs->first_m);
 1776                                 vm_page_protect(fs->m, VM_PROT_NONE);
 1777                                 vm_page_event(fs->m, VMEVENT_COW);
 1778                         }
 1779 
 1780                         /*
 1781                          * We no longer need the old page or object.
 1782                          */
 1783                         if (fs->m)
 1784                                 release_page(fs);
 1785 
 1786                         /*
 1787                          * We intend to revert to first_object, undo the
 1788                          * chain lock through to that.
 1789                          */
 1790 #if 0
 1791                         if (fs->first_object->backing_object != fs->object)
 1792                                 vm_object_hold(fs->first_object->backing_object);
 1793 #endif
 1794                         vm_object_chain_release_all(
 1795                                         fs->first_object->backing_object,
 1796                                         fs->object);
 1797 #if 0
 1798                         if (fs->first_object->backing_object != fs->object)
 1799                                 vm_object_drop(fs->first_object->backing_object);
 1800 #endif
 1801 
 1802                         /*
 1803                          * fs->object != fs->first_object due to above 
 1804                          * conditional
 1805                          */
 1806                         vm_object_pip_wakeup(fs->object);
 1807                         vm_object_drop(fs->object);
 1808 
 1809                         /*
 1810                          * Only use the new page below...
 1811                          */
 1812                         mycpu->gd_cnt.v_cow_faults++;
 1813                         fs->m = fs->first_m;
 1814                         fs->object = fs->first_object;
 1815                         pindex = first_pindex;
 1816                 } else {
 1817                         /*
 1818                          * If it wasn't a write fault avoid having to copy
 1819                          * the page by mapping it read-only.
 1820                          */
 1821                         fs->prot &= ~VM_PROT_WRITE;
 1822                 }
 1823         }
 1824 
 1825         /*
 1826          * Relock the map if necessary, then check the generation count.
 1827          * relock_map() will update fs->timestamp to account for the
 1828          * relocking if necessary.
 1829          *
 1830          * If the count has changed after relocking then all sorts of
 1831          * crap may have happened and we have to retry.
 1832          *
 1833          * NOTE: The relock_map() can fail due to a deadlock against
 1834          *       the vm_page we are holding BUSY.
 1835          */
 1836         if (fs->lookup_still_valid == FALSE && fs->map) {
 1837                 if (relock_map(fs) ||
 1838                     fs->map->timestamp != fs->map_generation) {
 1839                         release_page(fs);
 1840                         vm_object_pip_wakeup(fs->first_object);
 1841                         vm_object_chain_release_all(fs->first_object,
 1842                                                     fs->object);
 1843                         if (fs->object != fs->first_object)
 1844                                 vm_object_drop(fs->object);
 1845                         unlock_and_deallocate(fs);
 1846                         return (KERN_TRY_AGAIN);
 1847                 }
 1848         }
 1849 
 1850         /*
 1851          * If the fault is a write, we know that this page is being
 1852          * written NOW so dirty it explicitly to save on pmap_is_modified()
 1853          * calls later.
 1854          *
 1855          * If this is a NOSYNC mmap we do not want to set PG_NOSYNC
 1856          * if the page is already dirty; otherwise data written with
 1857          * the expectation of being synced could end up never being synced.
 1858          * Likewise, if this entry does not request NOSYNC, make sure
 1859          * the page isn't marked NOSYNC.  Applications sharing data
 1860          * should use the same flags to avoid ping-ponging.
 1861          *
 1862          * Also tell the backing pager, if any, that it should remove
 1863          * any swap backing since the page is now dirty.
 1864          */
 1865         vm_page_activate(fs->m);
 1866         if (fs->prot & VM_PROT_WRITE) {
 1867                 vm_object_set_writeable_dirty(fs->m->object);
 1868                 vm_set_nosync(fs->m, fs->entry);
 1869                 if (fs->fault_flags & VM_FAULT_DIRTY) {
 1870                         vm_page_dirty(fs->m);
 1871                         swap_pager_unswapped(fs->m);
 1872                 }
 1873         }
 1874 
 1875         vm_object_pip_wakeup(fs->first_object);
 1876         vm_object_chain_release_all(fs->first_object, fs->object);
 1877         if (fs->object != fs->first_object)
 1878                 vm_object_drop(fs->object);
 1879 
 1880         /*
 1881          * Page had better still be busy.  We are still locked up and 
 1882          * fs->object will have another PIP reference if it is not equal
 1883          * to fs->first_object.
 1884          */
 1885         KASSERT(fs->m->flags & PG_BUSY,
 1886                 ("vm_fault: page %p not busy!", fs->m));
 1887 
 1888         /*
 1889          * Sanity check: page must be completely valid or it is not fit to
 1890          * map into user space.  vm_pager_get_page() ensures this.
 1891          */
 1892         if (fs->m->valid != VM_PAGE_BITS_ALL) {
 1893                 vm_page_zero_invalid(fs->m, TRUE);
 1894                 kprintf("Warning: page %p partially invalid on fault\n", fs->m);
 1895         }
 1896         vm_page_flag_clear(fs->m, PG_ZERO);
 1897 
 1898         return (KERN_SUCCESS);
 1899 }
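      /*
       * Summary of the return values produced by the fault-object loop
       * above (restating the code paths shown; descriptive only):
       *
       *	KERN_SUCCESS		page found or created, busied and
       *				fully valid
       *	KERN_TRY_AGAIN		all object/chain/map state was backed
       *				out (busy page, shared->exclusive
       *				upgrade, allocation failure, stale map);
       *				the caller should retry the fault
       *	KERN_PROTECTION_FAILURE	pindex beyond the object, or the pager
       *				returned VM_PAGER_BAD
       *	KERN_FAILURE		VM_PAGER_ERROR on a non-kernel_map fault
       */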
 1900 
 1901 /*
 1902  * Hold each of the physical pages that are mapped by the specified range of
 1903  * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
 1904  * and allow the specified types of access, "prot".  If all of the implied
 1905  * pages are successfully held, then the number of held pages is returned
 1906  * together with pointers to those pages in the array "ma".  However, if any
 1907  * of the pages cannot be held, -1 is returned.
 1908  */
 1909 int
 1910 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
 1911     vm_prot_t prot, vm_page_t *ma, int max_count)
 1912 {
 1913         vm_offset_t start, end;
 1914         int i, npages, error;
 1915 
 1916         start = trunc_page(addr);
 1917         end = round_page(addr + len);
 1918 
 1919         npages = howmany(end - start, PAGE_SIZE);
 1920 
 1921         if (npages > max_count)
 1922                 return -1;
 1923 
 1924         for (i = 0; i < npages; i++) {
 1926                 ma[i] = vm_fault_page_quick(start + (i * PAGE_SIZE),
 1927                         prot,
 1928                         &error);
                      if (ma[i] == NULL) {
                              /*
                               * Could not hold this page.  Unhold anything
                               * already collected and fail per the contract
                               * described above (-1 on failure).
                               */
                              while (--i >= 0)
                                      vm_page_unhold(ma[i]);
                              return -1;
                      }
 1929         }
 1930 
 1931         return npages;
 1932 }
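      /*
       * Usage sketch for vm_fault_quick_hold_pages() (illustrative only;
       * "uaddr", "len" and the use of vm_page_unhold() as the release step
       * are assumptions, not taken from this file):
       *
       *	vm_page_t ma[16];
       *	int i, n;
       *
       *	n = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
       *				      uaddr, len, VM_PROT_READ, ma, 16);
       *	if (n < 0)
       *		return (EFAULT);
       *	... access the held pages ...
       *	for (i = 0; i < n; ++i)
       *		vm_page_unhold(ma[i]);
       */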
 1933 
 1934 /*
 1935  * Wire down a range of virtual addresses in a map.  The entry in question
 1936  * should be marked in-transition and the map must be locked.  We must
 1937  * release the map temporarily while faulting-in the page to avoid a
 1938  * deadlock.  Note that the entry may be clipped while we are blocked but
 1939  * will never be freed.
 1940  *
 1941  * No requirements.
 1942  */
 1943 int
 1944 vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire)
 1945 {
 1946         boolean_t fictitious;
 1947         vm_offset_t start;
 1948         vm_offset_t end;
 1949         vm_offset_t va;
 1950         vm_paddr_t pa;
 1951         vm_page_t m;
 1952         pmap_t pmap;
 1953         int rv;
 1954 
 1955         lwkt_gettoken(&map->token);
 1956 
 1957         pmap = vm_map_pmap(map);
 1958         start = entry->start;
 1959         end = entry->end;
 1960         fictitious = entry->object.vm_object &&
 1961                         ((entry->object.vm_object->type == OBJT_DEVICE) ||
 1962                          (entry->object.vm_object->type == OBJT_MGTDEVICE));
 1963         if (entry->eflags & MAP_ENTRY_KSTACK)
 1964                 start += PAGE_SIZE;
 1965         map->timestamp++;
 1966         vm_map_unlock(map);
 1967 
 1968         /*
 1969          * We simulate a fault to get the page and enter it in the physical
 1970          * map.
 1971          */
 1972         for (va = start; va < end; va += PAGE_SIZE) {
 1973                 if (user_wire) {
 1974                         rv = vm_fault(map, va, VM_PROT_READ, 
 1975                                         VM_FAULT_USER_WIRE);
 1976                 } else {
 1977                         rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE,
 1978                                         VM_FAULT_CHANGE_WIRING);
 1979                 }
 1980                 if (rv) {
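                              /*
                               * Fault failed: back out by unwiring any pages
                               * already wired in this range so the entry is
                               * not left partially wired before returning
                               * the error.
                               */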
 1981                         while (va > start) {
 1982                                 va -= PAGE_SIZE;
 1983                                 if ((pa = pmap_extract(pmap, va)) == 0)
 1984                                         continue;
 1985                                 pmap_change_wiring(pmap, va, FALSE, entry);
 1986                                 if (!fictitious) {
 1987                                         m = PHYS_TO_VM_PAGE(pa);
 1988                                         vm_page_busy_wait(m, FALSE, "vmwrpg");
 1989                                         vm_page_unwire(m, 1);
 1990                                         vm_page_wakeup(m);
 1991                                 }
 1992                         }
 1993                         goto done;
 1994                 }
 1995         }
 1996         rv = KERN_SUCCESS;
 1997 done:
 1998         vm_map_lock(map);
 1999         lwkt_reltoken(&map->token);
 2000         return (rv);
 2001 }
 2002 
 2003 /*
 2004  * Unwire a range of virtual addresses in a map.  The map should be
 2005  * locked.
 2006  */
 2007 void
 2008 vm_fault_unwire(vm_map_t map, vm_map_entry_t entry)
 2009 {
 2010         boolean_t fictitious;
 2011         vm_offset_t start;
 2012         vm_offset_t end;
 2013         vm_offset_t va;
 2014         vm_paddr_t pa;
 2015         vm_page_t m;
 2016         pmap_t pmap;
 2017 
 2018         lwkt_gettoken(&map->token);
 2019 
 2020         pmap = vm_map_pmap(map);
 2021         start = entry->start;
 2022         end = entry->end;
 2023         fictitious = entry->object.vm_object &&
 2024                         ((entry->object.vm_object->type == OBJT_DEVICE) ||
 2025                          (entry->object.vm_object->type == OBJT_MGTDEVICE));
 2026         if (entry->eflags & MAP_ENTRY_KSTACK)
 2027                 start += PAGE_SIZE;
 2028 
 2029         /*
 2030          * Since the pages are wired down, we must be able to get their
 2031          * mappings from the physical map system.
 2032          */
 2033         for (va = start; va < end; va += PAGE_SIZE) {
 2034                 pa = pmap_extract(pmap, va);
 2035                 if (pa != 0) {
 2036                         pmap_change_wiring(pmap, va, FALSE, entry);
 2037                         if (!fictitious) {
 2038                                 m = PHYS_TO_VM_PAGE(pa);
 2039                                 vm_page_busy_wait(m, FALSE, "vmwupg");
 2040                                 vm_page_unwire(m, 1);
 2041                                 vm_page_wakeup(m);
 2042                         }
 2043                 }
 2044         }
 2045         lwkt_reltoken(&map->token);
 2046 }
 2047 
 2048 /*
 2049  * Copy all of the pages from a wired-down map entry to another.
 2050  *
 2051  * The source and destination maps must be locked for write.
 2052  * The source and destination map tokens must be held.
 2053  * The source map entry must be wired down (or be a sharing map
 2054  * entry corresponding to a main map entry that is wired down).
 2055  *
 2056  * No other requirements.
 2057  *
 2058  * XXX do segment optimization
 2059  */
 2060 void
 2061 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
 2062                     vm_map_entry_t dst_entry, vm_map_entry_t src_entry)
 2063 {
 2064         vm_object_t dst_object;
 2065         vm_object_t src_object;
 2066         vm_ooffset_t dst_offset;
 2067         vm_ooffset_t src_offset;
 2068         vm_prot_t prot;
 2069         vm_offset_t vaddr;
 2070         vm_page_t dst_m;
 2071         vm_page_t src_m;
 2072 
 2073         src_object = src_entry->object.vm_object;
 2074         src_offset = src_entry->offset;
 2075 
 2076         /*
 2077          * Create the top-level object for the destination entry. (Doesn't
 2078          * actually shadow anything - we copy the pages directly.)
 2079          */
 2080         vm_map_entry_allocate_object(dst_entry);
 2081         dst_object = dst_entry->object.vm_object;
 2082 
 2083         prot = dst_entry->max_protection;
 2084 
 2085         /*
 2086          * Loop through all of the pages in the entry's range, copying each
 2087          * one from the source object (it should be there) to the destination
 2088          * object.
 2089          */
 2090         vm_object_hold(src_object);
 2091         vm_object_hold(dst_object);
 2092         for (vaddr = dst_entry->start, dst_offset = 0;
 2093             vaddr < dst_entry->end;
 2094             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
 2095 
 2096                 /*
 2097                  * Allocate a page in the destination object
 2098                  */
 2099                 do {
 2100                         dst_m = vm_page_alloc(dst_object,
 2101                                               OFF_TO_IDX(dst_offset),
 2102                                               VM_ALLOC_NORMAL);
 2103                         if (dst_m == NULL) {
 2104                                 vm_wait(0);
 2105                         }
 2106                 } while (dst_m == NULL);
 2107 
 2108                 /*
 2109                  * Find the page in the source object, and copy it in.
 2110                  * (Because the source is wired down, the page will be in
 2111                  * memory.)
 2112                  */
 2113                 src_m = vm_page_lookup(src_object,
 2114                                        OFF_TO_IDX(dst_offset + src_offset));
 2115                 if (src_m == NULL)
 2116                         panic("vm_fault_copy_wired: page missing");
 2117 
 2118                 vm_page_copy(src_m, dst_m);
 2119                 vm_page_event(src_m, VMEVENT_COW);
 2120 
 2121                 /*
 2122                  * Enter it in the pmap...
 2123                  */
 2124 
 2125                 vm_page_flag_clear(dst_m, PG_ZERO);
 2126                 pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry);
 2127 
 2128                 /*
 2129                  * Mark it no longer busy, and put it on the active list.
 2130                  */
 2131                 vm_page_activate(dst_m);
 2132                 vm_page_wakeup(dst_m);
 2133         }
 2134         vm_object_drop(dst_object);
 2135         vm_object_drop(src_object);
 2136 }
 2137 
 2138 #if 0
 2139 
 2140 /*
 2141  * This routine checks around the requested page for other pages that
 2142  * might be able to be faulted in.  This routine brackets the viable
 2143  * pages for the pages to be paged in.
 2144  *
 2145  * Inputs:
 2146  *      m, rbehind, rahead
 2147  *
 2148  * Outputs:
 2149  *  marray (array of vm_page_t), reqpage (index of requested page)
 2150  *
 2151  * Return value:
 2152  *  number of pages in marray
 2153  */
 2154 static int
 2155 vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
 2156                           vm_page_t *marray, int *reqpage)
 2157 {
 2158         int i,j;
 2159         vm_object_t object;
 2160         vm_pindex_t pindex, startpindex, endpindex, tpindex;
 2161         vm_page_t rtm;
 2162         int cbehind, cahead;
 2163 
 2164         object = m->object;
 2165         pindex = m->pindex;
 2166 
 2167         /*
 2168          * we don't fault-ahead for device pager
 2169          */
 2170         if ((object->type == OBJT_DEVICE) ||
 2171             (object->type == OBJT_MGTDEVICE)) {
 2172                 *reqpage = 0;
 2173                 marray[0] = m;
 2174                 return 1;
 2175         }
 2176 
 2177         /*
 2178          * if the requested page is not available, then give up now
 2179          */
 2180         if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
 2181                 *reqpage = 0;   /* not used by caller, fix compiler warn */
 2182                 return 0;
 2183         }
 2184 
 2185         if ((cbehind == 0) && (cahead == 0)) {
 2186                 *reqpage = 0;
 2187                 marray[0] = m;
 2188                 return 1;
 2189         }
 2190 
 2191         if (rahead > cahead) {
 2192                 rahead = cahead;
 2193         }
 2194 
 2195         if (rbehind > cbehind) {
 2196                 rbehind = cbehind;
 2197         }
 2198 
 2199         /*
 2200          * Do not do any readahead if we have insufficient free memory.
 2201          *
 2202          * XXX the code was broken/disabled before and still has instability
 2203          * with this conditional fixed, so shortcut for now.
 2204          */
 2205         if (burst_fault == 0 || vm_page_count_severe()) {
 2206                 marray[0] = m;
 2207                 *reqpage = 0;
 2208                 return 1;
 2209         }
 2210 
 2211         /*
 2212          * scan backward for the read behind pages -- in memory 
 2213          *
 2214          * Assume that if the page is not found an interrupt will not
 2215          * create it.  Theoretically interrupts can only remove (busy)
 2216          * pages, not create new associations.
 2217          */
 2218         if (pindex > 0) {
 2219                 if (rbehind > pindex) {
 2220                         rbehind = pindex;
 2221                         startpindex = 0;
 2222                 } else {
 2223                         startpindex = pindex - rbehind;
 2224                 }
 2225 
 2226                 vm_object_hold(object);
 2227                 for (tpindex = pindex; tpindex > startpindex; --tpindex) {
 2228                         if (vm_page_lookup(object, tpindex - 1))
 2229                                 break;
 2230                 }
 2231 
 2232                 i = 0;
 2233                 while (tpindex < pindex) {
 2234                         rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
 2235                                                              VM_ALLOC_NULL_OK);
 2236                         if (rtm == NULL) {
 2237                                 for (j = 0; j < i; j++) {
 2238                                         vm_page_free(marray[j]);
 2239                                 }
 2240                                 vm_object_drop(object);
 2241                                 marray[0] = m;
 2242                                 *reqpage = 0;
 2243                                 return 1;
 2244                         }
 2245                         marray[i] = rtm;
 2246                         ++i;
 2247                         ++tpindex;
 2248                 }
 2249                 vm_object_drop(object);
 2250         } else {
 2251                 i = 0;
 2252         }
 2253 
 2254         /*
 2255          * Assign requested page
 2256          */
 2257         marray[i] = m;
 2258         *reqpage = i;
 2259         ++i;
 2260 
 2261         /*
 2262          * Scan forwards for read-ahead pages
 2263          */
 2264         tpindex = pindex + 1;
 2265         endpindex = tpindex + rahead;
 2266         if (endpindex > object->size)
 2267                 endpindex = object->size;
 2268 
 2269         vm_object_hold(object);
 2270         while (tpindex < endpindex) {
 2271                 if (vm_page_lookup(object, tpindex))
 2272                         break;
 2273                 rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
 2274                                                      VM_ALLOC_NULL_OK);
 2275                 if (rtm == NULL)
 2276                         break;
 2277                 marray[i] = rtm;
 2278                 ++i;
 2279                 ++tpindex;
 2280         }
 2281         vm_object_drop(object);
 2282 
 2283         return (i);
 2284 }
 2285 
 2286 #endif
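      /*
       * The clustering routine above is compiled out.  Its read-behind /
       * read-ahead role appears to have been superseded by the PG_RAM
       * read-ahead marks and the seqaccess hint passed to
       * vm_pager_get_page() earlier in this file (an inference from the
       * surrounding code rather than a documented statement).
       */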
 2287 
 2288 /*
 2289  * vm_prefault() provides a quick way of clustering pagefaults into a
 2290  * process's address space.  It is a "cousin" of pmap_object_init_pt,
 2291  * except it runs at page fault time instead of mmap time.
 2292  *
 2293  * vm.fast_fault        Enables pre-faulting zero-fill pages
 2294  *
 2295  * vm.prefault_pages    Number of pages (1/2 negative, 1/2 positive) to
 2296  *                      prefault.  Scan stops in either direction when
 2297  *                      a page is found to already exist.
 2298  *
 2299  * This code used to be per-platform pmap_prefault().  It is now
 2300  * machine-independent and enhanced to also pre-fault zero-fill pages
 2301  * (see vm.fast_fault) as well as make them writable, which greatly
 2302  * reduces the number of page faults programs incur.
 2303  *
 2304  * Application performance when pre-faulting zero-fill pages is heavily
 2305  * dependent on the application.  Very tiny applications like /bin/echo
 2306  * lose a little performance while applications of any appreciable size
 2307  * gain performance.  Prefaulting multiple pages also reduces SMP
 2308  * congestion and can improve SMP performance significantly.
 2309  *
 2310  * NOTE!  prot may allow writing but this only applies to the top level
 2311  *        object.  If we wind up mapping a page extracted from a backing
 2312  *        object we have to make sure it is read-only.
 2313  *
 2314  * NOTE!  The caller has already handled any COW operations on the
 2315  *        vm_map_entry via the normal fault code.  Do NOT call this
 2316  *        shortcut unless the normal fault code has run on this entry.
 2317  *
 2318  * The related map must be locked.
 2319  * No other requirements.
 2320  */
 2321 static int vm_prefault_pages = 8;
 2322 SYSCTL_INT(_vm, OID_AUTO, prefault_pages, CTLFLAG_RW, &vm_prefault_pages, 0,
 2323            "Maximum number of pages to pre-fault");
 2324 static int vm_fast_fault = 1;
 2325 SYSCTL_INT(_vm, OID_AUTO, fast_fault, CTLFLAG_RW, &vm_fast_fault, 0,
 2326            "Burst fault zero-fill regions");
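      /*
       * Both knobs are plain run-time sysctls, e.g. (illustrative usage):
       *
       *	sysctl vm.prefault_pages=16
       *	sysctl vm.fast_fault=0
       *
       * vm_prefault() below re-reads vm_prefault_pages on each call and
       * clamps it to 1024 pages, so a change takes effect on the next
       * fault that reaches the prefault path.
       */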
 2327 
 2328 /*
 2329  * Set PG_NOSYNC if the map entry indicates so, but only if the page
 2330  * is not already dirty by other means.  This will prevent passive
 2331  * filesystem syncing as well as 'sync' from writing out the page.
 2332  */
 2333 static void
 2334 vm_set_nosync(vm_page_t m, vm_map_entry_t entry)
 2335 {
 2336         if (entry->eflags & MAP_ENTRY_NOSYNC) {
 2337                 if (m->dirty == 0)
 2338                         vm_page_flag_set(m, PG_NOSYNC);
 2339         } else {
 2340                 vm_page_flag_clear(m, PG_NOSYNC);
 2341         }
 2342 }
 2343 
 2344 static void
 2345 vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
 2346             int fault_flags)
 2347 {
 2348         struct lwp *lp;
 2349         vm_page_t m;
 2350         vm_offset_t addr;
 2351         vm_pindex_t index;
 2352         vm_pindex_t pindex;
 2353         vm_object_t object;
 2354         int pprot;
 2355         int i;
 2356         int noneg;
 2357         int nopos;
 2358         int maxpages;
 2359 
 2360         /*
 2361          * Get stable max count value, disabled if set to 0
 2362          */
 2363         maxpages = vm_prefault_pages;
 2364         cpu_ccfence();
 2365         if (maxpages <= 0)
 2366                 return;
 2367 
 2368         /*
 2369          * We do not currently prefault mappings that use virtual page
 2370          * tables.  We do not prefault foreign pmaps.
 2371          */
 2372         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
 2373                 return;
 2374         lp = curthread->td_lwp;
 2375         if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
 2376                 return;
 2377 
 2378         /*
 2379          * Limit pre-fault count to 1024 pages.
 2380          */
 2381         if (maxpages > 1024)
 2382                 maxpages = 1024;
 2383 
 2384         object = entry->object.vm_object;
 2385         KKASSERT(object != NULL);
 2386         KKASSERT(object == entry->object.vm_object);
 2387         vm_object_hold(object);
 2388         vm_object_chain_acquire(object, 0);
 2389 
 2390         noneg = 0;
 2391         nopos = 0;
 2392         for (i = 0; i < maxpages; ++i) {
 2393                 vm_object_t lobject;
 2394                 vm_object_t nobject;
 2395                 int allocated = 0;
 2396                 int error;
 2397 
 2398                 /*
 2399                  * This can eat a lot of time on a heavily contended
 2400                  * machine so yield on the tick if needed.
 2401                  */
 2402                 if ((i & 7) == 7)
 2403                         lwkt_yield();
 2404 
 2405                 /*
 2406                  * Calculate the page to pre-fault, stopping the scan in
 2407                  * each direction separately if the limit is reached.
 2408                  */
 2409                 if (i & 1) {
 2410                         if (noneg)
 2411                                 continue;
 2412                         addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
 2413                 } else {
 2414                         if (nopos)
 2415                                 continue;
 2416                         addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
 2417                 }
 2418                 if (addr < entry->start) {
 2419                         noneg = 1;
 2420                         if (noneg && nopos)
 2421                                 break;
 2422                         continue;
 2423                 }
 2424                 if (addr >= entry->end) {
 2425                         nopos = 1;
 2426                         if (noneg && nopos)
 2427                                 break;
 2428                         continue;
 2429                 }
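                      /*
                       * Resulting scan order around the faulting address
                       * addra, worked out from the index math above
                       * (PAGE_SIZE steps):  i=0 -> +1, i=1 -> -1,
                       * i=2 -> +2, i=3 -> -2, ...  addra itself is skipped
                       * because the normal fault path has already handled
                       * that page.
                       */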
 2430 
 2431                 /*
 2432                  * Skip pages already mapped, and stop scanning in that
 2433                  * direction.  When the scan terminates in both directions
 2434                  * we are done.
 2435                  */
 2436                 if (pmap_prefault_ok(pmap, addr) == 0) {
 2437                         if (i & 1)
 2438                                 noneg = 1;
 2439                         else
 2440                                 nopos = 1;
 2441                         if (noneg && nopos)
 2442                                 break;
 2443                         continue;
 2444                 }
 2445 
 2446                 /*
 2447                  * Follow the VM object chain to obtain the page to be mapped
 2448                  * into the pmap.
 2449                  *
 2450                  * If we reach the terminal object without finding a page
 2451                  * and we determine it would be advantageous, then allocate
 2452                  * a zero-fill page for the base object.  The base object
 2453                  * is guaranteed to be OBJT_DEFAULT for this case.
 2454                  *
 2455                  * In order to not have to check the pager via *haspage*()
 2456                  * we stop if any non-default object is encountered.  e.g.
 2457                  * a vnode or swap object would stop the loop.
 2458                  */
 2459                 index = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 2460                 lobject = object;
 2461                 pindex = index;
 2462                 pprot = prot;
 2463 
 2464                 KKASSERT(lobject == entry->object.vm_object);
 2465                 /*vm_object_hold(lobject); implied */
 2466 
 2467                 while ((m = vm_page_lookup_busy_try(lobject, pindex,
 2468                                                     TRUE, &error)) == NULL) {
 2469                         if (lobject->type != OBJT_DEFAULT)
 2470                                 break;
 2471                         if (lobject->backing_object == NULL) {
 2472                                 if (vm_fast_fault == 0)
 2473                                         break;
 2474                                 if ((prot & VM_PROT_WRITE) == 0 ||
 2475                                     vm_page_count_min(0)) {
 2476                                         break;
 2477                                 }
 2478 
 2479                                 /*
 2480                                  * NOTE: Allocated from base object
 2481                                  */
 2482                                 m = vm_page_alloc(object, index,
 2483                                                   VM_ALLOC_NORMAL |
 2484                                                   VM_ALLOC_ZERO |
 2485                                                   VM_ALLOC_USE_GD |
 2486                                                   VM_ALLOC_NULL_OK);
 2487                                 if (m == NULL)
 2488                                         break;
 2489                                 allocated = 1;
 2490                                 pprot = prot;
 2491                                 /* lobject = object .. not needed */
 2492                                 break;
 2493                         }
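                              /*
                               * A backing object offset that is not page
                               * aligned cannot be expressed by the simple
                               * pindex adjustment below, so give up on
                               * this address.
                               */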
 2494                         if (lobject->backing_object_offset & PAGE_MASK)
 2495                                 break;
 2496                         nobject = lobject->backing_object;
 2497                         vm_object_hold(nobject);
 2498                         KKASSERT(nobject == lobject->backing_object);
 2499                         pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 2500                         if (lobject != object) {
 2501                                 vm_object_lock_swap();
 2502                                 vm_object_drop(lobject);
 2503                         }
 2504                         lobject = nobject;
 2505                         pprot &= ~VM_PROT_WRITE;
 2506                         vm_object_chain_acquire(lobject, 0);
 2507                 }
 2508 
 2509                 /*
 2510                  * NOTE: A non-NULL (m) will be associated with lobject if
 2511                  *       it was found there, otherwise it is probably a
 2512                  *       zero-fill page associated with the base object.
 2513                  *
 2514                  * Give up if no page is available.
 2515                  */
 2516                 if (m == NULL) {
 2517                         if (lobject != object) {
 2518 #if 0
 2519                                 if (object->backing_object != lobject)
 2520                                         vm_object_hold(object->backing_object);
 2521 #endif
 2522                                 vm_object_chain_release_all(
 2523                                         object->backing_object, lobject);
 2524 #if 0
 2525                                 if (object->backing_object != lobject)
 2526                                         vm_object_drop(object->backing_object);
 2527 #endif
 2528                                 vm_object_drop(lobject);
 2529                         }
 2530                         break;
 2531                 }
 2532 
 2533                 /*
 2534                  * The object must be marked dirty if we are mapping a
 2535                  * writable page.  m->object is either lobject or object,
 2536                  * both of which are still held.  Do this before we
 2537                  * potentially drop the object.
 2538                  */
 2539                 if (pprot & VM_PROT_WRITE)
 2540                         vm_object_set_writeable_dirty(m->object);
 2541 
 2542                 /*
 2543                  * Do not conditionalize on PG_RAM.  If pages are present in
 2544                  * the VM system we assume optimal caching.  If caching is
 2545                  * not optimal the I/O gravy train will be restarted when we
 2546                  * hit an unavailable page.  We do not want to try to restart
 2547                  * the gravy train now because we really don't know how much
 2548                  * of the object has been cached.  The cost for restarting
 2549                  * the gravy train should be low (since accesses will likely
 2550                  * be I/O bound anyway).
 2551                  */
 2552                 if (lobject != object) {
 2553 #if 0
 2554                         if (object->backing_object != lobject)
 2555                                 vm_object_hold(object->backing_object);
 2556 #endif
 2557                         vm_object_chain_release_all(object->backing_object,
 2558                                                     lobject);
 2559 #if 0
 2560                         if (object->backing_object != lobject)
 2561                                 vm_object_drop(object->backing_object);
 2562 #endif
 2563                         vm_object_drop(lobject);
 2564                 }
 2565 
 2566                 /*
 2567                  * Enter the page into the pmap if appropriate.  If we had
 2568                  * allocated the page we have to place it on a queue.  If not
 2569                  * we just have to make sure it isn't on the cache queue
 2570                  * (pages on the cache queue are not allowed to be mapped).
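                       *
                       * Three cases are handled below: (1) a freshly
                       * allocated zero-fill page is zeroed, entered into
                       * the pmap and then deactivated, (2) a page we could
                       * not busy is simply skipped, and (3) a fully valid
                       * resident page is entered into the pmap directly.
                       * Anything else is just woken up and left alone.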
 2571                  */
 2572                 if (allocated) {
 2573                         /*
 2574                          * Page must be zeroed.
 2575                          */
 2576                         if ((m->flags & PG_ZERO) == 0) {
 2577                                 vm_page_zero_fill(m);
 2578                         } else {
 2579 #ifdef PMAP_DEBUG
 2580                                 pmap_page_assertzero(
 2581                                                 VM_PAGE_TO_PHYS(m));
 2582 #endif
 2583                                 vm_page_flag_clear(m, PG_ZERO);
 2584                                 mycpu->gd_cnt.v_ozfod++;
 2585                         }
 2586                         mycpu->gd_cnt.v_zfod++;
 2587                         m->valid = VM_PAGE_BITS_ALL;
 2588 
 2589                         /*
 2590                          * Handle dirty page case
 2591                          */
 2592                         if (pprot & VM_PROT_WRITE)
 2593                                 vm_set_nosync(m, entry);
 2594                         pmap_enter(pmap, addr, m, pprot, 0, entry);
 2595                         mycpu->gd_cnt.v_vm_faults++;
 2596                         if (curthread->td_lwp)
 2597                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
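                              /*
                               * The newly allocated page is not on any
                               * paging queue yet; deactivating it places it
                               * on the inactive queue, presumably so a
                               * prefaulted page that is never touched stays
                               * cheap to reclaim.
                               */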
 2598                         vm_page_deactivate(m);
 2599                         if (pprot & VM_PROT_WRITE) {
 2600                                 /*vm_object_set_writeable_dirty(m->object);*/
 2601                                 vm_set_nosync(m, entry);
 2602                                 if (fault_flags & VM_FAULT_DIRTY) {
 2603                                         vm_page_dirty(m);
 2604                                         /*XXX*/
 2605                                         swap_pager_unswapped(m);
 2606                                 }
 2607                         }
 2608                         vm_page_wakeup(m);
 2609                 } else if (error) {
 2610                         /* couldn't busy page, no wakeup */
 2611                 } else if (
 2612                     ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
 2613                     (m->flags & PG_FICTITIOUS) == 0) {
 2614                         /*
 2615                          * A fully valid page not undergoing soft I/O can
 2616                          * be immediately entered into the pmap.
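                               *
                               * Pages on the cache queue may not be mapped,
                               * so such a page is moved off the cache queue
                               * (deactivated) first.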
 2617                          */
 2618                         if ((m->queue - m->pc) == PQ_CACHE)
 2619                                 vm_page_deactivate(m);
 2620                         if (pprot & VM_PROT_WRITE) {
 2621                                 /*vm_object_set_writeable_dirty(m->object);*/
 2622                                 vm_set_nosync(m, entry);
 2623                                 if (fault_flags & VM_FAULT_DIRTY) {
 2624                                         vm_page_dirty(m);
 2625                                         /*XXX*/
 2626                                         swap_pager_unswapped(m);
 2627                                 }
 2628                         }
 2629                         if (pprot & VM_PROT_WRITE)
 2630                                 vm_set_nosync(m, entry);
 2631                         pmap_enter(pmap, addr, m, pprot, 0, entry);
 2632                         mycpu->gd_cnt.v_vm_faults++;
 2633                         if (curthread->td_lwp)
 2634                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
 2635                         vm_page_wakeup(m);
 2636                 } else {
 2637                         vm_page_wakeup(m);
 2638                 }
 2639         }
 2640         vm_object_chain_release(object);
 2641         vm_object_drop(object);
 2642 }
 2643 
 2644 /*
 2645  * Object can be held shared
 2646  */
 2647 static void
 2648 vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
 2649                   vm_map_entry_t entry, int prot, int fault_flags)
 2650 {
 2651         struct lwp *lp;
 2652         vm_page_t m;
 2653         vm_offset_t addr;
 2654         vm_pindex_t pindex;
 2655         vm_object_t object;
 2656         int i;
 2657         int noneg;
 2658         int nopos;
 2659         int maxpages;
 2660 
 2661         /*
 2662          * Get a stable max count value; prefaulting is disabled if it is 0.
 2663          */
 2664         maxpages = vm_prefault_pages;
 2665         cpu_ccfence();
 2666         if (maxpages <= 0)
 2667                 return;
 2668 
 2669         /*
 2670          * We do not currently prefault mappings that use virtual page
 2671          * tables.  We do not prefault foreign pmaps.
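               *
               * This quick path also requires a terminal object: if the
               * entry's object has a backing chain we return without
               * prefaulting (see the backing_object check below).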
 2672          */
 2673         if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
 2674                 return;
 2675         lp = curthread->td_lwp;
 2676         if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
 2677                 return;
 2678         object = entry->object.vm_object;
 2679         if (object->backing_object != NULL)
 2680                 return;
 2681         ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
 2682 
 2683         /*
 2684          * Limit pre-fault count to 1024 pages.
 2685          */
 2686         if (maxpages > 1024)
 2687                 maxpages = 1024;
 2688 
 2689         noneg = 0;
 2690         nopos = 0;
 2691         for (i = 0; i < maxpages; ++i) {
 2692                 int error;
 2693 
 2694                 /*
 2695                  * Calculate the page to pre-fault, stopping the scan in
 2696                  * each direction separately if the limit is reached.
 2697                  */
 2698                 if (i & 1) {
 2699                         if (noneg)
 2700                                 continue;
 2701                         addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
 2702                 } else {
 2703                         if (nopos)
 2704                                 continue;
 2705                         addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
 2706                 }
 2707                 if (addr < entry->start) {
 2708                         noneg = 1;
 2709                         if (noneg && nopos)
 2710                                 break;
 2711                         continue;
 2712                 }
 2713                 if (addr >= entry->end) {
 2714                         nopos = 1;
 2715                         if (noneg && nopos)
 2716                                 break;
 2717                         continue;
 2718                 }
 2719 
 2720                 /*
 2721                  * Skip pages already mapped, and stop scanning in that
 2722                  * direction.  When the scan terminates in both directions
 2723                  * we are done.
 2724                  */
 2725                 if (pmap_prefault_ok(pmap, addr) == 0) {
 2726                         if (i & 1)
 2727                                 noneg = 1;
 2728                         else
 2729                                 nopos = 1;
 2730                         if (noneg && nopos)
 2731                                 break;
 2732                         continue;
 2733                 }
 2734 
 2735                 /*
 2736                  * Follow the VM object chain to obtain the page to be mapped
 2737                  * into the pmap.  This version of the prefault code only
 2738                  * works with terminal objects.
 2739                  *
 2740                  * WARNING!  We cannot call swap_pager_unswapped() with a
 2741                  *           shared token.
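                       *
                       *           The PG_SWAPPED test in the condition
                       *           below appears to exist for that reason:
                       *           a swapped page that would need
                       *           swap_pager_unswapped() work for a
                       *           writable VM_FAULT_DIRTY mapping is simply
                       *           skipped by this quick path.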
 2742                  */
 2743                 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 2744 
 2745                 m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
 2746                 if (m == NULL || error)
 2747                         continue;
 2748 
 2749                 if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
 2750                     (m->flags & PG_FICTITIOUS) == 0 &&
 2751                     ((m->flags & PG_SWAPPED) == 0 ||
 2752                      (prot & VM_PROT_WRITE) == 0 ||
 2753                      (fault_flags & VM_FAULT_DIRTY) == 0)) {
 2754                         /*
 2755                          * A fully valid page not undergoing soft I/O can
 2756                          * be immediately entered into the pmap.
 2757                          */
 2758                         if ((m->queue - m->pc) == PQ_CACHE)
 2759                                 vm_page_deactivate(m);
 2760                         if (prot & VM_PROT_WRITE) {
 2761                                 vm_object_set_writeable_dirty(m->object);
 2762                                 vm_set_nosync(m, entry);
 2763                                 if (fault_flags & VM_FAULT_DIRTY) {
 2764                                         vm_page_dirty(m);
 2765                                         /*XXX*/
 2766                                         swap_pager_unswapped(m);
 2767                                 }
 2768                         }
 2769                         pmap_enter(pmap, addr, m, prot, 0, entry);
 2770                         mycpu->gd_cnt.v_vm_faults++;
 2771                         if (curthread->td_lwp)
 2772                                 ++curthread->td_lwp->lwp_ru.ru_minflt;
 2773                 }
 2774                 vm_page_wakeup(m);
 2775         }
 2776 }

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.