FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_fault.c
1 /*-
2 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 *
11 *
12 * This code is derived from software contributed to Berkeley by
13 * The Mach Operating System project at Carnegie-Mellon University.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94
44 *
45 *
46 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
47 * All rights reserved.
48 *
49 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
50 *
51 * Permission to use, copy, modify and distribute this software and
52 * its documentation is hereby granted, provided that both the copyright
53 * notice and this permission notice appear in all copies of the
54 * software, derivative works or modified versions, and any portions
55 * thereof, and that both notices appear in supporting documentation.
56 *
57 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
58 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
59 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
60 *
61 * Carnegie Mellon requests users of this software to return to
62 *
63 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
64 * School of Computer Science
65 * Carnegie Mellon University
66 * Pittsburgh PA 15213-3890
67 *
68 * any improvements or extensions that they make and grant Carnegie the
69 * rights to redistribute these changes.
70 */
71
72 /*
73 * Page fault handling module.
74 */
75
76 #include <sys/cdefs.h>
77 __FBSDID("$FreeBSD: releng/12.0/sys/vm/vm_fault.c 338999 2018-09-28 14:11:38Z kib $");
78
79 #include "opt_ktrace.h"
80 #include "opt_vm.h"
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/lock.h>
86 #include <sys/mman.h>
87 #include <sys/proc.h>
88 #include <sys/racct.h>
89 #include <sys/resourcevar.h>
90 #include <sys/rwlock.h>
91 #include <sys/sysctl.h>
92 #include <sys/vmmeter.h>
93 #include <sys/vnode.h>
94 #ifdef KTRACE
95 #include <sys/ktrace.h>
96 #endif
97
98 #include <vm/vm.h>
99 #include <vm/vm_param.h>
100 #include <vm/pmap.h>
101 #include <vm/vm_map.h>
102 #include <vm/vm_object.h>
103 #include <vm/vm_page.h>
104 #include <vm/vm_pageout.h>
105 #include <vm/vm_kern.h>
106 #include <vm/vm_pager.h>
107 #include <vm/vm_extern.h>
108 #include <vm/vm_reserv.h>
109
110 #define PFBAK 4
111 #define PFFOR 4
112
113 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT)
114 #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX)
115
116 #define VM_FAULT_DONTNEED_MIN 1048576
117
118 struct faultstate {
119 vm_page_t m;
120 vm_object_t object;
121 vm_pindex_t pindex;
122 vm_page_t first_m;
123 vm_object_t first_object;
124 vm_pindex_t first_pindex;
125 vm_map_t map;
126 vm_map_entry_t entry;
127 int map_generation;
128 bool lookup_still_valid;
129 struct vnode *vp;
130 };
131
132 static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
133 int ahead);
134 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
135 int backward, int forward, bool obj_locked);
136
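/*
 * Unbusy the page recorded in the fault state, move it to the inactive
 * queue, and clear fs->m.
 */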
137 static inline void
138 release_page(struct faultstate *fs)
139 {
140
141 vm_page_xunbusy(fs->m);
142 vm_page_lock(fs->m);
143 vm_page_deactivate(fs->m);
144 vm_page_unlock(fs->m);
145 fs->m = NULL;
146 }
147
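/*
 * Drop the map lookup lock, if it is still held, and record that the
 * lookup is no longer valid.
 */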
148 static inline void
149 unlock_map(struct faultstate *fs)
150 {
151
152 if (fs->lookup_still_valid) {
153 vm_map_lookup_done(fs->map, fs->entry);
154 fs->lookup_still_valid = false;
155 }
156 }
157
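/*
 * Release the vnode held by the fault state, if any.
 */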
158 static void
159 unlock_vp(struct faultstate *fs)
160 {
161
162 if (fs->vp != NULL) {
163 vput(fs->vp);
164 fs->vp = NULL;
165 }
166 }
167
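/*
 * Release the locks and references held by the fault state: wake up
 * paging-in-progress waiters, free the placeholder page left in the
 * top-level object when the fault descended into a backing object, drop
 * the reference on the top-level object, and unlock the map and vnode.
 */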
168 static void
169 unlock_and_deallocate(struct faultstate *fs)
170 {
171
172 vm_object_pip_wakeup(fs->object);
173 VM_OBJECT_WUNLOCK(fs->object);
174 if (fs->object != fs->first_object) {
175 VM_OBJECT_WLOCK(fs->first_object);
176 vm_page_lock(fs->first_m);
177 vm_page_free(fs->first_m);
178 vm_page_unlock(fs->first_m);
179 vm_object_pip_wakeup(fs->first_object);
180 VM_OBJECT_WUNLOCK(fs->first_object);
181 fs->first_m = NULL;
182 }
183 vm_object_deallocate(fs->first_object);
184 unlock_map(fs);
185 unlock_vp(fs);
186 }
187
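/*
 * Perform the dirty-page accounting for a fault on page "m": mark the
 * object as containing writeable/dirty pages when requested, maintain
 * VPO_NOSYNC according to the map entry, and dirty the page itself for
 * write faults.
 */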
188 static void
189 vm_fault_dirty(vm_map_entry_t entry, vm_page_t m, vm_prot_t prot,
190 vm_prot_t fault_type, int fault_flags, bool set_wd)
191 {
192 bool need_dirty;
193
194 if (((prot & VM_PROT_WRITE) == 0 &&
195 (fault_flags & VM_FAULT_DIRTY) == 0) ||
196 (m->oflags & VPO_UNMANAGED) != 0)
197 return;
198
199 VM_OBJECT_ASSERT_LOCKED(m->object);
200
201 need_dirty = ((fault_type & VM_PROT_WRITE) != 0 &&
202 (fault_flags & VM_FAULT_WIRE) == 0) ||
203 (fault_flags & VM_FAULT_DIRTY) != 0;
204
205 if (set_wd)
206 vm_object_set_writeable_dirty(m->object);
207 else
208 /*
209 			 * If two callers of vm_fault_dirty() with set_wd ==
210 			 * FALSE race, one for a map entry with the
211 			 * MAP_ENTRY_NOSYNC flag set and the other with the
212 			 * flag clear, it is possible for the no-NOSYNC
213 			 * thread to see m->dirty != 0 and fail to clear
214 			 * VPO_NOSYNC.  Take the vm_page lock around the
215 			 * manipulation of VPO_NOSYNC and the vm_page_dirty()
216 			 * call, to avoid the race and keep m->oflags consistent.
217 */
218 vm_page_lock(m);
219
220 /*
221 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
222 * if the page is already dirty to prevent data written with
223 * the expectation of being synced from not being synced.
224 * Likewise if this entry does not request NOSYNC then make
225 * sure the page isn't marked NOSYNC. Applications sharing
226 * data should use the same flags to avoid ping ponging.
227 */
228 if ((entry->eflags & MAP_ENTRY_NOSYNC) != 0) {
229 if (m->dirty == 0) {
230 m->oflags |= VPO_NOSYNC;
231 }
232 } else {
233 m->oflags &= ~VPO_NOSYNC;
234 }
235
236 /*
237 * If the fault is a write, we know that this page is being
238 * written NOW so dirty it explicitly to save on
239 * pmap_is_modified() calls later.
240 *
241 * Also, since the page is now dirty, we can possibly tell
242 * the pager to release any swap backing the page. Calling
243 * the pager requires a write lock on the object.
244 */
245 if (need_dirty)
246 vm_page_dirty(m);
247 if (!set_wd)
248 vm_page_unlock(m);
249 else if (need_dirty)
250 vm_pager_page_unswapped(m);
251 }
252
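/*
 * If the caller requested that the faulted-on page be held, record and
 * hold it.
 */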
253 static void
254 vm_fault_fill_hold(vm_page_t *m_hold, vm_page_t m)
255 {
256
257 if (m_hold != NULL) {
258 *m_hold = m;
259 vm_page_lock(m);
260 vm_page_hold(m);
261 vm_page_unlock(m);
262 }
263 }
264
265 /*
266 * Unlocks fs.first_object and fs.map on success.
267 */
268 static int
269 vm_fault_soft_fast(struct faultstate *fs, vm_offset_t vaddr, vm_prot_t prot,
270 int fault_type, int fault_flags, boolean_t wired, vm_page_t *m_hold)
271 {
272 vm_page_t m, m_map;
273 #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
274 __ARM_ARCH >= 6) || defined(__i386__)) && VM_NRESERVLEVEL > 0
275 vm_page_t m_super;
276 int flags;
277 #endif
278 int psind, rv;
279
280 MPASS(fs->vp == NULL);
281 m = vm_page_lookup(fs->first_object, fs->first_pindex);
282 /* A busy page can be mapped for read|execute access. */
283 if (m == NULL || ((prot & VM_PROT_WRITE) != 0 &&
284 vm_page_busied(m)) || m->valid != VM_PAGE_BITS_ALL)
285 return (KERN_FAILURE);
286 m_map = m;
287 psind = 0;
288 #if (defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
289 __ARM_ARCH >= 6) || defined(__i386__)) && VM_NRESERVLEVEL > 0
290 if ((m->flags & PG_FICTITIOUS) == 0 &&
291 (m_super = vm_reserv_to_superpage(m)) != NULL &&
292 rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
293 roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
294 (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
295 (pagesizes[m_super->psind] - 1)) &&
296 pmap_ps_enabled(fs->map->pmap)) {
297 flags = PS_ALL_VALID;
298 if ((prot & VM_PROT_WRITE) != 0) {
299 /*
300 * Create a superpage mapping allowing write access
301 * only if none of the constituent pages are busy and
302 * all of them are already dirty (except possibly for
303 * the page that was faulted on).
304 */
305 flags |= PS_NONE_BUSY;
306 if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
307 flags |= PS_ALL_DIRTY;
308 }
309 if (vm_page_ps_test(m_super, flags, m)) {
310 m_map = m_super;
311 psind = m_super->psind;
312 vaddr = rounddown2(vaddr, pagesizes[psind]);
313 /* Preset the modified bit for dirty superpages. */
314 if ((flags & PS_ALL_DIRTY) != 0)
315 fault_type |= VM_PROT_WRITE;
316 }
317 }
318 #endif
319 rv = pmap_enter(fs->map->pmap, vaddr, m_map, prot, fault_type |
320 PMAP_ENTER_NOSLEEP | (wired ? PMAP_ENTER_WIRED : 0), psind);
321 if (rv != KERN_SUCCESS)
322 return (rv);
323 vm_fault_fill_hold(m_hold, m);
324 vm_fault_dirty(fs->entry, m, prot, fault_type, fault_flags, false);
325 if (psind == 0 && !wired)
326 vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
327 VM_OBJECT_RUNLOCK(fs->first_object);
328 vm_map_lookup_done(fs->map, fs->entry);
329 curthread->td_ru.ru_minflt++;
330 return (KERN_SUCCESS);
331 }
332
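/*
 * Reacquire the map's read lock after a pager call, temporarily dropping
 * the top-level object lock if the map lock cannot be acquired without
 * sleeping.
 */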
333 static void
334 vm_fault_restore_map_lock(struct faultstate *fs)
335 {
336
337 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
338 MPASS(fs->first_object->paging_in_progress > 0);
339
340 if (!vm_map_trylock_read(fs->map)) {
341 VM_OBJECT_WUNLOCK(fs->first_object);
342 vm_map_lock_read(fs->map);
343 VM_OBJECT_WLOCK(fs->first_object);
344 }
345 fs->lookup_still_valid = true;
346 }
347
348 static void
349 vm_fault_populate_check_page(vm_page_t m)
350 {
351
352 /*
353 * Check each page to ensure that the pager is obeying the
354 * interface: the page must be installed in the object, fully
355 * valid, and exclusively busied.
356 */
357 MPASS(m != NULL);
358 MPASS(m->valid == VM_PAGE_BITS_ALL);
359 MPASS(vm_page_xbusied(m));
360 }
361
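/*
 * Deactivate and unbusy the pages in the range [first, last] that were
 * populated by the pager but will not be mapped by the current fault.
 */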
362 static void
363 vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
364 vm_pindex_t last)
365 {
366 vm_page_t m;
367 vm_pindex_t pidx;
368
369 VM_OBJECT_ASSERT_WLOCKED(object);
370 MPASS(first <= last);
371 for (pidx = first, m = vm_page_lookup(object, pidx);
372 pidx <= last; pidx++, m = vm_page_next(m)) {
373 vm_fault_populate_check_page(m);
374 vm_page_lock(m);
375 vm_page_deactivate(m);
376 vm_page_unlock(m);
377 vm_page_xunbusy(m);
378 }
379 }
380
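/*
 * Service a fault on a map entry backed by an OBJ_POPULATE object: ask
 * the pager's populate() method to produce a run of pages and enter them
 * into the pmap, possibly as superpage mappings, in a single pass.
 */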
381 static int
382 vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type,
383 int fault_flags, boolean_t wired, vm_page_t *m_hold)
384 {
385 struct mtx *m_mtx;
386 vm_offset_t vaddr;
387 vm_page_t m;
388 vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
389 int i, npages, psind, rv;
390
391 MPASS(fs->object == fs->first_object);
392 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
393 MPASS(fs->first_object->paging_in_progress > 0);
394 MPASS(fs->first_object->backing_object == NULL);
395 MPASS(fs->lookup_still_valid);
396
397 pager_first = OFF_TO_IDX(fs->entry->offset);
398 pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
399 unlock_map(fs);
400 unlock_vp(fs);
401
402 /*
403 * Call the pager (driver) populate() method.
404 *
405 * There is no guarantee that the method will be called again
406 * if the current fault is for read, and a future fault is
407 * for write. Report the entry's maximum allowed protection
408 * to the driver.
409 */
410 rv = vm_pager_populate(fs->first_object, fs->first_pindex,
411 fault_type, fs->entry->max_protection, &pager_first, &pager_last);
412
413 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
414 if (rv == VM_PAGER_BAD) {
415 /*
416 * VM_PAGER_BAD is the backdoor for a pager to request
417 * normal fault handling.
418 */
419 vm_fault_restore_map_lock(fs);
420 if (fs->map->timestamp != fs->map_generation)
421 return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
422 return (KERN_NOT_RECEIVER);
423 }
424 if (rv != VM_PAGER_OK)
425 return (KERN_FAILURE); /* AKA SIGSEGV */
426
427 /* Ensure that the driver is obeying the interface. */
428 MPASS(pager_first <= pager_last);
429 MPASS(fs->first_pindex <= pager_last);
430 MPASS(fs->first_pindex >= pager_first);
431 MPASS(pager_last < fs->first_object->size);
432
433 vm_fault_restore_map_lock(fs);
434 if (fs->map->timestamp != fs->map_generation) {
435 vm_fault_populate_cleanup(fs->first_object, pager_first,
436 pager_last);
437 return (KERN_RESOURCE_SHORTAGE); /* RetryFault */
438 }
439
440 /*
441 * The map is unchanged after our last unlock. Process the fault.
442 *
443 * The range [pager_first, pager_last] that is given to the
444 * pager is only a hint. The pager may populate any range
445 * within the object that includes the requested page index.
446 * In case the pager expanded the range, clip it to fit into
447 * the map entry.
448 */
449 map_first = OFF_TO_IDX(fs->entry->offset);
450 if (map_first > pager_first) {
451 vm_fault_populate_cleanup(fs->first_object, pager_first,
452 map_first - 1);
453 pager_first = map_first;
454 }
455 map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
456 if (map_last < pager_last) {
457 vm_fault_populate_cleanup(fs->first_object, map_last + 1,
458 pager_last);
459 pager_last = map_last;
460 }
461 for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
462 pidx <= pager_last;
463 pidx += npages, m = vm_page_next(&m[npages - 1])) {
464 vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
465 #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \
466 __ARM_ARCH >= 6) || defined(__i386__)
467 psind = m->psind;
468 if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
469 pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
470 !pmap_ps_enabled(fs->map->pmap)))
471 psind = 0;
472 #else
473 psind = 0;
474 #endif
475 npages = atop(pagesizes[psind]);
476 for (i = 0; i < npages; i++) {
477 vm_fault_populate_check_page(&m[i]);
478 vm_fault_dirty(fs->entry, &m[i], prot, fault_type,
479 fault_flags, true);
480 }
481 VM_OBJECT_WUNLOCK(fs->first_object);
482 pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ?
483 PMAP_ENTER_WIRED : 0), psind);
484 VM_OBJECT_WLOCK(fs->first_object);
485 m_mtx = NULL;
486 for (i = 0; i < npages; i++) {
487 vm_page_change_lock(&m[i], &m_mtx);
488 if ((fault_flags & VM_FAULT_WIRE) != 0)
489 vm_page_wire(&m[i]);
490 else
491 vm_page_activate(&m[i]);
492 if (m_hold != NULL && m[i].pindex == fs->first_pindex) {
493 *m_hold = &m[i];
494 vm_page_hold(&m[i]);
495 }
496 vm_page_xunbusy_maybelocked(&m[i]);
497 }
498 if (m_mtx != NULL)
499 mtx_unlock(m_mtx);
500 }
501 curthread->td_ru.ru_majflt++;
502 return (KERN_SUCCESS);
503 }
504
505 /*
506 * vm_fault:
507 *
508 * Handle a page fault occurring at the given address,
509 * requiring the given permissions, in the map specified.
510 * If successful, the page is inserted into the
511 * associated physical map.
512 *
513 * NOTE: the given address should be truncated to the
514 * proper page address.
515 *
516 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
517 * a standard error specifying why the fault is fatal is returned.
518 *
519 * The map in question must be referenced, and remains so.
520 * Caller may hold no locks.
521 */
522 int
523 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
524 int fault_flags)
525 {
526 struct thread *td;
527 int result;
528
529 td = curthread;
530 if ((td->td_pflags & TDP_NOFAULTING) != 0)
531 return (KERN_PROTECTION_FAILURE);
532 #ifdef KTRACE
533 if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
534 ktrfault(vaddr, fault_type);
535 #endif
536 result = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
537 NULL);
538 #ifdef KTRACE
539 if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
540 ktrfaultend(result);
541 #endif
542 return (result);
543 }
544
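/*
 * vm_fault_hold() does the work of vm_fault() and, when "m_hold" is not
 * NULL, additionally returns the faulted-on page held for the caller.
 */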
545 int
546 vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
547 int fault_flags, vm_page_t *m_hold)
548 {
549 struct faultstate fs;
550 struct vnode *vp;
551 struct domainset *dset;
552 vm_object_t next_object, retry_object;
553 vm_offset_t e_end, e_start;
554 vm_pindex_t retry_pindex;
555 vm_prot_t prot, retry_prot;
556 int ahead, alloc_req, behind, cluster_offset, error, era, faultcount;
557 int locked, nera, result, rv;
558 u_char behavior;
559 boolean_t wired; /* Passed by reference. */
560 bool dead, hardfault, is_first_object_locked;
561
562 VM_CNT_INC(v_vm_faults);
563 fs.vp = NULL;
564 faultcount = 0;
565 nera = -1;
566 hardfault = false;
567
568 RetryFault:;
569
570 /*
571 * Find the backing store object and offset into it to begin the
572 * search.
573 */
574 fs.map = map;
575 result = vm_map_lookup(&fs.map, vaddr, fault_type |
576 VM_PROT_FAULT_LOOKUP, &fs.entry, &fs.first_object,
577 &fs.first_pindex, &prot, &wired);
578 if (result != KERN_SUCCESS) {
579 unlock_vp(&fs);
580 return (result);
581 }
582
583 fs.map_generation = fs.map->timestamp;
584
585 if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
586 panic("%s: fault on nofault entry, addr: %#lx",
587 __func__, (u_long)vaddr);
588 }
589
590 if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
591 fs.entry->wiring_thread != curthread) {
592 vm_map_unlock_read(fs.map);
593 vm_map_lock(fs.map);
594 if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
595 (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
596 unlock_vp(&fs);
597 fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
598 vm_map_unlock_and_wait(fs.map, 0);
599 } else
600 vm_map_unlock(fs.map);
601 goto RetryFault;
602 }
603
604 MPASS((fs.entry->eflags & MAP_ENTRY_GUARD) == 0);
605
606 if (wired)
607 fault_type = prot | (fault_type & VM_PROT_COPY);
608 else
609 KASSERT((fault_flags & VM_FAULT_WIRE) == 0,
610 ("!wired && VM_FAULT_WIRE"));
611
612 /*
613 * Try to avoid lock contention on the top-level object through
614 * special-case handling of some types of page faults, specifically,
615 * those that are both (1) mapping an existing page from the top-
616 * level object and (2) not having to mark that object as containing
617 * dirty pages. Under these conditions, a read lock on the top-level
618 * object suffices, allowing multiple page faults of a similar type to
619 * run in parallel on the same top-level object.
620 */
621 if (fs.vp == NULL /* avoid locked vnode leak */ &&
622 (fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0 &&
623 /* avoid calling vm_object_set_writeable_dirty() */
624 ((prot & VM_PROT_WRITE) == 0 ||
625 (fs.first_object->type != OBJT_VNODE &&
626 (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
627 (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0)) {
628 VM_OBJECT_RLOCK(fs.first_object);
629 if ((prot & VM_PROT_WRITE) == 0 ||
630 (fs.first_object->type != OBJT_VNODE &&
631 (fs.first_object->flags & OBJ_TMPFS_NODE) == 0) ||
632 (fs.first_object->flags & OBJ_MIGHTBEDIRTY) != 0) {
633 rv = vm_fault_soft_fast(&fs, vaddr, prot, fault_type,
634 fault_flags, wired, m_hold);
635 if (rv == KERN_SUCCESS)
636 return (rv);
637 }
638 if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
639 VM_OBJECT_RUNLOCK(fs.first_object);
640 VM_OBJECT_WLOCK(fs.first_object);
641 }
642 } else {
643 VM_OBJECT_WLOCK(fs.first_object);
644 }
645
646 /*
647 * Make a reference to this object to prevent its disposal while we
648 * are messing with it. Once we have the reference, the map is free
649 * to be diddled. Since objects reference their shadows (and copies),
650 * they will stay around as well.
651 *
652 * Bump the paging-in-progress count to prevent size changes (e.g.
653 * truncation operations) during I/O.
654 */
655 vm_object_reference_locked(fs.first_object);
656 vm_object_pip_add(fs.first_object, 1);
657
658 fs.lookup_still_valid = true;
659
660 fs.first_m = NULL;
661
662 /*
663 * Search for the page at object/offset.
664 */
665 fs.object = fs.first_object;
666 fs.pindex = fs.first_pindex;
667 while (TRUE) {
668 /*
669 * If the object is marked for imminent termination,
670 * we retry here, since the collapse pass has raced
671 		 * with us. Otherwise, if we see a terminally dead
672 		 * object, return failure.
673 */
674 if ((fs.object->flags & OBJ_DEAD) != 0) {
675 dead = fs.object->type == OBJT_DEAD;
676 unlock_and_deallocate(&fs);
677 if (dead)
678 return (KERN_PROTECTION_FAILURE);
679 pause("vmf_de", 1);
680 goto RetryFault;
681 }
682
683 /*
684 * See if page is resident
685 */
686 fs.m = vm_page_lookup(fs.object, fs.pindex);
687 if (fs.m != NULL) {
688 /*
689 * Wait/Retry if the page is busy. We have to do this
690 * if the page is either exclusive or shared busy
691 * because the vm_pager may be using read busy for
692 * pageouts (and even pageins if it is the vnode
693 * pager), and we could end up trying to pagein and
694 * pageout the same page simultaneously.
695 *
696 * We can theoretically allow the busy case on a read
697 * fault if the page is marked valid, but since such
698 * pages are typically already pmap'd, putting that
699 			 * special case in might be more effort than it is
700 * worth. We cannot under any circumstances mess
701 * around with a shared busied page except, perhaps,
702 * to pmap it.
703 */
704 if (vm_page_busied(fs.m)) {
705 /*
706 * Reference the page before unlocking and
707 * sleeping so that the page daemon is less
708 * likely to reclaim it.
709 */
710 vm_page_aflag_set(fs.m, PGA_REFERENCED);
711 if (fs.object != fs.first_object) {
712 if (!VM_OBJECT_TRYWLOCK(
713 fs.first_object)) {
714 VM_OBJECT_WUNLOCK(fs.object);
715 VM_OBJECT_WLOCK(fs.first_object);
716 VM_OBJECT_WLOCK(fs.object);
717 }
718 vm_page_lock(fs.first_m);
719 vm_page_free(fs.first_m);
720 vm_page_unlock(fs.first_m);
721 vm_object_pip_wakeup(fs.first_object);
722 VM_OBJECT_WUNLOCK(fs.first_object);
723 fs.first_m = NULL;
724 }
725 unlock_map(&fs);
726 if (fs.m == vm_page_lookup(fs.object,
727 fs.pindex)) {
728 vm_page_sleep_if_busy(fs.m, "vmpfw");
729 }
730 vm_object_pip_wakeup(fs.object);
731 VM_OBJECT_WUNLOCK(fs.object);
732 VM_CNT_INC(v_intrans);
733 vm_object_deallocate(fs.first_object);
734 goto RetryFault;
735 }
736
737 /*
738 * Mark page busy for other processes, and the
739 * pagedaemon. If it still isn't completely valid
740 * (readable), jump to readrest, else break-out ( we
741 * found the page ).
742 */
743 vm_page_xbusy(fs.m);
744 if (fs.m->valid != VM_PAGE_BITS_ALL)
745 goto readrest;
746 break; /* break to PAGE HAS BEEN FOUND */
747 }
748 KASSERT(fs.m == NULL, ("fs.m should be NULL, not %p", fs.m));
749
750 /*
751 * Page is not resident. If the pager might contain the page
752 * or this is the beginning of the search, allocate a new
753 * page. (Default objects are zero-fill, so there is no real
754 * pager for them.)
755 */
756 if (fs.object->type != OBJT_DEFAULT ||
757 fs.object == fs.first_object) {
758 if (fs.pindex >= fs.object->size) {
759 unlock_and_deallocate(&fs);
760 return (KERN_PROTECTION_FAILURE);
761 }
762
763 if (fs.object == fs.first_object &&
764 (fs.first_object->flags & OBJ_POPULATE) != 0 &&
765 fs.first_object->shadow_count == 0) {
766 rv = vm_fault_populate(&fs, prot, fault_type,
767 fault_flags, wired, m_hold);
768 switch (rv) {
769 case KERN_SUCCESS:
770 case KERN_FAILURE:
771 unlock_and_deallocate(&fs);
772 return (rv);
773 case KERN_RESOURCE_SHORTAGE:
774 unlock_and_deallocate(&fs);
775 goto RetryFault;
776 case KERN_NOT_RECEIVER:
777 /*
778 * Pager's populate() method
779 * returned VM_PAGER_BAD.
780 */
781 break;
782 default:
783 panic("inconsistent return codes");
784 }
785 }
786
787 /*
788 * Allocate a new page for this object/offset pair.
789 *
790 			 * An unlocked read of the p_flag is harmless. At
791 			 * worst, the P_KILLED flag might not be observed
792 			 * there, and allocation can fail, causing a
793 			 * restart and a new read of the p_flag.
794 */
795 dset = fs.object->domain.dr_policy;
796 if (dset == NULL)
797 dset = curthread->td_domain.dr_policy;
798 if (!vm_page_count_severe_set(&dset->ds_mask) ||
799 P_KILLED(curproc)) {
800 #if VM_NRESERVLEVEL > 0
801 vm_object_color(fs.object, atop(vaddr) -
802 fs.pindex);
803 #endif
804 alloc_req = P_KILLED(curproc) ?
805 VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
806 if (fs.object->type != OBJT_VNODE &&
807 fs.object->backing_object == NULL)
808 alloc_req |= VM_ALLOC_ZERO;
809 fs.m = vm_page_alloc(fs.object, fs.pindex,
810 alloc_req);
811 }
812 if (fs.m == NULL) {
813 unlock_and_deallocate(&fs);
814 vm_waitpfault(dset);
815 goto RetryFault;
816 }
817 }
818
819 readrest:
820 /*
821 * At this point, we have either allocated a new page or found
822 * an existing page that is only partially valid.
823 *
824 * We hold a reference on the current object and the page is
825 * exclusive busied.
826 */
827
828 /*
829 * If the pager for the current object might have the page,
830 * then determine the number of additional pages to read and
831 * potentially reprioritize previously read pages for earlier
832 * reclamation. These operations should only be performed
833 * once per page fault. Even if the current pager doesn't
834 * have the page, the number of additional pages to read will
835 * apply to subsequent objects in the shadow chain.
836 */
837 if (fs.object->type != OBJT_DEFAULT && nera == -1 &&
838 !P_KILLED(curproc)) {
839 KASSERT(fs.lookup_still_valid, ("map unlocked"));
840 era = fs.entry->read_ahead;
841 behavior = vm_map_entry_behavior(fs.entry);
842 if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
843 nera = 0;
844 } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
845 nera = VM_FAULT_READ_AHEAD_MAX;
846 if (vaddr == fs.entry->next_read)
847 vm_fault_dontneed(&fs, vaddr, nera);
848 } else if (vaddr == fs.entry->next_read) {
849 /*
850 * This is a sequential fault. Arithmetically
851 * increase the requested number of pages in
852 * the read-ahead window. The requested
853 * number of pages is "# of sequential faults
854 * x (read ahead min + 1) + read ahead min"
855 */
856 nera = VM_FAULT_READ_AHEAD_MIN;
857 if (era > 0) {
858 nera += era + 1;
859 if (nera > VM_FAULT_READ_AHEAD_MAX)
860 nera = VM_FAULT_READ_AHEAD_MAX;
861 }
862 if (era == VM_FAULT_READ_AHEAD_MAX)
863 vm_fault_dontneed(&fs, vaddr, nera);
864 } else {
865 /*
866 * This is a non-sequential fault.
867 */
868 nera = 0;
869 }
870 if (era != nera) {
871 /*
872 * A read lock on the map suffices to update
873 * the read ahead count safely.
874 */
875 fs.entry->read_ahead = nera;
876 }
877
878 /*
879 * Prepare for unlocking the map. Save the map
880 * entry's start and end addresses, which are used to
881 * optimize the size of the pager operation below.
882 * Even if the map entry's addresses change after
883 * unlocking the map, using the saved addresses is
884 * safe.
885 */
886 e_start = fs.entry->start;
887 e_end = fs.entry->end;
888 }
889
890 /*
891 * Call the pager to retrieve the page if there is a chance
892 * that the pager has it, and potentially retrieve additional
893 * pages at the same time.
894 */
895 if (fs.object->type != OBJT_DEFAULT) {
896 /*
897 * Release the map lock before locking the vnode or
898 * sleeping in the pager. (If the current object has
899 * a shadow, then an earlier iteration of this loop
900 * may have already unlocked the map.)
901 */
902 unlock_map(&fs);
903
904 if (fs.object->type == OBJT_VNODE &&
905 (vp = fs.object->handle) != fs.vp) {
906 /*
907 * Perform an unlock in case the desired vnode
908 * changed while the map was unlocked during a
909 * retry.
910 */
911 unlock_vp(&fs);
912
913 locked = VOP_ISLOCKED(vp);
914 if (locked != LK_EXCLUSIVE)
915 locked = LK_SHARED;
916
917 /*
918 * We must not sleep acquiring the vnode lock
919 * while we have the page exclusive busied or
920 * the object's paging-in-progress count
921 * incremented. Otherwise, we could deadlock.
922 */
923 error = vget(vp, locked | LK_CANRECURSE |
924 LK_NOWAIT, curthread);
925 if (error != 0) {
926 vhold(vp);
927 release_page(&fs);
928 unlock_and_deallocate(&fs);
929 error = vget(vp, locked | LK_RETRY |
930 LK_CANRECURSE, curthread);
931 vdrop(vp);
932 fs.vp = vp;
933 KASSERT(error == 0,
934 ("vm_fault: vget failed"));
935 goto RetryFault;
936 }
937 fs.vp = vp;
938 }
939 KASSERT(fs.vp == NULL || !fs.map->system_map,
940 ("vm_fault: vnode-backed object mapped by system map"));
941
942 /*
943 			 * Page in the requested page and hint to the pager
944 			 * that it may bring in surrounding pages.
945 */
946 if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
947 P_KILLED(curproc)) {
948 behind = 0;
949 ahead = 0;
950 } else {
951 /* Is this a sequential fault? */
952 if (nera > 0) {
953 behind = 0;
954 ahead = nera;
955 } else {
956 /*
957 * Request a cluster of pages that is
958 * aligned to a VM_FAULT_READ_DEFAULT
959 * page offset boundary within the
960 * object. Alignment to a page offset
961 * boundary is more likely to coincide
962 * with the underlying file system
963 * block than alignment to a virtual
964 * address boundary.
965 */
966 cluster_offset = fs.pindex %
967 VM_FAULT_READ_DEFAULT;
968 behind = ulmin(cluster_offset,
969 atop(vaddr - e_start));
970 ahead = VM_FAULT_READ_DEFAULT - 1 -
971 cluster_offset;
972 }
973 ahead = ulmin(ahead, atop(e_end - vaddr) - 1);
974 }
975 rv = vm_pager_get_pages(fs.object, &fs.m, 1,
976 &behind, &ahead);
977 if (rv == VM_PAGER_OK) {
978 faultcount = behind + 1 + ahead;
979 hardfault = true;
980 break; /* break to PAGE HAS BEEN FOUND */
981 }
982 if (rv == VM_PAGER_ERROR)
983 printf("vm_fault: pager read error, pid %d (%s)\n",
984 curproc->p_pid, curproc->p_comm);
985
986 /*
987 * If an I/O error occurred or the requested page was
988 * outside the range of the pager, clean up and return
989 * an error.
990 */
991 if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
992 vm_page_lock(fs.m);
993 if (fs.m->wire_count == 0)
994 vm_page_free(fs.m);
995 else
996 vm_page_xunbusy_maybelocked(fs.m);
997 vm_page_unlock(fs.m);
998 fs.m = NULL;
999 unlock_and_deallocate(&fs);
1000 return (rv == VM_PAGER_ERROR ? KERN_FAILURE :
1001 KERN_PROTECTION_FAILURE);
1002 }
1003
1004 /*
1005 * The requested page does not exist at this object/
1006 * offset. Remove the invalid page from the object,
1007 * waking up anyone waiting for it, and continue on to
1008 * the next object. However, if this is the top-level
1009 * object, we must leave the busy page in place to
1010 * prevent another process from rushing past us, and
1011 * inserting the page in that object at the same time
1012 * that we are.
1013 */
1014 if (fs.object != fs.first_object) {
1015 vm_page_lock(fs.m);
1016 if (fs.m->wire_count == 0)
1017 vm_page_free(fs.m);
1018 else
1019 vm_page_xunbusy_maybelocked(fs.m);
1020 vm_page_unlock(fs.m);
1021 fs.m = NULL;
1022 }
1023 }
1024
1025 /*
1026 		 * We get here if the object has a default pager (or unwiring)
1027 * or the pager doesn't have the page.
1028 */
1029 if (fs.object == fs.first_object)
1030 fs.first_m = fs.m;
1031
1032 /*
1033 * Move on to the next object. Lock the next object before
1034 * unlocking the current one.
1035 */
1036 next_object = fs.object->backing_object;
1037 if (next_object == NULL) {
1038 /*
1039 * If there's no object left, fill the page in the top
1040 * object with zeros.
1041 */
1042 if (fs.object != fs.first_object) {
1043 vm_object_pip_wakeup(fs.object);
1044 VM_OBJECT_WUNLOCK(fs.object);
1045
1046 fs.object = fs.first_object;
1047 fs.pindex = fs.first_pindex;
1048 fs.m = fs.first_m;
1049 VM_OBJECT_WLOCK(fs.object);
1050 }
1051 fs.first_m = NULL;
1052
1053 /*
1054 * Zero the page if necessary and mark it valid.
1055 */
1056 if ((fs.m->flags & PG_ZERO) == 0) {
1057 pmap_zero_page(fs.m);
1058 } else {
1059 VM_CNT_INC(v_ozfod);
1060 }
1061 VM_CNT_INC(v_zfod);
1062 fs.m->valid = VM_PAGE_BITS_ALL;
1063 /* Don't try to prefault neighboring pages. */
1064 faultcount = 1;
1065 break; /* break to PAGE HAS BEEN FOUND */
1066 } else {
1067 KASSERT(fs.object != next_object,
1068 ("object loop %p", next_object));
1069 VM_OBJECT_WLOCK(next_object);
1070 vm_object_pip_add(next_object, 1);
1071 if (fs.object != fs.first_object)
1072 vm_object_pip_wakeup(fs.object);
1073 fs.pindex +=
1074 OFF_TO_IDX(fs.object->backing_object_offset);
1075 VM_OBJECT_WUNLOCK(fs.object);
1076 fs.object = next_object;
1077 }
1078 }
1079
1080 vm_page_assert_xbusied(fs.m);
1081
1082 /*
1083 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
1084 * is held.]
1085 */
1086
1087 /*
1088 * If the page is being written, but isn't already owned by the
1089 * top-level object, we have to copy it into a new page owned by the
1090 * top-level object.
1091 */
1092 if (fs.object != fs.first_object) {
1093 /*
1094 * We only really need to copy if we want to write it.
1095 */
1096 if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
1097 /*
1098 * This allows pages to be virtually copied from a
1099 * backing_object into the first_object, where the
1100 * backing object has no other refs to it, and cannot
1101 * gain any more refs. Instead of a bcopy, we just
1102 * move the page from the backing object to the
1103 * first object. Note that we must mark the page
1104 * dirty in the first object so that it will go out
1105 * to swap when needed.
1106 */
1107 is_first_object_locked = false;
1108 if (
1109 /*
1110 * Only one shadow object
1111 */
1112 (fs.object->shadow_count == 1) &&
1113 /*
1114 * No COW refs, except us
1115 */
1116 (fs.object->ref_count == 1) &&
1117 /*
1118 * No one else can look this object up
1119 */
1120 (fs.object->handle == NULL) &&
1121 /*
1122 * No other ways to look the object up
1123 */
1124 ((fs.object->type == OBJT_DEFAULT) ||
1125 (fs.object->type == OBJT_SWAP)) &&
1126 (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
1127 /*
1128 * We don't chase down the shadow chain
1129 */
1130 fs.object == fs.first_object->backing_object) {
1131 vm_page_lock(fs.m);
1132 vm_page_dequeue(fs.m);
1133 vm_page_remove(fs.m);
1134 vm_page_unlock(fs.m);
1135 vm_page_lock(fs.first_m);
1136 vm_page_replace_checked(fs.m, fs.first_object,
1137 fs.first_pindex, fs.first_m);
1138 vm_page_free(fs.first_m);
1139 vm_page_unlock(fs.first_m);
1140 vm_page_dirty(fs.m);
1141 #if VM_NRESERVLEVEL > 0
1142 /*
1143 * Rename the reservation.
1144 */
1145 vm_reserv_rename(fs.m, fs.first_object,
1146 fs.object, OFF_TO_IDX(
1147 fs.first_object->backing_object_offset));
1148 #endif
1149 /*
1150 * Removing the page from the backing object
1151 * unbusied it.
1152 */
1153 vm_page_xbusy(fs.m);
1154 fs.first_m = fs.m;
1155 fs.m = NULL;
1156 VM_CNT_INC(v_cow_optim);
1157 } else {
1158 /*
1159 * Oh, well, lets copy it.
1160 */
1161 pmap_copy_page(fs.m, fs.first_m);
1162 fs.first_m->valid = VM_PAGE_BITS_ALL;
1163 if (wired && (fault_flags &
1164 VM_FAULT_WIRE) == 0) {
1165 vm_page_lock(fs.first_m);
1166 vm_page_wire(fs.first_m);
1167 vm_page_unlock(fs.first_m);
1168
1169 vm_page_lock(fs.m);
1170 vm_page_unwire(fs.m, PQ_INACTIVE);
1171 vm_page_unlock(fs.m);
1172 }
1173 /*
1174 * We no longer need the old page or object.
1175 */
1176 release_page(&fs);
1177 }
1178 /*
1179 * fs.object != fs.first_object due to above
1180 * conditional
1181 */
1182 vm_object_pip_wakeup(fs.object);
1183 VM_OBJECT_WUNLOCK(fs.object);
1184 /*
1185 * Only use the new page below...
1186 */
1187 fs.object = fs.first_object;
1188 fs.pindex = fs.first_pindex;
1189 fs.m = fs.first_m;
1190 if (!is_first_object_locked)
1191 VM_OBJECT_WLOCK(fs.object);
1192 VM_CNT_INC(v_cow_faults);
1193 curthread->td_cow++;
1194 } else {
1195 prot &= ~VM_PROT_WRITE;
1196 }
1197 }
1198
1199 /*
1200 * We must verify that the maps have not changed since our last
1201 * lookup.
1202 */
1203 if (!fs.lookup_still_valid) {
1204 if (!vm_map_trylock_read(fs.map)) {
1205 release_page(&fs);
1206 unlock_and_deallocate(&fs);
1207 goto RetryFault;
1208 }
1209 fs.lookup_still_valid = true;
1210 if (fs.map->timestamp != fs.map_generation) {
1211 result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
1212 &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
1213
1214 /*
1215 * If we don't need the page any longer, put it on the inactive
1216 * list (the easiest thing to do here). If no one needs it,
1217 * pageout will grab it eventually.
1218 */
1219 if (result != KERN_SUCCESS) {
1220 release_page(&fs);
1221 unlock_and_deallocate(&fs);
1222
1223 /*
1224 * If retry of map lookup would have blocked then
1225 * retry fault from start.
1226 */
1227 if (result == KERN_FAILURE)
1228 goto RetryFault;
1229 return (result);
1230 }
1231 if ((retry_object != fs.first_object) ||
1232 (retry_pindex != fs.first_pindex)) {
1233 release_page(&fs);
1234 unlock_and_deallocate(&fs);
1235 goto RetryFault;
1236 }
1237
1238 /*
1239 * Check whether the protection has changed or the object has
1240 * been copied while we left the map unlocked. Changing from
1241 * read to write permission is OK - we leave the page
1242 * write-protected, and catch the write fault. Changing from
1243 * write to read permission means that we can't mark the page
1244 * write-enabled after all.
1245 */
1246 prot &= retry_prot;
1247 fault_type &= retry_prot;
1248 if (prot == 0) {
1249 release_page(&fs);
1250 unlock_and_deallocate(&fs);
1251 goto RetryFault;
1252 }
1253
1254 /* Reassert because wired may have changed. */
1255 KASSERT(wired || (fault_flags & VM_FAULT_WIRE) == 0,
1256 ("!wired && VM_FAULT_WIRE"));
1257 }
1258 }
1259
1260 /*
1261 * If the page was filled by a pager, save the virtual address that
1262 * should be faulted on next under a sequential access pattern to the
1263 * map entry. A read lock on the map suffices to update this address
1264 * safely.
1265 */
1266 if (hardfault)
1267 fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
1268
1269 vm_fault_dirty(fs.entry, fs.m, prot, fault_type, fault_flags, true);
1270 vm_page_assert_xbusied(fs.m);
1271
1272 /*
1273 * Page must be completely valid or it is not fit to
1274 * map into user space. vm_pager_get_pages() ensures this.
1275 */
1276 KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
1277 ("vm_fault: page %p partially invalid", fs.m));
1278 VM_OBJECT_WUNLOCK(fs.object);
1279
1280 /*
1281 * Put this page into the physical map. We had to do the unlock above
1282 * because pmap_enter() may sleep. We don't put the page
1283 * back on the active queue until later so that the pageout daemon
1284 * won't find it (yet).
1285 */
1286 pmap_enter(fs.map->pmap, vaddr, fs.m, prot,
1287 fault_type | (wired ? PMAP_ENTER_WIRED : 0), 0);
1288 if (faultcount != 1 && (fault_flags & VM_FAULT_WIRE) == 0 &&
1289 wired == 0)
1290 vm_fault_prefault(&fs, vaddr,
1291 faultcount > 0 ? behind : PFBAK,
1292 faultcount > 0 ? ahead : PFFOR, false);
1293 VM_OBJECT_WLOCK(fs.object);
1294 vm_page_lock(fs.m);
1295
1296 /*
1297 * If the page is not wired down, then put it where the pageout daemon
1298 * can find it.
1299 */
1300 if ((fault_flags & VM_FAULT_WIRE) != 0)
1301 vm_page_wire(fs.m);
1302 else
1303 vm_page_activate(fs.m);
1304 if (m_hold != NULL) {
1305 *m_hold = fs.m;
1306 vm_page_hold(fs.m);
1307 }
1308 vm_page_unlock(fs.m);
1309 vm_page_xunbusy(fs.m);
1310
1311 /*
1312 * Unlock everything, and return
1313 */
1314 unlock_and_deallocate(&fs);
1315 if (hardfault) {
1316 VM_CNT_INC(v_io_faults);
1317 curthread->td_ru.ru_majflt++;
1318 #ifdef RACCT
1319 if (racct_enable && fs.object->type == OBJT_VNODE) {
1320 PROC_LOCK(curproc);
1321 if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
1322 racct_add_force(curproc, RACCT_WRITEBPS,
1323 PAGE_SIZE + behind * PAGE_SIZE);
1324 racct_add_force(curproc, RACCT_WRITEIOPS, 1);
1325 } else {
1326 racct_add_force(curproc, RACCT_READBPS,
1327 PAGE_SIZE + ahead * PAGE_SIZE);
1328 racct_add_force(curproc, RACCT_READIOPS, 1);
1329 }
1330 PROC_UNLOCK(curproc);
1331 }
1332 #endif
1333 } else
1334 curthread->td_ru.ru_minflt++;
1335
1336 return (KERN_SUCCESS);
1337 }
1338
1339 /*
1340 * Speed up the reclamation of pages that precede the faulting pindex within
1341 * the first object of the shadow chain. Essentially, perform the equivalent
1342 * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
1343 * the faulting pindex by the cluster size when the pages read by vm_fault()
1344 * cross a cluster-size boundary. The cluster size is the greater of the
1345 * smallest superpage size and VM_FAULT_DONTNEED_MIN.
1346 *
1347 * When "fs->first_object" is a shadow object, the pages in the backing object
1348 * that precede the faulting pindex are deactivated by vm_fault(). So, this
1349 * function must only be concerned with pages in the first object.
1350 */
1351 static void
1352 vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
1353 {
1354 vm_map_entry_t entry;
1355 vm_object_t first_object, object;
1356 vm_offset_t end, start;
1357 vm_page_t m, m_next;
1358 vm_pindex_t pend, pstart;
1359 vm_size_t size;
1360
1361 object = fs->object;
1362 VM_OBJECT_ASSERT_WLOCKED(object);
1363 first_object = fs->first_object;
1364 if (first_object != object) {
1365 if (!VM_OBJECT_TRYWLOCK(first_object)) {
1366 VM_OBJECT_WUNLOCK(object);
1367 VM_OBJECT_WLOCK(first_object);
1368 VM_OBJECT_WLOCK(object);
1369 }
1370 }
1371 /* Neither fictitious nor unmanaged pages can be reclaimed. */
1372 if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
1373 size = VM_FAULT_DONTNEED_MIN;
1374 if (MAXPAGESIZES > 1 && size < pagesizes[1])
1375 size = pagesizes[1];
1376 end = rounddown2(vaddr, size);
1377 if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
1378 (entry = fs->entry)->start < end) {
1379 if (end - entry->start < size)
1380 start = entry->start;
1381 else
1382 start = end - size;
1383 pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
1384 pstart = OFF_TO_IDX(entry->offset) + atop(start -
1385 entry->start);
1386 m_next = vm_page_find_least(first_object, pstart);
1387 pend = OFF_TO_IDX(entry->offset) + atop(end -
1388 entry->start);
1389 while ((m = m_next) != NULL && m->pindex < pend) {
1390 m_next = TAILQ_NEXT(m, listq);
1391 if (m->valid != VM_PAGE_BITS_ALL ||
1392 vm_page_busied(m))
1393 continue;
1394
1395 /*
1396 * Don't clear PGA_REFERENCED, since it would
1397 * likely represent a reference by a different
1398 * process.
1399 *
1400 * Typically, at this point, prefetched pages
1401 * are still in the inactive queue. Only
1402 * pages that triggered page faults are in the
1403 * active queue.
1404 */
1405 vm_page_lock(m);
1406 if (!vm_page_inactive(m))
1407 vm_page_deactivate(m);
1408 vm_page_unlock(m);
1409 }
1410 }
1411 }
1412 if (first_object != object)
1413 VM_OBJECT_WUNLOCK(first_object);
1414 }
1415
1416 /*
1417 * vm_fault_prefault provides a quick way of clustering
1418  * page faults into a process's address space. It is a "cousin"
1419 * of vm_map_pmap_enter, except it runs at page fault time instead
1420 * of mmap time.
1421 */
1422 static void
1423 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
1424 int backward, int forward, bool obj_locked)
1425 {
1426 pmap_t pmap;
1427 vm_map_entry_t entry;
1428 vm_object_t backing_object, lobject;
1429 vm_offset_t addr, starta;
1430 vm_pindex_t pindex;
1431 vm_page_t m;
1432 int i;
1433
1434 pmap = fs->map->pmap;
1435 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
1436 return;
1437
1438 entry = fs->entry;
1439
1440 if (addra < backward * PAGE_SIZE) {
1441 starta = entry->start;
1442 } else {
1443 starta = addra - backward * PAGE_SIZE;
1444 if (starta < entry->start)
1445 starta = entry->start;
1446 }
1447
1448 /*
1449 * Generate the sequence of virtual addresses that are candidates for
1450 * prefaulting in an outward spiral from the faulting virtual address,
1451 * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra
1452 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
1453 * If the candidate address doesn't have a backing physical page, then
1454 * the loop immediately terminates.
1455 */
1456 for (i = 0; i < 2 * imax(backward, forward); i++) {
1457 addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
1458 PAGE_SIZE);
1459 if (addr > addra + forward * PAGE_SIZE)
1460 addr = 0;
1461
1462 if (addr < starta || addr >= entry->end)
1463 continue;
1464
1465 if (!pmap_is_prefaultable(pmap, addr))
1466 continue;
1467
1468 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
1469 lobject = entry->object.vm_object;
1470 if (!obj_locked)
1471 VM_OBJECT_RLOCK(lobject);
1472 while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
1473 lobject->type == OBJT_DEFAULT &&
1474 (backing_object = lobject->backing_object) != NULL) {
1475 KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
1476 0, ("vm_fault_prefault: unaligned object offset"));
1477 pindex += lobject->backing_object_offset >> PAGE_SHIFT;
1478 VM_OBJECT_RLOCK(backing_object);
1479 if (!obj_locked || lobject != entry->object.vm_object)
1480 VM_OBJECT_RUNLOCK(lobject);
1481 lobject = backing_object;
1482 }
1483 if (m == NULL) {
1484 if (!obj_locked || lobject != entry->object.vm_object)
1485 VM_OBJECT_RUNLOCK(lobject);
1486 break;
1487 }
1488 if (m->valid == VM_PAGE_BITS_ALL &&
1489 (m->flags & PG_FICTITIOUS) == 0)
1490 pmap_enter_quick(pmap, addr, m, entry->protection);
1491 if (!obj_locked || lobject != entry->object.vm_object)
1492 VM_OBJECT_RUNLOCK(lobject);
1493 }
1494 }
1495
1496 /*
1497 * Hold each of the physical pages that are mapped by the specified range of
1498 * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
1499 * and allow the specified types of access, "prot". If all of the implied
1500 * pages are successfully held, then the number of held pages is returned
1501 * together with pointers to those pages in the array "ma". However, if any
1502 * of the pages cannot be held, -1 is returned.
1503 */
1504 int
1505 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
1506 vm_prot_t prot, vm_page_t *ma, int max_count)
1507 {
1508 vm_offset_t end, va;
1509 vm_page_t *mp;
1510 int count;
1511 boolean_t pmap_failed;
1512
1513 if (len == 0)
1514 return (0);
1515 end = round_page(addr + len);
1516 addr = trunc_page(addr);
1517
1518 /*
1519 * Check for illegal addresses.
1520 */
1521 if (addr < vm_map_min(map) || addr > end || end > vm_map_max(map))
1522 return (-1);
1523
1524 if (atop(end - addr) > max_count)
1525 panic("vm_fault_quick_hold_pages: count > max_count");
1526 count = atop(end - addr);
1527
1528 /*
1529 * Most likely, the physical pages are resident in the pmap, so it is
1530 * faster to try pmap_extract_and_hold() first.
1531 */
1532 pmap_failed = FALSE;
1533 for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
1534 *mp = pmap_extract_and_hold(map->pmap, va, prot);
1535 if (*mp == NULL)
1536 pmap_failed = TRUE;
1537 else if ((prot & VM_PROT_WRITE) != 0 &&
1538 (*mp)->dirty != VM_PAGE_BITS_ALL) {
1539 /*
1540 * Explicitly dirty the physical page. Otherwise, the
1541 * caller's changes may go unnoticed because they are
1542 * performed through an unmanaged mapping or by a DMA
1543 * operation.
1544 *
1545 * The object lock is not held here.
1546 * See vm_page_clear_dirty_mask().
1547 */
1548 vm_page_dirty(*mp);
1549 }
1550 }
1551 if (pmap_failed) {
1552 /*
1553 * One or more pages could not be held by the pmap. Either no
1554 * page was mapped at the specified virtual address or that
1555 * mapping had insufficient permissions. Attempt to fault in
1556 * and hold these pages.
1557 *
1558 * If vm_fault_disable_pagefaults() was called,
1559 * i.e., TDP_NOFAULTING is set, we must not sleep nor
1560 * acquire MD VM locks, which means we must not call
1561 * vm_fault_hold(). Some (out of tree) callers mark
1562 * too wide a code area with vm_fault_disable_pagefaults()
1563 		 * already; use the VM_PROT_QUICK_NOFAULT flag to request
1564 * the proper behaviour explicitly.
1565 */
1566 if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
1567 (curthread->td_pflags & TDP_NOFAULTING) != 0)
1568 goto error;
1569 for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
1570 if (*mp == NULL && vm_fault_hold(map, va, prot,
1571 VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
1572 goto error;
1573 }
1574 return (count);
1575 error:
1576 for (mp = ma; mp < ma + count; mp++)
1577 if (*mp != NULL) {
1578 vm_page_lock(*mp);
1579 vm_page_unhold(*mp);
1580 vm_page_unlock(*mp);
1581 }
1582 return (-1);
1583 }
1584
1585 /*
1586 * Routine:
1587 * vm_fault_copy_entry
1588 * Function:
1589 * Create new shadow object backing dst_entry with private copy of
1590 * all underlying pages. When src_entry is equal to dst_entry,
1591 * function implements COW for wired-down map entry. Otherwise,
1592 * it forks wired entry into dst_map.
1593 *
1594 * In/out conditions:
1595 * The source and destination maps must be locked for write.
1596 * The source map entry must be wired down (or be a sharing map
1597 * entry corresponding to a main map entry that is wired down).
1598 */
1599 void
1600 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
1601 vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
1602 vm_ooffset_t *fork_charge)
1603 {
1604 vm_object_t backing_object, dst_object, object, src_object;
1605 vm_pindex_t dst_pindex, pindex, src_pindex;
1606 vm_prot_t access, prot;
1607 vm_offset_t vaddr;
1608 vm_page_t dst_m;
1609 vm_page_t src_m;
1610 boolean_t upgrade;
1611
1612 #ifdef lint
1613 src_map++;
1614 #endif /* lint */
1615
1616 upgrade = src_entry == dst_entry;
1617 access = prot = dst_entry->protection;
1618
1619 src_object = src_entry->object.vm_object;
1620 src_pindex = OFF_TO_IDX(src_entry->offset);
1621
1622 if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
1623 dst_object = src_object;
1624 vm_object_reference(dst_object);
1625 } else {
1626 /*
1627 * Create the top-level object for the destination entry. (Doesn't
1628 * actually shadow anything - we copy the pages directly.)
1629 */
1630 dst_object = vm_object_allocate(OBJT_DEFAULT,
1631 atop(dst_entry->end - dst_entry->start));
1632 #if VM_NRESERVLEVEL > 0
1633 dst_object->flags |= OBJ_COLORED;
1634 dst_object->pg_color = atop(dst_entry->start);
1635 #endif
1636 dst_object->domain = src_object->domain;
1637 dst_object->charge = dst_entry->end - dst_entry->start;
1638 }
1639
1640 VM_OBJECT_WLOCK(dst_object);
1641 KASSERT(upgrade || dst_entry->object.vm_object == NULL,
1642 ("vm_fault_copy_entry: vm_object not NULL"));
1643 if (src_object != dst_object) {
1644 dst_entry->object.vm_object = dst_object;
1645 dst_entry->offset = 0;
1646 }
1647 if (fork_charge != NULL) {
1648 KASSERT(dst_entry->cred == NULL,
1649 ("vm_fault_copy_entry: leaked swp charge"));
1650 dst_object->cred = curthread->td_ucred;
1651 crhold(dst_object->cred);
1652 *fork_charge += dst_object->charge;
1653 } else if ((dst_object->type == OBJT_DEFAULT ||
1654 dst_object->type == OBJT_SWAP) &&
1655 dst_object->cred == NULL) {
1656 KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
1657 dst_entry));
1658 dst_object->cred = dst_entry->cred;
1659 dst_entry->cred = NULL;
1660 }
1661
1662 /*
1663 * If not an upgrade, then enter the mappings in the pmap as
1664 * read and/or execute accesses. Otherwise, enter them as
1665 * write accesses.
1666 *
1667 * A writeable large page mapping is only created if all of
1668 * the constituent small page mappings are modified. Marking
1669 * PTEs as modified on inception allows promotion to happen
1670 	 * without taking a potentially large number of soft faults.
1671 */
1672 if (!upgrade)
1673 access &= ~VM_PROT_WRITE;
1674
1675 /*
1676 * Loop through all of the virtual pages within the entry's
1677 * range, copying each page from the source object to the
1678 * destination object. Since the source is wired, those pages
1679 * must exist. In contrast, the destination is pageable.
1680 * Since the destination object doesn't share any backing storage
1681 * with the source object, all of its pages must be dirtied,
1682 * regardless of whether they can be written.
1683 */
1684 for (vaddr = dst_entry->start, dst_pindex = 0;
1685 vaddr < dst_entry->end;
1686 vaddr += PAGE_SIZE, dst_pindex++) {
1687 again:
1688 /*
1689 * Find the page in the source object, and copy it in.
1690 * Because the source is wired down, the page will be
1691 * in memory.
1692 */
1693 if (src_object != dst_object)
1694 VM_OBJECT_RLOCK(src_object);
1695 object = src_object;
1696 pindex = src_pindex + dst_pindex;
1697 while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
1698 (backing_object = object->backing_object) != NULL) {
1699 /*
1700 * Unless the source mapping is read-only or
1701 * it is presently being upgraded from
1702 * read-only, the first object in the shadow
1703 * chain should provide all of the pages. In
1704 * other words, this loop body should never be
1705 * executed when the source mapping is already
1706 * read/write.
1707 */
1708 KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
1709 upgrade,
1710 ("vm_fault_copy_entry: main object missing page"));
1711
1712 VM_OBJECT_RLOCK(backing_object);
1713 pindex += OFF_TO_IDX(object->backing_object_offset);
1714 if (object != dst_object)
1715 VM_OBJECT_RUNLOCK(object);
1716 object = backing_object;
1717 }
1718 KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
1719
1720 if (object != dst_object) {
1721 /*
1722 * Allocate a page in the destination object.
1723 */
1724 dst_m = vm_page_alloc(dst_object, (src_object ==
1725 dst_object ? src_pindex : 0) + dst_pindex,
1726 VM_ALLOC_NORMAL);
1727 if (dst_m == NULL) {
1728 VM_OBJECT_WUNLOCK(dst_object);
1729 VM_OBJECT_RUNLOCK(object);
1730 vm_wait(dst_object);
1731 VM_OBJECT_WLOCK(dst_object);
1732 goto again;
1733 }
1734 pmap_copy_page(src_m, dst_m);
1735 VM_OBJECT_RUNLOCK(object);
1736 dst_m->valid = VM_PAGE_BITS_ALL;
1737 dst_m->dirty = VM_PAGE_BITS_ALL;
1738 } else {
1739 dst_m = src_m;
1740 if (vm_page_sleep_if_busy(dst_m, "fltupg"))
1741 goto again;
1742 if (dst_m->pindex >= dst_object->size)
1743 /*
1744 				 * We are upgrading. The index can fall
1745 				 * out of bounds if the object type is
1746 				 * vnode and the file was truncated.
1747 */
1748 break;
1749 vm_page_xbusy(dst_m);
1750 KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
1751 ("invalid dst page %p", dst_m));
1752 }
1753 VM_OBJECT_WUNLOCK(dst_object);
1754
1755 /*
1756 * Enter it in the pmap. If a wired, copy-on-write
1757 * mapping is being replaced by a write-enabled
1758 * mapping, then wire that new mapping.
1759 */
1760 pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
1761 access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
1762
1763 /*
1764 * Mark it no longer busy, and put it on the active list.
1765 */
1766 VM_OBJECT_WLOCK(dst_object);
1767
1768 if (upgrade) {
1769 if (src_m != dst_m) {
1770 vm_page_lock(src_m);
1771 vm_page_unwire(src_m, PQ_INACTIVE);
1772 vm_page_unlock(src_m);
1773 vm_page_lock(dst_m);
1774 vm_page_wire(dst_m);
1775 vm_page_unlock(dst_m);
1776 } else {
1777 KASSERT(dst_m->wire_count > 0,
1778 ("dst_m %p is not wired", dst_m));
1779 }
1780 } else {
1781 vm_page_lock(dst_m);
1782 vm_page_activate(dst_m);
1783 vm_page_unlock(dst_m);
1784 }
1785 vm_page_xunbusy(dst_m);
1786 }
1787 VM_OBJECT_WUNLOCK(dst_object);
1788 if (upgrade) {
1789 dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
1790 vm_object_deallocate(src_object);
1791 }
1792 }
1793
1794 /*
1795 * Block entry into the machine-independent layer's page fault handler by
1796 * the calling thread. Subsequent calls to vm_fault() by that thread will
1797 * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of
1798 * spurious page faults.
1799 */
1800 int
1801 vm_fault_disable_pagefaults(void)
1802 {
1803
1804 return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
1805 }
1806
1807 void
1808 vm_fault_enable_pagefaults(int save)
1809 {
1810
1811 curthread_pflags_restore(save);
1812 }