sys/vm/vm_fault.c
1 /*-
2 * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 1994 John S. Dyson
7 * All rights reserved.
8 * Copyright (c) 1994 David Greenman
9 * All rights reserved.
10 *
11 *
12 * This code is derived from software contributed to Berkeley by
13 * The Mach Operating System project at Carnegie-Mellon University.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94
44 *
45 *
46 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
47 * All rights reserved.
48 *
49 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
50 *
51 * Permission to use, copy, modify and distribute this software and
52 * its documentation is hereby granted, provided that both the copyright
53 * notice and this permission notice appear in all copies of the
54 * software, derivative works or modified versions, and any portions
55 * thereof, and that both notices appear in supporting documentation.
56 *
57 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
58 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
59 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
60 *
61 * Carnegie Mellon requests users of this software to return to
62 *
63 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
64 * School of Computer Science
65 * Carnegie Mellon University
66 * Pittsburgh PA 15213-3890
67 *
68 * any improvements or extensions that they make and grant Carnegie the
69 * rights to redistribute these changes.
70 */
71
72 /*
73 * Page fault handling module.
74 */
75
76 #include <sys/cdefs.h>
77 __FBSDID("$FreeBSD$");
78
79 #include "opt_ktrace.h"
80 #include "opt_vm.h"
81
82 #include <sys/param.h>
83 #include <sys/systm.h>
84 #include <sys/kernel.h>
85 #include <sys/lock.h>
86 #include <sys/mman.h>
87 #include <sys/mutex.h>
88 #include <sys/pctrie.h>
89 #include <sys/proc.h>
90 #include <sys/racct.h>
91 #include <sys/refcount.h>
92 #include <sys/resourcevar.h>
93 #include <sys/rwlock.h>
94 #include <sys/signalvar.h>
95 #include <sys/sysctl.h>
96 #include <sys/sysent.h>
97 #include <sys/vmmeter.h>
98 #include <sys/vnode.h>
99 #ifdef KTRACE
100 #include <sys/ktrace.h>
101 #endif
102
103 #include <vm/vm.h>
104 #include <vm/vm_param.h>
105 #include <vm/pmap.h>
106 #include <vm/vm_map.h>
107 #include <vm/vm_object.h>
108 #include <vm/vm_page.h>
109 #include <vm/vm_pageout.h>
110 #include <vm/vm_kern.h>
111 #include <vm/vm_pager.h>
112 #include <vm/vm_extern.h>
113 #include <vm/vm_reserv.h>
114
115 #define PFBAK 4
116 #define PFFOR 4
117
118 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT)
119
120 #define VM_FAULT_DONTNEED_MIN 1048576
121
122 struct faultstate {
123 /* Fault parameters. */
124 vm_offset_t vaddr;
125 vm_page_t *m_hold;
126 vm_prot_t fault_type;
127 vm_prot_t prot;
128 int fault_flags;
129 boolean_t wired;
130
131 /* Control state. */
132 struct timeval oom_start_time;
133 bool oom_started;
134 int nera;
135
136 /* Page reference for cow. */
137 vm_page_t m_cow;
138
139 /* Current object. */
140 vm_object_t object;
141 vm_pindex_t pindex;
142 vm_page_t m;
143
144 /* Top-level map object. */
145 vm_object_t first_object;
146 vm_pindex_t first_pindex;
147 vm_page_t first_m;
148
149 /* Map state. */
150 vm_map_t map;
151 vm_map_entry_t entry;
152 int map_generation;
153 bool lookup_still_valid;
154
155 /* Vnode if locked. */
156 struct vnode *vp;
157 };
158
159 /*
160 * Return codes for internal fault routines.
161 */
162 enum fault_status {
163 FAULT_SUCCESS = 1, /* Return success to user. */
164 FAULT_FAILURE, /* Return failure to user. */
165 FAULT_CONTINUE, /* Continue faulting. */
166 FAULT_RESTART, /* Restart fault. */
167 FAULT_OUT_OF_BOUNDS, /* Invalid address for pager. */
168 FAULT_HARD, /* Performed I/O. */
169 FAULT_SOFT, /* Found valid page. */
170 FAULT_PROTECTION_FAILURE, /* Invalid access. */
171 };
172
173 static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr,
174 int ahead);
175 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
176 int backward, int forward, bool obj_locked);
177
178 static int vm_pfault_oom_attempts = 3;
179 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN,
180 &vm_pfault_oom_attempts, 0,
181 "Number of page allocation attempts in page fault handler before it "
182 "triggers OOM handling");
183
184 static int vm_pfault_oom_wait = 10;
185 SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN,
186 &vm_pfault_oom_wait, 0,
187 "Number of seconds to wait for free pages before retrying "
188 "the page fault handler");
189
190 static inline void
191 fault_page_release(vm_page_t *mp)
192 {
193 vm_page_t m;
194
195 m = *mp;
196 if (m != NULL) {
197 /*
198 * We are likely to loop around again and attempt to busy
199 * this page. Deactivating it leaves it available for
200 * pageout while optimizing fault restarts.
201 */
202 vm_page_deactivate(m);
203 vm_page_xunbusy(m);
204 *mp = NULL;
205 }
206 }
207
208 static inline void
209 fault_page_free(vm_page_t *mp)
210 {
211 vm_page_t m;
212
213 m = *mp;
214 if (m != NULL) {
215 VM_OBJECT_ASSERT_WLOCKED(m->object);
216 if (!vm_page_wired(m))
217 vm_page_free(m);
218 else
219 vm_page_xunbusy(m);
220 *mp = NULL;
221 }
222 }
223
224 /*
225 * Return true if a vm_pager_get_pages() call is needed in order to check
226 * whether the pager might have a particular page, false if it can be determined
227  * immediately that the pager cannot have a copy.  For swap objects, this can
228 * be checked quickly.
229 */
230 static inline bool
231 fault_object_needs_getpages(vm_object_t object)
232 {
233 VM_OBJECT_ASSERT_LOCKED(object);
234
235 return ((object->flags & OBJ_SWAP) == 0 ||
236 !pctrie_is_empty(&object->un_pager.swp.swp_blks));
237 }
238
239 static inline void
240 unlock_map(struct faultstate *fs)
241 {
242
243 if (fs->lookup_still_valid) {
244 vm_map_lookup_done(fs->map, fs->entry);
245 fs->lookup_still_valid = false;
246 }
247 }
248
249 static void
250 unlock_vp(struct faultstate *fs)
251 {
252
253 if (fs->vp != NULL) {
254 vput(fs->vp);
255 fs->vp = NULL;
256 }
257 }
258
259 static void
260 fault_deallocate(struct faultstate *fs)
261 {
262
263 fault_page_release(&fs->m_cow);
264 fault_page_release(&fs->m);
265 vm_object_pip_wakeup(fs->object);
266 if (fs->object != fs->first_object) {
267 VM_OBJECT_WLOCK(fs->first_object);
268 fault_page_free(&fs->first_m);
269 VM_OBJECT_WUNLOCK(fs->first_object);
270 vm_object_pip_wakeup(fs->first_object);
271 }
272 vm_object_deallocate(fs->first_object);
273 unlock_map(fs);
274 unlock_vp(fs);
275 }
276
277 static void
278 unlock_and_deallocate(struct faultstate *fs)
279 {
280
281 VM_OBJECT_WUNLOCK(fs->object);
282 fault_deallocate(fs);
283 }
284
285 static void
286 vm_fault_dirty(struct faultstate *fs, vm_page_t m)
287 {
288 bool need_dirty;
289
290 if (((fs->prot & VM_PROT_WRITE) == 0 &&
291 (fs->fault_flags & VM_FAULT_DIRTY) == 0) ||
292 (m->oflags & VPO_UNMANAGED) != 0)
293 return;
294
295 VM_PAGE_OBJECT_BUSY_ASSERT(m);
296
297 need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 &&
298 (fs->fault_flags & VM_FAULT_WIRE) == 0) ||
299 (fs->fault_flags & VM_FAULT_DIRTY) != 0;
300
301 vm_object_set_writeable_dirty(m->object);
302
303 /*
304 * If the fault is a write, we know that this page is being
305 * written NOW so dirty it explicitly to save on
306 * pmap_is_modified() calls later.
307 *
308 * Also, since the page is now dirty, we can possibly tell
309 * the pager to release any swap backing the page.
310 */
311 if (need_dirty && vm_page_set_dirty(m) == 0) {
312 /*
313 * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC
314 * if the page is already dirty to prevent data written with
315 * the expectation of being synced from not being synced.
316 * Likewise if this entry does not request NOSYNC then make
317 * sure the page isn't marked NOSYNC. Applications sharing
318 * data should use the same flags to avoid ping ponging.
319 */
320 if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0)
321 vm_page_aflag_set(m, PGA_NOSYNC);
322 else
323 vm_page_aflag_clear(m, PGA_NOSYNC);
324 }
325
326 }
327
328 /*
329 * Unlocks fs.first_object and fs.map on success.
330 */
331 static enum fault_status
332 vm_fault_soft_fast(struct faultstate *fs)
333 {
334 vm_page_t m, m_map;
335 #if VM_NRESERVLEVEL > 0
336 vm_page_t m_super;
337 int flags;
338 #endif
339 int psind;
340 vm_offset_t vaddr;
341 enum fault_status res;
342
343 MPASS(fs->vp == NULL);
344
345 res = FAULT_SUCCESS;
346 vaddr = fs->vaddr;
347 vm_object_busy(fs->first_object);
348 m = vm_page_lookup(fs->first_object, fs->first_pindex);
349 /* A busy page can be mapped for read|execute access. */
350 if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 &&
351 vm_page_busied(m)) || !vm_page_all_valid(m)) {
352 res = FAULT_FAILURE;
353 goto out;
354 }
355 m_map = m;
356 psind = 0;
357 #if VM_NRESERVLEVEL > 0
358 if ((m->flags & PG_FICTITIOUS) == 0 &&
359 (m_super = vm_reserv_to_superpage(m)) != NULL &&
360 rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start &&
361 roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end &&
362 (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) &
363 (pagesizes[m_super->psind] - 1)) && !fs->wired &&
364 pmap_ps_enabled(fs->map->pmap)) {
365 flags = PS_ALL_VALID;
366 if ((fs->prot & VM_PROT_WRITE) != 0) {
367 /*
368 * Create a superpage mapping allowing write access
369 * only if none of the constituent pages are busy and
370 * all of them are already dirty (except possibly for
371 * the page that was faulted on).
372 */
373 flags |= PS_NONE_BUSY;
374 if ((fs->first_object->flags & OBJ_UNMANAGED) == 0)
375 flags |= PS_ALL_DIRTY;
376 }
377 if (vm_page_ps_test(m_super, flags, m)) {
378 m_map = m_super;
379 psind = m_super->psind;
380 vaddr = rounddown2(vaddr, pagesizes[psind]);
381 /* Preset the modified bit for dirty superpages. */
382 if ((flags & PS_ALL_DIRTY) != 0)
383 fs->fault_type |= VM_PROT_WRITE;
384 }
385 }
386 #endif
387 if (pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type |
388 PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind) !=
389 KERN_SUCCESS) {
390 res = FAULT_FAILURE;
391 goto out;
392 }
393 if (fs->m_hold != NULL) {
394 (*fs->m_hold) = m;
395 vm_page_wire(m);
396 }
397 if (psind == 0 && !fs->wired)
398 vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true);
399 VM_OBJECT_RUNLOCK(fs->first_object);
400 vm_fault_dirty(fs, m);
401 vm_map_lookup_done(fs->map, fs->entry);
402 curthread->td_ru.ru_minflt++;
403
404 out:
405 vm_object_unbusy(fs->first_object);
406 return (res);
407 }
408
409 static void
410 vm_fault_restore_map_lock(struct faultstate *fs)
411 {
412
413 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
414 MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
415
416 if (!vm_map_trylock_read(fs->map)) {
417 VM_OBJECT_WUNLOCK(fs->first_object);
418 vm_map_lock_read(fs->map);
419 VM_OBJECT_WLOCK(fs->first_object);
420 }
421 fs->lookup_still_valid = true;
422 }
423
424 static void
425 vm_fault_populate_check_page(vm_page_t m)
426 {
427
428 /*
429 * Check each page to ensure that the pager is obeying the
430 * interface: the page must be installed in the object, fully
431 * valid, and exclusively busied.
432 */
433 MPASS(m != NULL);
434 MPASS(vm_page_all_valid(m));
435 MPASS(vm_page_xbusied(m));
436 }
437
438 static void
439 vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first,
440 vm_pindex_t last)
441 {
442 vm_page_t m;
443 vm_pindex_t pidx;
444
445 VM_OBJECT_ASSERT_WLOCKED(object);
446 MPASS(first <= last);
447 for (pidx = first, m = vm_page_lookup(object, pidx);
448 pidx <= last; pidx++, m = vm_page_next(m)) {
449 vm_fault_populate_check_page(m);
450 vm_page_deactivate(m);
451 vm_page_xunbusy(m);
452 }
453 }
454
455 static enum fault_status
456 vm_fault_populate(struct faultstate *fs)
457 {
458 vm_offset_t vaddr;
459 vm_page_t m;
460 vm_pindex_t map_first, map_last, pager_first, pager_last, pidx;
461 int bdry_idx, i, npages, psind, rv;
462 enum fault_status res;
463
464 MPASS(fs->object == fs->first_object);
465 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
466 MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0);
467 MPASS(fs->first_object->backing_object == NULL);
468 MPASS(fs->lookup_still_valid);
469
470 pager_first = OFF_TO_IDX(fs->entry->offset);
471 pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1;
472 unlock_map(fs);
473 unlock_vp(fs);
474
475 res = FAULT_SUCCESS;
476
477 /*
478 * Call the pager (driver) populate() method.
479 *
480 * There is no guarantee that the method will be called again
481 * if the current fault is for read, and a future fault is
482 * for write. Report the entry's maximum allowed protection
483 * to the driver.
484 */
485 rv = vm_pager_populate(fs->first_object, fs->first_pindex,
486 fs->fault_type, fs->entry->max_protection, &pager_first,
487 &pager_last);
488
489 VM_OBJECT_ASSERT_WLOCKED(fs->first_object);
490 if (rv == VM_PAGER_BAD) {
491 /*
492 * VM_PAGER_BAD is the backdoor for a pager to request
493 * normal fault handling.
494 */
495 vm_fault_restore_map_lock(fs);
496 if (fs->map->timestamp != fs->map_generation)
497 return (FAULT_RESTART);
498 return (FAULT_CONTINUE);
499 }
500 if (rv != VM_PAGER_OK)
501 return (FAULT_FAILURE); /* AKA SIGSEGV */
502
503 /* Ensure that the driver is obeying the interface. */
504 MPASS(pager_first <= pager_last);
505 MPASS(fs->first_pindex <= pager_last);
506 MPASS(fs->first_pindex >= pager_first);
507 MPASS(pager_last < fs->first_object->size);
508
509 vm_fault_restore_map_lock(fs);
510 bdry_idx = (fs->entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
511 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
512 if (fs->map->timestamp != fs->map_generation) {
513 if (bdry_idx == 0) {
514 vm_fault_populate_cleanup(fs->first_object, pager_first,
515 pager_last);
516 } else {
517 m = vm_page_lookup(fs->first_object, pager_first);
518 if (m != fs->m)
519 vm_page_xunbusy(m);
520 }
521 return (FAULT_RESTART);
522 }
523
524 /*
525 * The map is unchanged after our last unlock. Process the fault.
526 *
527 * First, the special case of largepage mappings, where
528  * populate only busies the first page in the superpage run.
529 */
530 if (bdry_idx != 0) {
531 KASSERT(PMAP_HAS_LARGEPAGES,
532 ("missing pmap support for large pages"));
533 m = vm_page_lookup(fs->first_object, pager_first);
534 vm_fault_populate_check_page(m);
535 VM_OBJECT_WUNLOCK(fs->first_object);
536 vaddr = fs->entry->start + IDX_TO_OFF(pager_first) -
537 fs->entry->offset;
538 /* assert alignment for entry */
539 KASSERT((vaddr & (pagesizes[bdry_idx] - 1)) == 0,
540 ("unaligned superpage start %#jx pager_first %#jx offset %#jx vaddr %#jx",
541 (uintmax_t)fs->entry->start, (uintmax_t)pager_first,
542 (uintmax_t)fs->entry->offset, (uintmax_t)vaddr));
543 KASSERT((VM_PAGE_TO_PHYS(m) & (pagesizes[bdry_idx] - 1)) == 0,
544 ("unaligned superpage m %p %#jx", m,
545 (uintmax_t)VM_PAGE_TO_PHYS(m)));
546 rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot,
547 fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0) |
548 PMAP_ENTER_LARGEPAGE, bdry_idx);
549 VM_OBJECT_WLOCK(fs->first_object);
550 vm_page_xunbusy(m);
551 if (rv != KERN_SUCCESS) {
552 res = FAULT_FAILURE;
553 goto out;
554 }
555 if ((fs->fault_flags & VM_FAULT_WIRE) != 0) {
556 for (i = 0; i < atop(pagesizes[bdry_idx]); i++)
557 vm_page_wire(m + i);
558 }
559 if (fs->m_hold != NULL) {
560 *fs->m_hold = m + (fs->first_pindex - pager_first);
561 vm_page_wire(*fs->m_hold);
562 }
563 goto out;
564 }
565
566 /*
567 * The range [pager_first, pager_last] that is given to the
568 * pager is only a hint. The pager may populate any range
569 * within the object that includes the requested page index.
570 * In case the pager expanded the range, clip it to fit into
571 * the map entry.
572 */
573 map_first = OFF_TO_IDX(fs->entry->offset);
574 if (map_first > pager_first) {
575 vm_fault_populate_cleanup(fs->first_object, pager_first,
576 map_first - 1);
577 pager_first = map_first;
578 }
579 map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1;
580 if (map_last < pager_last) {
581 vm_fault_populate_cleanup(fs->first_object, map_last + 1,
582 pager_last);
583 pager_last = map_last;
584 }
585 for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx);
586 pidx <= pager_last;
587 pidx += npages, m = vm_page_next(&m[npages - 1])) {
588 vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset;
589
590 psind = m->psind;
591 if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 ||
592 pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last ||
593 !pmap_ps_enabled(fs->map->pmap) || fs->wired))
594 psind = 0;
595
596 npages = atop(pagesizes[psind]);
597 for (i = 0; i < npages; i++) {
598 vm_fault_populate_check_page(&m[i]);
599 vm_fault_dirty(fs, &m[i]);
600 }
601 VM_OBJECT_WUNLOCK(fs->first_object);
602 rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type |
603 (fs->wired ? PMAP_ENTER_WIRED : 0), psind);
604
605 /*
606 * pmap_enter() may fail for a superpage mapping if additional
607 * protection policies prevent the full mapping.
608 * For example, this will happen on amd64 if the entire
609 * address range does not share the same userspace protection
610 * key. Revert to single-page mappings if this happens.
611 */
612 MPASS(rv == KERN_SUCCESS ||
613 (psind > 0 && rv == KERN_PROTECTION_FAILURE));
614 if (__predict_false(psind > 0 &&
615 rv == KERN_PROTECTION_FAILURE)) {
616 MPASS(!fs->wired);
617 for (i = 0; i < npages; i++) {
618 rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
619 &m[i], fs->prot, fs->fault_type, 0);
620 MPASS(rv == KERN_SUCCESS);
621 }
622 }
623
624 VM_OBJECT_WLOCK(fs->first_object);
625 for (i = 0; i < npages; i++) {
626 if ((fs->fault_flags & VM_FAULT_WIRE) != 0 &&
627 m[i].pindex == fs->first_pindex)
628 vm_page_wire(&m[i]);
629 else
630 vm_page_activate(&m[i]);
631 if (fs->m_hold != NULL &&
632 m[i].pindex == fs->first_pindex) {
633 (*fs->m_hold) = &m[i];
634 vm_page_wire(&m[i]);
635 }
636 vm_page_xunbusy(&m[i]);
637 }
638 }
639 out:
640 curthread->td_ru.ru_majflt++;
641 return (res);
642 }
643
644 static int prot_fault_translation;
645 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
646 &prot_fault_translation, 0,
647 "Control signal to deliver on protection fault");
648
649 /* compat definition to keep common code for signal translation */
650 #define UCODE_PAGEFLT 12
651 #ifdef T_PAGEFLT
652 _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT");
653 #endif
654
655 /*
656 * vm_fault_trap:
657 *
658 * Handle a page fault occurring at the given address,
659 * requiring the given permissions, in the map specified.
660 * If successful, the page is inserted into the
661 * associated physical map.
662 *
663 * NOTE: the given address should be truncated to the
664 * proper page address.
665 *
666 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
667 * a standard error specifying why the fault is fatal is returned.
668 *
669 * The map in question must be referenced, and remains so.
670 * Caller may hold no locks.
671 */
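/*
 * A minimal sketch of how a machine-dependent trap handler typically consumes
 * vm_fault_trap()'s outputs (illustrative only, not part of this file; "map",
 * "va" and "ftype" are placeholders and the trap frame details vary by
 * architecture):
 *
 *	ksiginfo_t ksi;
 *	int sig, ucode;
 *
 *	if (vm_fault_trap(map, va, ftype, 0, &sig, &ucode) != KERN_SUCCESS) {
 *		ksiginfo_init_trap(&ksi);
 *		ksi.ksi_signo = sig;
 *		ksi.ksi_code = ucode;
 *		ksi.ksi_addr = (void *)va;
 *		trapsignal(curthread, &ksi);
 *	}
 */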
672 int
673 vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
674 int fault_flags, int *signo, int *ucode)
675 {
676 int result;
677
678 MPASS(signo == NULL || ucode != NULL);
679 #ifdef KTRACE
680 if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT))
681 ktrfault(vaddr, fault_type);
682 #endif
683 result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags,
684 NULL);
685 KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE ||
686 result == KERN_INVALID_ADDRESS ||
687 result == KERN_RESOURCE_SHORTAGE ||
688 result == KERN_PROTECTION_FAILURE ||
689 result == KERN_OUT_OF_BOUNDS,
690 ("Unexpected Mach error %d from vm_fault()", result));
691 #ifdef KTRACE
692 if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND))
693 ktrfaultend(result);
694 #endif
695 if (result != KERN_SUCCESS && signo != NULL) {
696 switch (result) {
697 case KERN_FAILURE:
698 case KERN_INVALID_ADDRESS:
699 *signo = SIGSEGV;
700 *ucode = SEGV_MAPERR;
701 break;
702 case KERN_RESOURCE_SHORTAGE:
703 *signo = SIGBUS;
704 *ucode = BUS_OOMERR;
705 break;
706 case KERN_OUT_OF_BOUNDS:
707 *signo = SIGBUS;
708 *ucode = BUS_OBJERR;
709 break;
710 case KERN_PROTECTION_FAILURE:
711 if (prot_fault_translation == 0) {
712 /*
713 * Autodetect. This check also covers
714  * images without the ABI-tag ELF
715 * note.
716 */
717 if (SV_CURPROC_ABI() == SV_ABI_FREEBSD &&
718 curproc->p_osrel >= P_OSREL_SIGSEGV) {
719 *signo = SIGSEGV;
720 *ucode = SEGV_ACCERR;
721 } else {
722 *signo = SIGBUS;
723 *ucode = UCODE_PAGEFLT;
724 }
725 } else if (prot_fault_translation == 1) {
726 /* Always compat mode. */
727 *signo = SIGBUS;
728 *ucode = UCODE_PAGEFLT;
729 } else {
730 /* Always SIGSEGV mode. */
731 *signo = SIGSEGV;
732 *ucode = SEGV_ACCERR;
733 }
734 break;
735 default:
736 KASSERT(0, ("Unexpected Mach error %d from vm_fault()",
737 result));
738 break;
739 }
740 }
741 return (result);
742 }
743
744 static enum fault_status
745 vm_fault_lock_vnode(struct faultstate *fs, bool objlocked)
746 {
747 struct vnode *vp;
748 int error, locked;
749
750 if (fs->object->type != OBJT_VNODE)
751 return (FAULT_CONTINUE);
752 vp = fs->object->handle;
753 if (vp == fs->vp) {
754 ASSERT_VOP_LOCKED(vp, "saved vnode is not locked");
755 return (FAULT_CONTINUE);
756 }
757
758 /*
759 * Perform an unlock in case the desired vnode changed while
760 * the map was unlocked during a retry.
761 */
762 unlock_vp(fs);
763
764 locked = VOP_ISLOCKED(vp);
765 if (locked != LK_EXCLUSIVE)
766 locked = LK_SHARED;
767
768 /*
769 * We must not sleep acquiring the vnode lock while we have
770 * the page exclusive busied or the object's
771 * paging-in-progress count incremented. Otherwise, we could
772 * deadlock.
773 */
774 error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT);
775 if (error == 0) {
776 fs->vp = vp;
777 return (FAULT_CONTINUE);
778 }
779
780 vhold(vp);
781 if (objlocked)
782 unlock_and_deallocate(fs);
783 else
784 fault_deallocate(fs);
785 error = vget(vp, locked | LK_RETRY | LK_CANRECURSE);
786 vdrop(vp);
787 fs->vp = vp;
788 KASSERT(error == 0, ("vm_fault: vget failed %d", error));
789 return (FAULT_RESTART);
790 }
791
792 /*
793 * Calculate the desired readahead. Handle drop-behind.
794 *
795 * Returns the number of readahead blocks to pass to the pager.
796 */
797 static int
798 vm_fault_readahead(struct faultstate *fs)
799 {
800 int era, nera;
801 u_char behavior;
802
803 KASSERT(fs->lookup_still_valid, ("map unlocked"));
804 era = fs->entry->read_ahead;
805 behavior = vm_map_entry_behavior(fs->entry);
806 if (behavior == MAP_ENTRY_BEHAV_RANDOM) {
807 nera = 0;
808 } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
809 nera = VM_FAULT_READ_AHEAD_MAX;
810 if (fs->vaddr == fs->entry->next_read)
811 vm_fault_dontneed(fs, fs->vaddr, nera);
812 } else if (fs->vaddr == fs->entry->next_read) {
813 /*
814 * This is a sequential fault. Arithmetically
815 * increase the requested number of pages in
816 * the read-ahead window. The requested
817 * number of pages is "# of sequential faults
818 * x (read ahead min + 1) + read ahead min"
819 */
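/*
 * Concretely: the first sequential fault requests VM_FAULT_READ_AHEAD_MIN
 * pages, the second requests 2 * VM_FAULT_READ_AHEAD_MIN + 1, the third
 * 3 * VM_FAULT_READ_AHEAD_MIN + 2, and so on, since each step below computes
 * nera = era + VM_FAULT_READ_AHEAD_MIN + 1 from the previously saved value,
 * until the window is clamped at VM_FAULT_READ_AHEAD_MAX.
 */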
820 nera = VM_FAULT_READ_AHEAD_MIN;
821 if (era > 0) {
822 nera += era + 1;
823 if (nera > VM_FAULT_READ_AHEAD_MAX)
824 nera = VM_FAULT_READ_AHEAD_MAX;
825 }
826 if (era == VM_FAULT_READ_AHEAD_MAX)
827 vm_fault_dontneed(fs, fs->vaddr, nera);
828 } else {
829 /*
830 * This is a non-sequential fault.
831 */
832 nera = 0;
833 }
834 if (era != nera) {
835 /*
836 * A read lock on the map suffices to update
837 * the read ahead count safely.
838 */
839 fs->entry->read_ahead = nera;
840 }
841
842 return (nera);
843 }
844
845 static int
846 vm_fault_lookup(struct faultstate *fs)
847 {
848 int result;
849
850 KASSERT(!fs->lookup_still_valid,
851 ("vm_fault_lookup: Map already locked."));
852 result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type |
853 VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object,
854 &fs->first_pindex, &fs->prot, &fs->wired);
855 if (result != KERN_SUCCESS) {
856 unlock_vp(fs);
857 return (result);
858 }
859
860 fs->map_generation = fs->map->timestamp;
861
862 if (fs->entry->eflags & MAP_ENTRY_NOFAULT) {
863 panic("%s: fault on nofault entry, addr: %#lx",
864 __func__, (u_long)fs->vaddr);
865 }
866
867 if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION &&
868 fs->entry->wiring_thread != curthread) {
869 vm_map_unlock_read(fs->map);
870 vm_map_lock(fs->map);
871 if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) &&
872 (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
873 unlock_vp(fs);
874 fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
875 vm_map_unlock_and_wait(fs->map, 0);
876 } else
877 vm_map_unlock(fs->map);
878 return (KERN_RESOURCE_SHORTAGE);
879 }
880
881 MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0);
882
883 if (fs->wired)
884 fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY);
885 else
886 KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0,
887 ("!fs->wired && VM_FAULT_WIRE"));
888 fs->lookup_still_valid = true;
889
890 return (KERN_SUCCESS);
891 }
892
893 static int
894 vm_fault_relookup(struct faultstate *fs)
895 {
896 vm_object_t retry_object;
897 vm_pindex_t retry_pindex;
898 vm_prot_t retry_prot;
899 int result;
900
901 if (!vm_map_trylock_read(fs->map))
902 return (KERN_RESTART);
903
904 fs->lookup_still_valid = true;
905 if (fs->map->timestamp == fs->map_generation)
906 return (KERN_SUCCESS);
907
908 result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type,
909 &fs->entry, &retry_object, &retry_pindex, &retry_prot,
910 &fs->wired);
911 if (result != KERN_SUCCESS) {
912 /*
913 * If retry of map lookup would have blocked then
914 * retry fault from start.
915 */
916 if (result == KERN_FAILURE)
917 return (KERN_RESTART);
918 return (result);
919 }
920 if (retry_object != fs->first_object ||
921 retry_pindex != fs->first_pindex)
922 return (KERN_RESTART);
923
924 /*
925 * Check whether the protection has changed or the object has
926 * been copied while we left the map unlocked. Changing from
927 * read to write permission is OK - we leave the page
928 * write-protected, and catch the write fault. Changing from
929 * write to read permission means that we can't mark the page
930 * write-enabled after all.
931 */
932 fs->prot &= retry_prot;
933 fs->fault_type &= retry_prot;
934 if (fs->prot == 0)
935 return (KERN_RESTART);
936
937 /* Reassert because wired may have changed. */
938 KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0,
939 ("!wired && VM_FAULT_WIRE"));
940
941 return (KERN_SUCCESS);
942 }
943
944 static void
945 vm_fault_cow(struct faultstate *fs)
946 {
947 bool is_first_object_locked;
948
949 KASSERT(fs->object != fs->first_object,
950 ("source and target COW objects are identical"));
951
952 /*
953 * This allows pages to be virtually copied from a backing_object
954 * into the first_object, where the backing object has no other
955 * refs to it, and cannot gain any more refs. Instead of a bcopy,
956 * we just move the page from the backing object to the first
957 * object. Note that we must mark the page dirty in the first
958 * object so that it will go out to swap when needed.
959 */
960 is_first_object_locked = false;
961 if (
962 /*
963 * Only one shadow object and no other refs.
964 */
965 fs->object->shadow_count == 1 && fs->object->ref_count == 1 &&
966 /*
967 * No other ways to look the object up
968 */
969 fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 &&
970 /*
971 * We don't chase down the shadow chain and we can acquire locks.
972 */
973 (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) &&
974 fs->object == fs->first_object->backing_object &&
975 VM_OBJECT_TRYWLOCK(fs->object)) {
976 /*
977 * Remove but keep xbusy for replace. fs->m is moved into
978 * fs->first_object and left busy while fs->first_m is
979 * conditionally freed.
980 */
981 vm_page_remove_xbusy(fs->m);
982 vm_page_replace(fs->m, fs->first_object, fs->first_pindex,
983 fs->first_m);
984 vm_page_dirty(fs->m);
985 #if VM_NRESERVLEVEL > 0
986 /*
987 * Rename the reservation.
988 */
989 vm_reserv_rename(fs->m, fs->first_object, fs->object,
990 OFF_TO_IDX(fs->first_object->backing_object_offset));
991 #endif
992 VM_OBJECT_WUNLOCK(fs->object);
993 VM_OBJECT_WUNLOCK(fs->first_object);
994 fs->first_m = fs->m;
995 fs->m = NULL;
996 VM_CNT_INC(v_cow_optim);
997 } else {
998 if (is_first_object_locked)
999 VM_OBJECT_WUNLOCK(fs->first_object);
1000 /*
1001 		 * Oh, well, let's copy it.
1002 */
1003 pmap_copy_page(fs->m, fs->first_m);
1004 vm_page_valid(fs->first_m);
1005 if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) {
1006 vm_page_wire(fs->first_m);
1007 vm_page_unwire(fs->m, PQ_INACTIVE);
1008 }
1009 /*
1010 * Save the cow page to be released after
1011 * pmap_enter is complete.
1012 */
1013 fs->m_cow = fs->m;
1014 fs->m = NULL;
1015
1016 /*
1017 * Typically, the shadow object is either private to this
1018 * address space (OBJ_ONEMAPPING) or its pages are read only.
1019 * In the highly unusual case where the pages of a shadow object
1020 * are read/write shared between this and other address spaces,
1021 * we need to ensure that any pmap-level mappings to the
1022 * original, copy-on-write page from the backing object are
1023 * removed from those other address spaces.
1024 *
1025 * The flag check is racy, but this is tolerable: if
1026 * OBJ_ONEMAPPING is cleared after the check, the busy state
1027 * ensures that new mappings of m_cow can't be created.
1028 * pmap_enter() will replace an existing mapping in the current
1029 * address space. If OBJ_ONEMAPPING is set after the check,
1030 		 * removing mappings will at worst trigger some unnecessary page
1031 * faults.
1032 */
1033 vm_page_assert_xbusied(fs->m_cow);
1034 if ((fs->first_object->flags & OBJ_ONEMAPPING) == 0)
1035 pmap_remove_all(fs->m_cow);
1036 }
1037
1038 vm_object_pip_wakeup(fs->object);
1039
1040 /*
1041 * Only use the new page below...
1042 */
1043 fs->object = fs->first_object;
1044 fs->pindex = fs->first_pindex;
1045 fs->m = fs->first_m;
1046 VM_CNT_INC(v_cow_faults);
1047 curthread->td_cow++;
1048 }
1049
1050 static bool
1051 vm_fault_next(struct faultstate *fs)
1052 {
1053 vm_object_t next_object;
1054
1055 /*
1056 * The requested page does not exist at this object/
1057 * offset. Remove the invalid page from the object,
1058 * waking up anyone waiting for it, and continue on to
1059 * the next object. However, if this is the top-level
1060 * object, we must leave the busy page in place to
1061 * prevent another process from rushing past us, and
1062 * inserting the page in that object at the same time
1063 * that we are.
1064 */
1065 if (fs->object == fs->first_object) {
1066 fs->first_m = fs->m;
1067 fs->m = NULL;
1068 } else
1069 fault_page_free(&fs->m);
1070
1071 /*
1072 * Move on to the next object. Lock the next object before
1073 * unlocking the current one.
1074 */
1075 VM_OBJECT_ASSERT_WLOCKED(fs->object);
1076 next_object = fs->object->backing_object;
1077 if (next_object == NULL)
1078 return (false);
1079 MPASS(fs->first_m != NULL);
1080 KASSERT(fs->object != next_object, ("object loop %p", next_object));
1081 VM_OBJECT_WLOCK(next_object);
1082 vm_object_pip_add(next_object, 1);
1083 if (fs->object != fs->first_object)
1084 vm_object_pip_wakeup(fs->object);
1085 fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset);
1086 VM_OBJECT_WUNLOCK(fs->object);
1087 fs->object = next_object;
1088
1089 return (true);
1090 }
1091
1092 static void
1093 vm_fault_zerofill(struct faultstate *fs)
1094 {
1095
1096 /*
1097 * If there's no object left, fill the page in the top
1098 * object with zeros.
1099 */
1100 if (fs->object != fs->first_object) {
1101 vm_object_pip_wakeup(fs->object);
1102 fs->object = fs->first_object;
1103 fs->pindex = fs->first_pindex;
1104 }
1105 MPASS(fs->first_m != NULL);
1106 MPASS(fs->m == NULL);
1107 fs->m = fs->first_m;
1108 fs->first_m = NULL;
1109
1110 /*
1111 * Zero the page if necessary and mark it valid.
1112 */
1113 if ((fs->m->flags & PG_ZERO) == 0) {
1114 pmap_zero_page(fs->m);
1115 } else {
1116 VM_CNT_INC(v_ozfod);
1117 }
1118 VM_CNT_INC(v_zfod);
1119 vm_page_valid(fs->m);
1120 }
1121
1122 /*
1123  * Handle a failed page allocation, starting OOM handling after a timeout.
1124  * Returns true if the caller should do vm_waitpfault() after the call.
1125 */
1126 static bool
1127 vm_fault_allocate_oom(struct faultstate *fs)
1128 {
1129 struct timeval now;
1130
1131 unlock_and_deallocate(fs);
1132 if (vm_pfault_oom_attempts < 0)
1133 return (true);
1134 if (!fs->oom_started) {
1135 fs->oom_started = true;
1136 getmicrotime(&fs->oom_start_time);
1137 return (true);
1138 }
1139
1140 getmicrotime(&now);
1141 timevalsub(&now, &fs->oom_start_time);
1142 if (now.tv_sec < vm_pfault_oom_attempts * vm_pfault_oom_wait)
1143 return (true);
1144
1145 if (bootverbose)
1146 printf(
1147 "proc %d (%s) failed to alloc page on fault, starting OOM\n",
1148 curproc->p_pid, curproc->p_comm);
1149 vm_pageout_oom(VM_OOM_MEM_PF);
1150 fs->oom_started = false;
1151 return (false);
1152 }
1153
1154 /*
1155 * Allocate a page directly or via the object populate method.
1156 */
1157 static enum fault_status
1158 vm_fault_allocate(struct faultstate *fs)
1159 {
1160 struct domainset *dset;
1161 enum fault_status res;
1162
1163 if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) {
1164 res = vm_fault_lock_vnode(fs, true);
1165 MPASS(res == FAULT_CONTINUE || res == FAULT_RESTART);
1166 if (res == FAULT_RESTART)
1167 return (res);
1168 }
1169
1170 if (fs->pindex >= fs->object->size) {
1171 unlock_and_deallocate(fs);
1172 return (FAULT_OUT_OF_BOUNDS);
1173 }
1174
1175 if (fs->object == fs->first_object &&
1176 (fs->first_object->flags & OBJ_POPULATE) != 0 &&
1177 fs->first_object->shadow_count == 0) {
1178 res = vm_fault_populate(fs);
1179 switch (res) {
1180 case FAULT_SUCCESS:
1181 case FAULT_FAILURE:
1182 case FAULT_RESTART:
1183 unlock_and_deallocate(fs);
1184 return (res);
1185 case FAULT_CONTINUE:
1186 /*
1187 * Pager's populate() method
1188 * returned VM_PAGER_BAD.
1189 */
1190 break;
1191 default:
1192 panic("inconsistent return codes");
1193 }
1194 }
1195
1196 /*
1197 * Allocate a new page for this object/offset pair.
1198 *
1199 * If the process has a fatal signal pending, prioritize the allocation
1200 * with the expectation that the process will exit shortly and free some
1201 * pages. In particular, the signal may have been posted by the page
1202 * daemon in an attempt to resolve an out-of-memory condition.
1203 *
1204 * The unlocked read of the p_flag is harmless. At worst, the P_KILLED
1205  * might not be observed here and the allocation fails, causing a restart
1206  * and a new read of p_flag.
1207 */
1208 dset = fs->object->domain.dr_policy;
1209 if (dset == NULL)
1210 dset = curthread->td_domain.dr_policy;
1211 if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) {
1212 #if VM_NRESERVLEVEL > 0
1213 vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex);
1214 #endif
1215 if (!vm_pager_can_alloc_page(fs->object, fs->pindex)) {
1216 unlock_and_deallocate(fs);
1217 return (FAULT_FAILURE);
1218 }
1219 fs->m = vm_page_alloc(fs->object, fs->pindex,
1220 P_KILLED(curproc) ? VM_ALLOC_SYSTEM : 0);
1221 }
1222 if (fs->m == NULL) {
1223 if (vm_fault_allocate_oom(fs))
1224 vm_waitpfault(dset, vm_pfault_oom_wait * hz);
1225 return (FAULT_RESTART);
1226 }
1227 fs->oom_started = false;
1228
1229 return (FAULT_CONTINUE);
1230 }
1231
1232 /*
1233 * Call the pager to retrieve the page if there is a chance
1234 * that the pager has it, and potentially retrieve additional
1235 * pages at the same time.
1236 */
1237 static enum fault_status
1238 vm_fault_getpages(struct faultstate *fs, int *behindp, int *aheadp)
1239 {
1240 vm_offset_t e_end, e_start;
1241 int ahead, behind, cluster_offset, rv;
1242 enum fault_status status;
1243 u_char behavior;
1244
1245 /*
1246 * Prepare for unlocking the map. Save the map
1247 * entry's start and end addresses, which are used to
1248 * optimize the size of the pager operation below.
1249 * Even if the map entry's addresses change after
1250 * unlocking the map, using the saved addresses is
1251 * safe.
1252 */
1253 e_start = fs->entry->start;
1254 e_end = fs->entry->end;
1255 behavior = vm_map_entry_behavior(fs->entry);
1256
1257 /*
1258 * If the pager for the current object might have
1259 * the page, then determine the number of additional
1260 * pages to read and potentially reprioritize
1261 * previously read pages for earlier reclamation.
1262 * These operations should only be performed once per
1263 * page fault. Even if the current pager doesn't
1264 * have the page, the number of additional pages to
1265 * read will apply to subsequent objects in the
1266 * shadow chain.
1267 */
1268 if (fs->nera == -1 && !P_KILLED(curproc))
1269 fs->nera = vm_fault_readahead(fs);
1270
1271 /*
1272 * Release the map lock before locking the vnode or
1273 * sleeping in the pager. (If the current object has
1274 * a shadow, then an earlier iteration of this loop
1275 * may have already unlocked the map.)
1276 */
1277 unlock_map(fs);
1278
1279 status = vm_fault_lock_vnode(fs, false);
1280 MPASS(status == FAULT_CONTINUE || status == FAULT_RESTART);
1281 if (status == FAULT_RESTART)
1282 return (status);
1283 KASSERT(fs->vp == NULL || !fs->map->system_map,
1284 ("vm_fault: vnode-backed object mapped by system map"));
1285
1286 /*
1287 	 * Page in the requested page and hint to the pager
1288 	 * that it may bring in surrounding pages.
1289 */
1290 if (fs->nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM ||
1291 P_KILLED(curproc)) {
1292 behind = 0;
1293 ahead = 0;
1294 } else {
1295 /* Is this a sequential fault? */
1296 if (fs->nera > 0) {
1297 behind = 0;
1298 ahead = fs->nera;
1299 } else {
1300 /*
1301 * Request a cluster of pages that is
1302 * aligned to a VM_FAULT_READ_DEFAULT
1303 * page offset boundary within the
1304 * object. Alignment to a page offset
1305 * boundary is more likely to coincide
1306 * with the underlying file system
1307 * block than alignment to a virtual
1308 * address boundary.
1309 */
1310 cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT;
1311 behind = ulmin(cluster_offset,
1312 atop(fs->vaddr - e_start));
1313 ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset;
1314 }
1315 ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1);
1316 }
1317 *behindp = behind;
1318 *aheadp = ahead;
1319 rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp);
1320 if (rv == VM_PAGER_OK)
1321 return (FAULT_HARD);
1322 if (rv == VM_PAGER_ERROR)
1323 printf("vm_fault: pager read error, pid %d (%s)\n",
1324 curproc->p_pid, curproc->p_comm);
1325 /*
1326 * If an I/O error occurred or the requested page was
1327 * outside the range of the pager, clean up and return
1328 * an error.
1329 */
1330 if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) {
1331 VM_OBJECT_WLOCK(fs->object);
1332 fault_page_free(&fs->m);
1333 unlock_and_deallocate(fs);
1334 return (FAULT_OUT_OF_BOUNDS);
1335 }
1336 KASSERT(rv == VM_PAGER_FAIL,
1337 ("%s: unexpected pager error %d", __func__, rv));
1338 return (FAULT_CONTINUE);
1339 }
1340
1341 /*
1342 * Wait/Retry if the page is busy. We have to do this if the page is
1343 * either exclusive or shared busy because the vm_pager may be using
1344 * read busy for pageouts (and even pageins if it is the vnode pager),
1345 * and we could end up trying to pagein and pageout the same page
1346 * simultaneously.
1347 *
1348 * We can theoretically allow the busy case on a read fault if the page
1349 * is marked valid, but since such pages are typically already pmap'd,
1350  * putting that special case in might be more effort than it is worth.
1351 * We cannot under any circumstances mess around with a shared busied
1352 * page except, perhaps, to pmap it.
1353 */
1354 static void
1355 vm_fault_busy_sleep(struct faultstate *fs)
1356 {
1357 /*
1358 * Reference the page before unlocking and
1359 * sleeping so that the page daemon is less
1360 * likely to reclaim it.
1361 */
1362 vm_page_aflag_set(fs->m, PGA_REFERENCED);
1363 if (fs->object != fs->first_object) {
1364 fault_page_release(&fs->first_m);
1365 vm_object_pip_wakeup(fs->first_object);
1366 }
1367 vm_object_pip_wakeup(fs->object);
1368 unlock_map(fs);
1369 if (fs->m != vm_page_lookup(fs->object, fs->pindex) ||
1370 !vm_page_busy_sleep(fs->m, "vmpfw", 0))
1371 VM_OBJECT_WUNLOCK(fs->object);
1372 VM_CNT_INC(v_intrans);
1373 vm_object_deallocate(fs->first_object);
1374 }
1375
1376 /*
1377 * Handle page lookup, populate, allocate, page-in for the current
1378 * object.
1379 *
1380 * The object is locked on entry and will remain locked with a return
1381 * code of FAULT_CONTINUE so that fault may follow the shadow chain.
1382 * Otherwise, the object will be unlocked upon return.
1383 */
1384 static enum fault_status
1385 vm_fault_object(struct faultstate *fs, int *behindp, int *aheadp)
1386 {
1387 enum fault_status res;
1388 bool dead;
1389
1390 /*
1391 * If the object is marked for imminent termination, we retry
1392 * here, since the collapse pass has raced with us. Otherwise,
1393 	 * if we see a terminally dead object, return failure.
1394 */
1395 if ((fs->object->flags & OBJ_DEAD) != 0) {
1396 dead = fs->object->type == OBJT_DEAD;
1397 unlock_and_deallocate(fs);
1398 if (dead)
1399 return (FAULT_PROTECTION_FAILURE);
1400 pause("vmf_de", 1);
1401 return (FAULT_RESTART);
1402 }
1403
1404 /*
1405 * See if the page is resident.
1406 */
1407 fs->m = vm_page_lookup(fs->object, fs->pindex);
1408 if (fs->m != NULL) {
1409 if (!vm_page_tryxbusy(fs->m)) {
1410 vm_fault_busy_sleep(fs);
1411 return (FAULT_RESTART);
1412 }
1413
1414 /*
1415 * The page is marked busy for other processes and the
1416 * pagedaemon. If it is still completely valid we are
1417 * done.
1418 */
1419 if (vm_page_all_valid(fs->m)) {
1420 VM_OBJECT_WUNLOCK(fs->object);
1421 return (FAULT_SOFT);
1422 }
1423 }
1424 VM_OBJECT_ASSERT_WLOCKED(fs->object);
1425
1426 /*
1427 * Page is not resident. If the pager might contain the page
1428 * or this is the beginning of the search, allocate a new
1429 * page.
1430 */
1431 if (fs->m == NULL && (fault_object_needs_getpages(fs->object) ||
1432 fs->object == fs->first_object)) {
1433 res = vm_fault_allocate(fs);
1434 if (res != FAULT_CONTINUE)
1435 return (res);
1436 }
1437
1438 /*
1439 * Default objects have no pager so no exclusive busy exists
1440 * to protect this page in the chain. Skip to the next
1441 * object without dropping the lock to preserve atomicity of
1442 * shadow faults.
1443 */
1444 if (fault_object_needs_getpages(fs->object)) {
1445 /*
1446 * At this point, we have either allocated a new page
1447 * or found an existing page that is only partially
1448 * valid.
1449 *
1450 * We hold a reference on the current object and the
1451 * page is exclusive busied. The exclusive busy
1452 * prevents simultaneous faults and collapses while
1453 * the object lock is dropped.
1454 */
1455 VM_OBJECT_WUNLOCK(fs->object);
1456 res = vm_fault_getpages(fs, behindp, aheadp);
1457 if (res == FAULT_CONTINUE)
1458 VM_OBJECT_WLOCK(fs->object);
1459 } else {
1460 res = FAULT_CONTINUE;
1461 }
1462 return (res);
1463 }
1464
1465 int
1466 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
1467 int fault_flags, vm_page_t *m_hold)
1468 {
1469 struct faultstate fs;
1470 int ahead, behind, faultcount, rv;
1471 enum fault_status res;
1472 bool hardfault;
1473
1474 VM_CNT_INC(v_vm_faults);
1475
1476 if ((curthread->td_pflags & TDP_NOFAULTING) != 0)
1477 return (KERN_PROTECTION_FAILURE);
1478
1479 fs.vp = NULL;
1480 fs.vaddr = vaddr;
1481 fs.m_hold = m_hold;
1482 fs.fault_flags = fault_flags;
1483 fs.map = map;
1484 fs.lookup_still_valid = false;
1485 fs.oom_started = false;
1486 fs.nera = -1;
1487 faultcount = 0;
1488 hardfault = false;
1489
1490 RetryFault:
1491 fs.fault_type = fault_type;
1492
1493 /*
1494 * Find the backing store object and offset into it to begin the
1495 * search.
1496 */
1497 rv = vm_fault_lookup(&fs);
1498 if (rv != KERN_SUCCESS) {
1499 if (rv == KERN_RESOURCE_SHORTAGE)
1500 goto RetryFault;
1501 return (rv);
1502 }
1503
1504 /*
1505 * Try to avoid lock contention on the top-level object through
1506 * special-case handling of some types of page faults, specifically,
1507 * those that are mapping an existing page from the top-level object.
1508 * Under this condition, a read lock on the object suffices, allowing
1509 * multiple page faults of a similar type to run in parallel.
1510 */
1511 if (fs.vp == NULL /* avoid locked vnode leak */ &&
1512 (fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) == 0 &&
1513 (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) {
1514 VM_OBJECT_RLOCK(fs.first_object);
1515 res = vm_fault_soft_fast(&fs);
1516 if (res == FAULT_SUCCESS)
1517 return (KERN_SUCCESS);
1518 if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) {
1519 VM_OBJECT_RUNLOCK(fs.first_object);
1520 VM_OBJECT_WLOCK(fs.first_object);
1521 }
1522 } else {
1523 VM_OBJECT_WLOCK(fs.first_object);
1524 }
1525
1526 /*
1527 * Make a reference to this object to prevent its disposal while we
1528 * are messing with it. Once we have the reference, the map is free
1529 * to be diddled. Since objects reference their shadows (and copies),
1530 * they will stay around as well.
1531 *
1532 * Bump the paging-in-progress count to prevent size changes (e.g.
1533 * truncation operations) during I/O.
1534 */
1535 vm_object_reference_locked(fs.first_object);
1536 vm_object_pip_add(fs.first_object, 1);
1537
1538 fs.m_cow = fs.m = fs.first_m = NULL;
1539
1540 /*
1541 * Search for the page at object/offset.
1542 */
1543 fs.object = fs.first_object;
1544 fs.pindex = fs.first_pindex;
1545
1546 if ((fs.entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) != 0) {
1547 res = vm_fault_allocate(&fs);
1548 switch (res) {
1549 case FAULT_RESTART:
1550 goto RetryFault;
1551 case FAULT_SUCCESS:
1552 return (KERN_SUCCESS);
1553 case FAULT_FAILURE:
1554 return (KERN_FAILURE);
1555 case FAULT_OUT_OF_BOUNDS:
1556 return (KERN_OUT_OF_BOUNDS);
1557 case FAULT_CONTINUE:
1558 break;
1559 default:
1560 panic("vm_fault: Unhandled status %d", res);
1561 }
1562 }
1563
1564 while (TRUE) {
1565 KASSERT(fs.m == NULL,
1566 ("page still set %p at loop start", fs.m));
1567
1568 res = vm_fault_object(&fs, &behind, &ahead);
1569 switch (res) {
1570 case FAULT_SOFT:
1571 goto found;
1572 case FAULT_HARD:
1573 faultcount = behind + 1 + ahead;
1574 hardfault = true;
1575 goto found;
1576 case FAULT_RESTART:
1577 goto RetryFault;
1578 case FAULT_SUCCESS:
1579 return (KERN_SUCCESS);
1580 case FAULT_FAILURE:
1581 return (KERN_FAILURE);
1582 case FAULT_OUT_OF_BOUNDS:
1583 return (KERN_OUT_OF_BOUNDS);
1584 case FAULT_PROTECTION_FAILURE:
1585 return (KERN_PROTECTION_FAILURE);
1586 case FAULT_CONTINUE:
1587 break;
1588 default:
1589 panic("vm_fault: Unhandled status %d", res);
1590 }
1591
1592 /*
1593 * The page was not found in the current object. Try to
1594 * traverse into a backing object or zero fill if none is
1595 * found.
1596 */
1597 if (vm_fault_next(&fs))
1598 continue;
1599 if ((fs.fault_flags & VM_FAULT_NOFILL) != 0) {
1600 if (fs.first_object == fs.object)
1601 fault_page_free(&fs.first_m);
1602 unlock_and_deallocate(&fs);
1603 return (KERN_OUT_OF_BOUNDS);
1604 }
1605 VM_OBJECT_WUNLOCK(fs.object);
1606 vm_fault_zerofill(&fs);
1607 /* Don't try to prefault neighboring pages. */
1608 faultcount = 1;
1609 break;
1610 }
1611
1612 found:
1613 /*
1614 * A valid page has been found and exclusively busied. The
1615 * object lock must no longer be held.
1616 */
1617 vm_page_assert_xbusied(fs.m);
1618 VM_OBJECT_ASSERT_UNLOCKED(fs.object);
1619
1620 /*
1621 * If the page is being written, but isn't already owned by the
1622 * top-level object, we have to copy it into a new page owned by the
1623 * top-level object.
1624 */
1625 if (fs.object != fs.first_object) {
1626 /*
1627 * We only really need to copy if we want to write it.
1628 */
1629 if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
1630 vm_fault_cow(&fs);
1631 /*
1632 * We only try to prefault read-only mappings to the
1633 * neighboring pages when this copy-on-write fault is
1634 * a hard fault. In other cases, trying to prefault
1635 * is typically wasted effort.
1636 */
1637 if (faultcount == 0)
1638 faultcount = 1;
1639
1640 } else {
1641 fs.prot &= ~VM_PROT_WRITE;
1642 }
1643 }
1644
1645 /*
1646 * We must verify that the maps have not changed since our last
1647 * lookup.
1648 */
1649 if (!fs.lookup_still_valid) {
1650 rv = vm_fault_relookup(&fs);
1651 if (rv != KERN_SUCCESS) {
1652 fault_deallocate(&fs);
1653 if (rv == KERN_RESTART)
1654 goto RetryFault;
1655 return (rv);
1656 }
1657 }
1658 VM_OBJECT_ASSERT_UNLOCKED(fs.object);
1659
1660 /*
1661 * If the page was filled by a pager, save the virtual address that
1662 * should be faulted on next under a sequential access pattern to the
1663 * map entry. A read lock on the map suffices to update this address
1664 * safely.
1665 */
1666 if (hardfault)
1667 fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE;
1668
1669 /*
1670 * Page must be completely valid or it is not fit to
1671 * map into user space. vm_pager_get_pages() ensures this.
1672 */
1673 vm_page_assert_xbusied(fs.m);
1674 KASSERT(vm_page_all_valid(fs.m),
1675 ("vm_fault: page %p partially invalid", fs.m));
1676
1677 vm_fault_dirty(&fs, fs.m);
1678
1679 /*
1680 * Put this page into the physical map. We had to do the unlock above
1681 * because pmap_enter() may sleep. We don't put the page
1682 * back on the active queue until later so that the pageout daemon
1683 * won't find it (yet).
1684 */
1685 pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot,
1686 fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0);
1687 if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 &&
1688 fs.wired == 0)
1689 vm_fault_prefault(&fs, vaddr,
1690 faultcount > 0 ? behind : PFBAK,
1691 faultcount > 0 ? ahead : PFFOR, false);
1692
1693 /*
1694 * If the page is not wired down, then put it where the pageout daemon
1695 * can find it.
1696 */
1697 if ((fs.fault_flags & VM_FAULT_WIRE) != 0)
1698 vm_page_wire(fs.m);
1699 else
1700 vm_page_activate(fs.m);
1701 if (fs.m_hold != NULL) {
1702 (*fs.m_hold) = fs.m;
1703 vm_page_wire(fs.m);
1704 }
1705 vm_page_xunbusy(fs.m);
1706 fs.m = NULL;
1707
1708 /*
1709 * Unlock everything, and return
1710 */
1711 fault_deallocate(&fs);
1712 if (hardfault) {
1713 VM_CNT_INC(v_io_faults);
1714 curthread->td_ru.ru_majflt++;
1715 #ifdef RACCT
1716 if (racct_enable && fs.object->type == OBJT_VNODE) {
1717 PROC_LOCK(curproc);
1718 if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
1719 racct_add_force(curproc, RACCT_WRITEBPS,
1720 PAGE_SIZE + behind * PAGE_SIZE);
1721 racct_add_force(curproc, RACCT_WRITEIOPS, 1);
1722 } else {
1723 racct_add_force(curproc, RACCT_READBPS,
1724 PAGE_SIZE + ahead * PAGE_SIZE);
1725 racct_add_force(curproc, RACCT_READIOPS, 1);
1726 }
1727 PROC_UNLOCK(curproc);
1728 }
1729 #endif
1730 } else
1731 curthread->td_ru.ru_minflt++;
1732
1733 return (KERN_SUCCESS);
1734 }
1735
1736 /*
1737 * Speed up the reclamation of pages that precede the faulting pindex within
1738 * the first object of the shadow chain. Essentially, perform the equivalent
1739 * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes
1740 * the faulting pindex by the cluster size when the pages read by vm_fault()
1741 * cross a cluster-size boundary. The cluster size is the greater of the
1742 * smallest superpage size and VM_FAULT_DONTNEED_MIN.
1743 *
1744 * When "fs->first_object" is a shadow object, the pages in the backing object
1745 * that precede the faulting pindex are deactivated by vm_fault(). So, this
1746 * function must only be concerned with pages in the first object.
1747 */
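/*
 * For example, on amd64, where pagesizes[1] is the 2MB superpage size, the
 * cluster size below is 2MB; on configurations without superpages it falls
 * back to VM_FAULT_DONTNEED_MIN (1MB).
 */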
1748 static void
1749 vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead)
1750 {
1751 vm_map_entry_t entry;
1752 vm_object_t first_object;
1753 vm_offset_t end, start;
1754 vm_page_t m, m_next;
1755 vm_pindex_t pend, pstart;
1756 vm_size_t size;
1757
1758 VM_OBJECT_ASSERT_UNLOCKED(fs->object);
1759 first_object = fs->first_object;
1760 /* Neither fictitious nor unmanaged pages can be reclaimed. */
1761 if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
1762 VM_OBJECT_RLOCK(first_object);
1763 size = VM_FAULT_DONTNEED_MIN;
1764 if (MAXPAGESIZES > 1 && size < pagesizes[1])
1765 size = pagesizes[1];
1766 end = rounddown2(vaddr, size);
1767 if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) &&
1768 (entry = fs->entry)->start < end) {
1769 if (end - entry->start < size)
1770 start = entry->start;
1771 else
1772 start = end - size;
1773 pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED);
1774 pstart = OFF_TO_IDX(entry->offset) + atop(start -
1775 entry->start);
1776 m_next = vm_page_find_least(first_object, pstart);
1777 pend = OFF_TO_IDX(entry->offset) + atop(end -
1778 entry->start);
1779 while ((m = m_next) != NULL && m->pindex < pend) {
1780 m_next = TAILQ_NEXT(m, listq);
1781 if (!vm_page_all_valid(m) ||
1782 vm_page_busied(m))
1783 continue;
1784
1785 /*
1786 * Don't clear PGA_REFERENCED, since it would
1787 * likely represent a reference by a different
1788 * process.
1789 *
1790 * Typically, at this point, prefetched pages
1791 * are still in the inactive queue. Only
1792 * pages that triggered page faults are in the
1793 * active queue. The test for whether the page
1794 * is in the inactive queue is racy; in the
1795 * worst case we will requeue the page
1796 * unnecessarily.
1797 */
1798 if (!vm_page_inactive(m))
1799 vm_page_deactivate(m);
1800 }
1801 }
1802 VM_OBJECT_RUNLOCK(first_object);
1803 }
1804 }
1805
1806 /*
1807 * vm_fault_prefault provides a quick way of clustering
1808  * page faults into a process's address space.  It is a "cousin"
1809 * of vm_map_pmap_enter, except it runs at page fault time instead
1810 * of mmap time.
1811 */
1812 static void
1813 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
1814 int backward, int forward, bool obj_locked)
1815 {
1816 pmap_t pmap;
1817 vm_map_entry_t entry;
1818 vm_object_t backing_object, lobject;
1819 vm_offset_t addr, starta;
1820 vm_pindex_t pindex;
1821 vm_page_t m;
1822 int i;
1823
1824 pmap = fs->map->pmap;
1825 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
1826 return;
1827
1828 entry = fs->entry;
1829
1830 if (addra < backward * PAGE_SIZE) {
1831 starta = entry->start;
1832 } else {
1833 starta = addra - backward * PAGE_SIZE;
1834 if (starta < entry->start)
1835 starta = entry->start;
1836 }
1837
1838 /*
1839 * Generate the sequence of virtual addresses that are candidates for
1840 * prefaulting in an outward spiral from the faulting virtual address,
1841 * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra
1842 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
1843 * If the candidate address doesn't have a backing physical page, then
1844 * the loop immediately terminates.
1845 */
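/*
 * For example, with the defaults backward = PFBAK = 4 and forward = PFFOR = 4,
 * up to eight candidates are generated in the order addra - PAGE_SIZE,
 * addra + PAGE_SIZE, addra - 2 * PAGE_SIZE, ..., addra + 4 * PAGE_SIZE;
 * candidates that fall outside [starta, entry->end) are simply skipped.
 */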
1846 for (i = 0; i < 2 * imax(backward, forward); i++) {
1847 addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
1848 PAGE_SIZE);
1849 if (addr > addra + forward * PAGE_SIZE)
1850 addr = 0;
1851
1852 if (addr < starta || addr >= entry->end)
1853 continue;
1854
1855 if (!pmap_is_prefaultable(pmap, addr))
1856 continue;
1857
1858 pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
1859 lobject = entry->object.vm_object;
1860 if (!obj_locked)
1861 VM_OBJECT_RLOCK(lobject);
1862 while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
1863 !fault_object_needs_getpages(lobject) &&
1864 (backing_object = lobject->backing_object) != NULL) {
1865 KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
1866 0, ("vm_fault_prefault: unaligned object offset"));
1867 pindex += lobject->backing_object_offset >> PAGE_SHIFT;
1868 VM_OBJECT_RLOCK(backing_object);
1869 if (!obj_locked || lobject != entry->object.vm_object)
1870 VM_OBJECT_RUNLOCK(lobject);
1871 lobject = backing_object;
1872 }
1873 if (m == NULL) {
1874 if (!obj_locked || lobject != entry->object.vm_object)
1875 VM_OBJECT_RUNLOCK(lobject);
1876 break;
1877 }
1878 if (vm_page_all_valid(m) &&
1879 (m->flags & PG_FICTITIOUS) == 0)
1880 pmap_enter_quick(pmap, addr, m, entry->protection);
1881 if (!obj_locked || lobject != entry->object.vm_object)
1882 VM_OBJECT_RUNLOCK(lobject);
1883 }
1884 }
1885
1886 /*
1887 * Hold each of the physical pages that are mapped by the specified range of
1888 * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid
1889 * and allow the specified types of access, "prot". If all of the implied
1890 * pages are successfully held, then the number of held pages is returned
1891 * together with pointers to those pages in the array "ma". However, if any
1892 * of the pages cannot be held, -1 is returned.
1893 */
1894 int
1895 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
1896 vm_prot_t prot, vm_page_t *ma, int max_count)
1897 {
1898 vm_offset_t end, va;
1899 vm_page_t *mp;
1900 int count;
1901 boolean_t pmap_failed;
1902
1903 if (len == 0)
1904 return (0);
1905 end = round_page(addr + len);
1906 addr = trunc_page(addr);
1907
1908 if (!vm_map_range_valid(map, addr, end))
1909 return (-1);
1910
1911 if (atop(end - addr) > max_count)
1912 panic("vm_fault_quick_hold_pages: count > max_count");
1913 count = atop(end - addr);
1914
1915 /*
1916 * Most likely, the physical pages are resident in the pmap, so it is
1917 * faster to try pmap_extract_and_hold() first.
1918 */
1919 pmap_failed = FALSE;
1920 for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) {
1921 *mp = pmap_extract_and_hold(map->pmap, va, prot);
1922 if (*mp == NULL)
1923 pmap_failed = TRUE;
1924 else if ((prot & VM_PROT_WRITE) != 0 &&
1925 (*mp)->dirty != VM_PAGE_BITS_ALL) {
1926 /*
1927 * Explicitly dirty the physical page. Otherwise, the
1928 * caller's changes may go unnoticed because they are
1929 * performed through an unmanaged mapping or by a DMA
1930 * operation.
1931 *
1932 * The object lock is not held here.
1933 * See vm_page_clear_dirty_mask().
1934 */
1935 vm_page_dirty(*mp);
1936 }
1937 }
1938 if (pmap_failed) {
1939 /*
1940 * One or more pages could not be held by the pmap. Either no
1941 * page was mapped at the specified virtual address or that
1942 * mapping had insufficient permissions. Attempt to fault in
1943 * and hold these pages.
1944 *
1945 * If vm_fault_disable_pagefaults() was called,
1946 * i.e., TDP_NOFAULTING is set, we must not sleep nor
1947 * acquire MD VM locks, which means we must not call
1948 * vm_fault(). Some (out of tree) callers already mark
1949 * too wide a code area with vm_fault_disable_pagefaults();
1950 * use the VM_PROT_QUICK_NOFAULT flag to request the proper
1951 * behaviour explicitly.
1952 */
1953 if ((prot & VM_PROT_QUICK_NOFAULT) != 0 &&
1954 (curthread->td_pflags & TDP_NOFAULTING) != 0)
1955 goto error;
1956 for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE)
1957 if (*mp == NULL && vm_fault(map, va, prot,
1958 VM_FAULT_NORMAL, mp) != KERN_SUCCESS)
1959 goto error;
1960 }
1961 return (count);
1962 error:
1963 for (mp = ma; mp < ma + count; mp++)
1964 if (*mp != NULL)
1965 vm_page_unwire(*mp, PQ_INACTIVE);
1966 return (-1);
1967 }
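/*
 * Editor's sketch (not part of the original source): a hypothetical
 * caller of vm_fault_quick_hold_pages() that wires down a small user
 * buffer for kernel access and releases it afterwards.  The function
 * name, buffer size, and error handling are invented for illustration;
 * the caller must size the page array to cover the rounded range, or
 * the count check above will panic.
 */
static int
hold_user_buffer(vm_map_t map, vm_offset_t uaddr, vm_size_t len)
{
	vm_page_t pages[16];	/* >= atop(round_page(uaddr + len) -
				   trunc_page(uaddr)) entries */
	int npages;

	npages = vm_fault_quick_hold_pages(map, uaddr, len,
	    VM_PROT_READ | VM_PROT_WRITE, pages, nitems(pages));
	if (npages == -1)
		return (EFAULT);

	/* ... access the held pages, e.g. map them or start DMA ... */

	/* Drop the references taken by vm_fault_quick_hold_pages(). */
	vm_page_unhold_pages(pages, npages);
	return (0);
}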
1968
1969 /*
1970 * Routine:
1971 * vm_fault_copy_entry
1972 * Function:
1973 * Create a new object backing dst_entry with a private copy of
1974 * all underlying pages. When src_entry is equal to dst_entry, the
1975 * function implements COW for a wired-down map entry. Otherwise,
1976 * it forks a wired entry into dst_map.
1977 *
1978 * In/out conditions:
1979 * The source and destination maps must be locked for write.
1980 * The source map entry must be wired down (or be a sharing map
1981 * entry corresponding to a main map entry that is wired down).
1982 */
1983 void
1984 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map __unused,
1985 vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
1986 vm_ooffset_t *fork_charge)
1987 {
1988 vm_object_t backing_object, dst_object, object, src_object;
1989 vm_pindex_t dst_pindex, pindex, src_pindex;
1990 vm_prot_t access, prot;
1991 vm_offset_t vaddr;
1992 vm_page_t dst_m;
1993 vm_page_t src_m;
1994 bool upgrade;
1995
1996 upgrade = src_entry == dst_entry;
1997 KASSERT(upgrade || dst_entry->object.vm_object == NULL,
1998 ("vm_fault_copy_entry: vm_object not NULL"));
1999
2000 /*
2001 * If not an upgrade, then enter the mappings in the pmap as
2002 * read and/or execute accesses. Otherwise, enter them as
2003 * write accesses.
2004 *
2005 * A writeable large page mapping is only created if all of
2006 * the constituent small page mappings are modified. Marking
2007 * PTEs as modified on inception allows promotion to happen
2008 * without taking a potentially large number of soft faults.
2009 */
2010 access = prot = dst_entry->protection;
2011 if (!upgrade)
2012 access &= ~VM_PROT_WRITE;
2013
2014 src_object = src_entry->object.vm_object;
2015 src_pindex = OFF_TO_IDX(src_entry->offset);
2016
2017 if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
2018 dst_object = src_object;
2019 vm_object_reference(dst_object);
2020 } else {
2021 /*
2022 * Create the top-level object for the destination entry.
2023 * Doesn't actually shadow anything - we copy the pages
2024 * directly.
2025 */
2026 dst_object = vm_object_allocate_anon(atop(dst_entry->end -
2027 dst_entry->start), NULL, NULL, 0);
2028 #if VM_NRESERVLEVEL > 0
2029 dst_object->flags |= OBJ_COLORED;
2030 dst_object->pg_color = atop(dst_entry->start);
2031 #endif
2032 dst_object->domain = src_object->domain;
2033 dst_object->charge = dst_entry->end - dst_entry->start;
2034
2035 dst_entry->object.vm_object = dst_object;
2036 dst_entry->offset = 0;
2037 dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC;
2038 }
2039
2040 VM_OBJECT_WLOCK(dst_object);
2041 if (fork_charge != NULL) {
2042 KASSERT(dst_entry->cred == NULL,
2043 ("vm_fault_copy_entry: leaked swp charge"));
2044 dst_object->cred = curthread->td_ucred;
2045 crhold(dst_object->cred);
2046 *fork_charge += dst_object->charge;
2047 } else if ((dst_object->flags & OBJ_SWAP) != 0 &&
2048 dst_object->cred == NULL) {
2049 KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
2050 dst_entry));
2051 dst_object->cred = dst_entry->cred;
2052 dst_entry->cred = NULL;
2053 }
2054
2055 /*
2056 * Loop through all of the virtual pages within the entry's
2057 * range, copying each page from the source object to the
2058 * destination object. Since the source is wired, those pages
2059 * must exist. In contrast, the destination is pageable.
2060 * Since the destination object doesn't share any backing storage
2061 * with the source object, all of its pages must be dirtied,
2062 * regardless of whether they can be written.
2063 */
2064 for (vaddr = dst_entry->start, dst_pindex = 0;
2065 vaddr < dst_entry->end;
2066 vaddr += PAGE_SIZE, dst_pindex++) {
2067 again:
2068 /*
2069 * Find the page in the source object, and copy it in.
2070 * Because the source is wired down, the page will be
2071 * in memory.
2072 */
2073 if (src_object != dst_object)
2074 VM_OBJECT_RLOCK(src_object);
2075 object = src_object;
2076 pindex = src_pindex + dst_pindex;
2077 while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
2078 (backing_object = object->backing_object) != NULL) {
2079 /*
2080 * Unless the source mapping is read-only or
2081 * it is presently being upgraded from
2082 * read-only, the first object in the shadow
2083 * chain should provide all of the pages. In
2084 * other words, this loop body should never be
2085 * executed when the source mapping is already
2086 * read/write.
2087 */
2088 KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
2089 upgrade,
2090 ("vm_fault_copy_entry: main object missing page"));
2091
2092 VM_OBJECT_RLOCK(backing_object);
2093 pindex += OFF_TO_IDX(object->backing_object_offset);
2094 if (object != dst_object)
2095 VM_OBJECT_RUNLOCK(object);
2096 object = backing_object;
2097 }
2098 KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
2099
2100 if (object != dst_object) {
2101 /*
2102 * Allocate a page in the destination object.
2103 */
2104 dst_m = vm_page_alloc(dst_object, (src_object ==
2105 dst_object ? src_pindex : 0) + dst_pindex,
2106 VM_ALLOC_NORMAL);
2107 if (dst_m == NULL) {
2108 VM_OBJECT_WUNLOCK(dst_object);
2109 VM_OBJECT_RUNLOCK(object);
2110 vm_wait(dst_object);
2111 VM_OBJECT_WLOCK(dst_object);
2112 goto again;
2113 }
2114
2115 /*
2116 * See the comment in vm_fault_cow().
2117 */
2118 if (src_object == dst_object &&
2119 (object->flags & OBJ_ONEMAPPING) == 0)
2120 pmap_remove_all(src_m);
2121 pmap_copy_page(src_m, dst_m);
2122
2123 /*
2124 * The object lock does not guarantee that "src_m" will
2125 * transition from invalid to valid, but it does ensure
2126 * that "src_m" will not transition from valid to
2127 * invalid.
2128 */
2129 dst_m->dirty = dst_m->valid = src_m->valid;
2130 VM_OBJECT_RUNLOCK(object);
2131 } else {
2132 dst_m = src_m;
2133 if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0)
2134 goto again;
2135 if (dst_m->pindex >= dst_object->size) {
2136 /*
2137 * We are upgrading. The index can fall
2138 * out of bounds if the object type is
2139 * vnode and the file was truncated.
2140 */
2141 vm_page_xunbusy(dst_m);
2142 break;
2143 }
2144 }
2145
2146 /*
2147 * Enter it in the pmap. If a wired, copy-on-write
2148 * mapping is being replaced by a write-enabled
2149 * mapping, then wire that new mapping.
2150 *
2151 * The page can be invalid if the user called
2152 * msync(MS_INVALIDATE) or truncated the backing vnode
2153 * or shared memory object. In this case, do not
2154 * insert it into pmap, but still do the copy so that
2155 * all copies of the wired map entry have similar
2156 * backing pages.
2157 */
2158 if (vm_page_all_valid(dst_m)) {
2159 VM_OBJECT_WUNLOCK(dst_object);
2160 pmap_enter(dst_map->pmap, vaddr, dst_m, prot,
2161 access | (upgrade ? PMAP_ENTER_WIRED : 0), 0);
2162 VM_OBJECT_WLOCK(dst_object);
2163 }
2164
2165 /*
2166 * Mark it no longer busy, and put it on the active list.
2167 */
2168 if (upgrade) {
2169 if (src_m != dst_m) {
2170 vm_page_unwire(src_m, PQ_INACTIVE);
2171 vm_page_wire(dst_m);
2172 } else {
2173 KASSERT(vm_page_wired(dst_m),
2174 ("dst_m %p is not wired", dst_m));
2175 }
2176 } else {
2177 vm_page_activate(dst_m);
2178 }
2179 vm_page_xunbusy(dst_m);
2180 }
2181 VM_OBJECT_WUNLOCK(dst_object);
2182 if (upgrade) {
2183 dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
2184 vm_object_deallocate(src_object);
2185 }
2186 }
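/*
 * Editor's note (not part of the original source): the two modes
 * described in the header comment above correspond, roughly, to calls
 * of the following forms (the real call sites live in vm_map.c and are
 * not reproduced here):
 *
 *	fork of a wired entry into a new map:
 *		vm_fault_copy_entry(dst_map, src_map, dst_entry,
 *		    src_entry, fork_charge);
 *
 *	in-place COW upgrade of a wired-down entry (src == dst):
 *		vm_fault_copy_entry(map, map, entry, entry, NULL);
 */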
2187
2188 /*
2189 * Block entry into the machine-independent layer's page fault handler by
2190 * the calling thread. Subsequent calls to vm_fault() by that thread will
2191 * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of
2192 * spurious page faults.
2193 */
2194 int
2195 vm_fault_disable_pagefaults(void)
2196 {
2197
2198 return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR));
2199 }
2200
2201 void
2202 vm_fault_enable_pagefaults(int save)
2203 {
2204
2205 curthread_pflags_restore(save);
2206 }
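/*
 * Editor's sketch (not part of the original source): the pair above is
 * intended to be used as save/restore brackets around code that must
 * not recurse into vm_fault(), for example:
 *
 *	save = vm_fault_disable_pagefaults();
 *	error = copyin(udaddr, kaddr, len);
 *	vm_fault_enable_pagefaults(save);
 *
 * which is essentially how the kernel's *_nofault copy routines are
 * constructed.
 */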