FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_page.c

    1 /*-
    2  * Copyright (c) 1991 Regents of the University of California.
    3  * All rights reserved.
    4  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * The Mach Operating System project at Carnegie-Mellon University.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice, this list of conditions and the following disclaimer.
   14  * 2. Redistributions in binary form must reproduce the above copyright
   15  *    notice, this list of conditions and the following disclaimer in the
   16  *    documentation and/or other materials provided with the distribution.
   17  * 4. Neither the name of the University nor the names of its contributors
   18  *    may be used to endorse or promote products derived from this software
   19  *    without specific prior written permission.
   20  *
   21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   31  * SUCH DAMAGE.
   32  *
   33  *      from: @(#)vm_page.c     7.4 (Berkeley) 5/7/91
   34  */
   35 
   36 /*-
   37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   38  * All rights reserved.
   39  *
   40  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
   41  *
   42  * Permission to use, copy, modify and distribute this software and
   43  * its documentation is hereby granted, provided that both the copyright
   44  * notice and this permission notice appear in all copies of the
   45  * software, derivative works or modified versions, and any portions
   46  * thereof, and that both notices appear in supporting documentation.
   47  *
   48  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   49  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   50  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   51  *
   52  * Carnegie Mellon requests users of this software to return to
   53  *
   54  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   55  *  School of Computer Science
   56  *  Carnegie Mellon University
   57  *  Pittsburgh PA 15213-3890
   58  *
   59  * any improvements or extensions that they make and grant Carnegie the
   60  * rights to redistribute these changes.
   61  */
   62 
   63 /*
   64  *                      GENERAL RULES ON VM_PAGE MANIPULATION
   65  *
    66  *      - the page queues lock (vm_page_queue_mtx) is required when adding
    67  *        or removing a page from a page queue (vm_page_queues[]), regardless
    68  *        of other mutexes or the busy state of a page.
   69  *
    70  *      - the object lock is required when associating a page with, or
    71  *        disassociating a page from, its object (the per-object splay
    72  *        tree and memq list), regardless of other mutexes or the busy
    73  *        state of a page.
    74  *
    75  *      - the page queues lock is required in order to modify the page
    76  *        flags (see vm_page_flag_set()), and the object lock is required
    77  *        in order to busy a page (see vm_page_busy()).
   78  *
    79  *      - The object lock is held when inserting or removing pages from
    80  *        an object (vm_page_insert() or vm_page_remove()); both routines
    81  *        assert it with VM_OBJECT_LOCK_ASSERT().
   82  *
    83  *      Generally speaking, you have to be aware of side effects when running
    84  *      vm_page ops.  A vm_page_lookup() requires the object lock and neither
    85  *      acquires nor releases it.  vm_page_free(), vm_page_cache(),
    86  *      vm_page_activate(), and a number of other routines generally require
    87  *      the page queues lock and leave it held on return.  Intermediate
    88  *      manipulation routines such as vm_page_flag_set() expect the page
    89  *      queues lock to be held on entry and leave it held on return.
   90  *
   91  *      pageq scanning can only occur with the pageq in question locked.
   92  *      We have a known bottleneck with the active queue, but the cache
   93  *      and free queues are actually arrays already. 
   94  */
   95 
   96 /*
   97  *      Resident memory management module.
   98  */
   99 
  100 #include <sys/cdefs.h>
  101 __FBSDID("$FreeBSD: releng/8.3/sys/vm/vm_page.c 227422 2011-11-10 17:04:33Z alc $");
  102 
  103 #include "opt_vm.h"
  104 
  105 #include <sys/param.h>
  106 #include <sys/systm.h>
  107 #include <sys/lock.h>
  108 #include <sys/kernel.h>
  109 #include <sys/limits.h>
  110 #include <sys/malloc.h>
  111 #include <sys/msgbuf.h>
  112 #include <sys/mutex.h>
  113 #include <sys/proc.h>
  114 #include <sys/sysctl.h>
  115 #include <sys/vmmeter.h>
  116 #include <sys/vnode.h>
  117 
  118 #include <vm/vm.h>
  119 #include <vm/vm_param.h>
  120 #include <vm/vm_kern.h>
  121 #include <vm/vm_object.h>
  122 #include <vm/vm_page.h>
  123 #include <vm/vm_pageout.h>
  124 #include <vm/vm_pager.h>
  125 #include <vm/vm_phys.h>
  126 #include <vm/vm_reserv.h>
  127 #include <vm/vm_extern.h>
  128 #include <vm/uma.h>
  129 #include <vm/uma_int.h>
  130 
  131 #include <machine/md_var.h>
  132 
  133 /*
   134  *      Associated with each page of user-allocatable memory is a
  135  *      page structure.
  136  */
  137 
  138 struct vpgqueues vm_page_queues[PQ_COUNT];
  139 struct mtx vm_page_queue_mtx;
  140 struct mtx vm_page_queue_free_mtx;
  141 
  142 vm_page_t vm_page_array = 0;
  143 int vm_page_array_size = 0;
  144 long first_page = 0;
  145 int vm_page_zero_count = 0;
  146 
  147 static int boot_pages = UMA_BOOT_PAGES;
  148 TUNABLE_INT("vm.boot_pages", &boot_pages);
  149 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
  150         "number of pages allocated for bootstrapping the VM system");
  151 
  152 static void vm_page_enqueue(int queue, vm_page_t m);
  153 
  154 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
  155 #if PAGE_SIZE == 32768
  156 #ifdef CTASSERT
  157 CTASSERT(sizeof(u_long) >= 8);
  158 #endif
  159 #endif
  160 
  161 /*
  162  *      vm_set_page_size:
  163  *
  164  *      Sets the page size, perhaps based upon the memory
  165  *      size.  Must be called before any use of page-size
  166  *      dependent functions.
  167  */
  168 void
  169 vm_set_page_size(void)
  170 {
  171         if (cnt.v_page_size == 0)
  172                 cnt.v_page_size = PAGE_SIZE;
  173         if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
  174                 panic("vm_set_page_size: page size not a power of two");
  175 }
  176 
  177 /*
  178  *      vm_page_blacklist_lookup:
  179  *
  180  *      See if a physical address in this page has been listed
  181  *      in the blacklist tunable.  Entries in the tunable are
  182  *      separated by spaces or commas.  If an invalid integer is
  183  *      encountered then the rest of the string is skipped.
  184  */
  185 static int
  186 vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
  187 {
  188         vm_paddr_t bad;
  189         char *cp, *pos;
  190 
  191         for (pos = list; *pos != '\0'; pos = cp) {
  192                 bad = strtoq(pos, &cp, 0);
  193                 if (*cp != '\0') {
  194                         if (*cp == ' ' || *cp == ',') {
  195                                 cp++;
  196                                 if (cp == pos)
  197                                         continue;
  198                         } else
  199                                 break;
  200                 }
  201                 if (pa == trunc_page(bad))
  202                         return (1);
  203         }
  204         return (0);
  205 }
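
/*
 * Illustrative example only (not from the original source): the list parsed
 * above comes from the "vm.blacklist" loader tunable, typically set in
 * loader.conf(5).  The addresses below are made up; entries may be separated
 * by spaces or commas and are compared after truncation to a page boundary:
 *
 *      vm.blacklist="0x7f9a3000,0x7f9a5000 0x12340000"
 */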
  206 
  207 /*
  208  *      vm_page_startup:
  209  *
  210  *      Initializes the resident memory module.
  211  *
   212  *      Allocates memory for the page cells (the vm_page array).
   213  *      Each page cell is initialized and handed to the physical memory
   214  *      allocator, which places it on the appropriate free list.
  215  */
  216 vm_offset_t
  217 vm_page_startup(vm_offset_t vaddr)
  218 {
  219         vm_offset_t mapped;
  220         vm_paddr_t page_range;
  221         vm_paddr_t new_end;
  222         int i;
  223         vm_paddr_t pa;
  224         vm_paddr_t last_pa;
  225         char *list;
  226 
  227         /* the biggest memory array is the second group of pages */
  228         vm_paddr_t end;
  229         vm_paddr_t biggestsize;
  230         vm_paddr_t low_water, high_water;
  231         int biggestone;
  232 
  233         biggestsize = 0;
  234         biggestone = 0;
  235         vaddr = round_page(vaddr);
  236 
  237         for (i = 0; phys_avail[i + 1]; i += 2) {
  238                 phys_avail[i] = round_page(phys_avail[i]);
  239                 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
  240         }
  241 
  242         low_water = phys_avail[0];
  243         high_water = phys_avail[1];
  244 
  245         for (i = 0; phys_avail[i + 1]; i += 2) {
  246                 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
  247 
  248                 if (size > biggestsize) {
  249                         biggestone = i;
  250                         biggestsize = size;
  251                 }
  252                 if (phys_avail[i] < low_water)
  253                         low_water = phys_avail[i];
  254                 if (phys_avail[i + 1] > high_water)
  255                         high_water = phys_avail[i + 1];
  256         }
  257 
  258 #ifdef XEN
  259         low_water = 0;
  260 #endif  
  261 
  262         end = phys_avail[biggestone+1];
  263 
  264         /*
  265          * Initialize the locks.
  266          */
  267         mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
  268             MTX_RECURSE);
  269         mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
  270             MTX_DEF);
  271 
  272         /*
  273          * Initialize the queue headers for the hold queue, the active queue,
  274          * and the inactive queue.
  275          */
  276         for (i = 0; i < PQ_COUNT; i++)
  277                 TAILQ_INIT(&vm_page_queues[i].pl);
  278         vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
  279         vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
  280         vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
  281 
  282         /*
  283          * Allocate memory for use when boot strapping the kernel memory
  284          * allocator.
  285          */
  286         new_end = end - (boot_pages * UMA_SLAB_SIZE);
  287         new_end = trunc_page(new_end);
  288         mapped = pmap_map(&vaddr, new_end, end,
  289             VM_PROT_READ | VM_PROT_WRITE);
  290         bzero((void *)mapped, end - new_end);
  291         uma_startup((void *)mapped, boot_pages);
  292 
  293 #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
  294     defined(__mips__)
  295         /*
   296          * Allocate a bitmap used to record which physical pages need
   297          * to be included in a minidump.
   298          *
   299          * The amd64 port needs this to indicate which direct map pages
   300          * need to be dumped, via calls to dump_add_page()/dump_drop_page().
   301          *
   302          * However, i386 still needs this workspace internally within the
   303          * minidump code.  In theory, the dump_add_page() calls are not
   304          * needed on i386, but they are kept should the sf_buf code use them.
  305          */
  306         last_pa = 0;
  307         for (i = 0; dump_avail[i + 1] != 0; i += 2)
  308                 if (dump_avail[i + 1] > last_pa)
  309                         last_pa = dump_avail[i + 1];
  310         page_range = last_pa / PAGE_SIZE;
  311         vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
  312         new_end -= vm_page_dump_size;
  313         vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
  314             new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
  315         bzero((void *)vm_page_dump, vm_page_dump_size);
  316 #endif
  317 #ifdef __amd64__
  318         /*
  319          * Request that the physical pages underlying the message buffer be
  320          * included in a crash dump.  Since the message buffer is accessed
  321          * through the direct map, they are not automatically included.
  322          */
  323         pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
  324         last_pa = pa + round_page(msgbufsize);
  325         while (pa < last_pa) {
  326                 dump_add_page(pa);
  327                 pa += PAGE_SIZE;
  328         }
  329 #endif
  330         /*
  331          * Compute the number of pages of memory that will be available for
  332          * use (taking into account the overhead of a page structure per
  333          * page).
  334          */
  335         first_page = low_water / PAGE_SIZE;
  336 #ifdef VM_PHYSSEG_SPARSE
  337         page_range = 0;
  338         for (i = 0; phys_avail[i + 1] != 0; i += 2)
  339                 page_range += atop(phys_avail[i + 1] - phys_avail[i]);
  340 #elif defined(VM_PHYSSEG_DENSE)
  341         page_range = high_water / PAGE_SIZE - first_page;
  342 #else
  343 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
  344 #endif
  345         end = new_end;
  346 
  347         /*
  348          * Reserve an unmapped guard page to trap access to vm_page_array[-1].
  349          */
  350         vaddr += PAGE_SIZE;
  351 
  352         /*
  353          * Initialize the mem entry structures now, and put them in the free
  354          * queue.
  355          */
  356         new_end = trunc_page(end - page_range * sizeof(struct vm_page));
  357         mapped = pmap_map(&vaddr, new_end, end,
  358             VM_PROT_READ | VM_PROT_WRITE);
  359         vm_page_array = (vm_page_t) mapped;
  360 #if VM_NRESERVLEVEL > 0
  361         /*
  362          * Allocate memory for the reservation management system's data
  363          * structures.
  364          */
  365         new_end = vm_reserv_startup(&vaddr, new_end, high_water);
  366 #endif
  367 #ifdef __amd64__
  368         /*
  369          * pmap_map on amd64 comes out of the direct-map, not kvm like i386,
  370          * so the pages must be tracked for a crashdump to include this data.
  371          * This includes the vm_page_array and the early UMA bootstrap pages.
  372          */
  373         for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
  374                 dump_add_page(pa);
  375 #endif  
  376         phys_avail[biggestone + 1] = new_end;
  377 
  378         /*
  379          * Clear all of the page structures
  380          */
  381         bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
  382         for (i = 0; i < page_range; i++)
  383                 vm_page_array[i].order = VM_NFREEORDER;
  384         vm_page_array_size = page_range;
  385 
  386         /*
  387          * Initialize the physical memory allocator.
  388          */
  389         vm_phys_init();
  390 
  391         /*
  392          * Add every available physical page that is not blacklisted to
  393          * the free lists.
  394          */
  395         cnt.v_page_count = 0;
  396         cnt.v_free_count = 0;
  397         list = getenv("vm.blacklist");
  398         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
  399                 pa = phys_avail[i];
  400                 last_pa = phys_avail[i + 1];
  401                 while (pa < last_pa) {
  402                         if (list != NULL &&
  403                             vm_page_blacklist_lookup(list, pa))
  404                                 printf("Skipping page with pa 0x%jx\n",
  405                                     (uintmax_t)pa);
  406                         else
  407                                 vm_phys_add_page(pa);
  408                         pa += PAGE_SIZE;
  409                 }
  410         }
  411         freeenv(list);
  412 #if VM_NRESERVLEVEL > 0
  413         /*
  414          * Initialize the reservation management system.
  415          */
  416         vm_reserv_init();
  417 #endif
  418         return (vaddr);
  419 }
  420 
  421 void
  422 vm_page_flag_set(vm_page_t m, unsigned short bits)
  423 {
  424 
  425         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
  426         m->flags |= bits;
  427 } 
  428 
  429 void
  430 vm_page_flag_clear(vm_page_t m, unsigned short bits)
  431 {
  432 
  433         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
  434         m->flags &= ~bits;
  435 }
  436 
  437 void
  438 vm_page_busy(vm_page_t m)
  439 {
  440 
  441         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  442         KASSERT((m->oflags & VPO_BUSY) == 0,
  443             ("vm_page_busy: page already busy!!!"));
  444         m->oflags |= VPO_BUSY;
  445 }
  446 
  447 /*
  448  *      vm_page_flash:
  449  *
  450  *      wakeup anyone waiting for the page.
  451  */
  452 void
  453 vm_page_flash(vm_page_t m)
  454 {
  455 
  456         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  457         if (m->oflags & VPO_WANTED) {
  458                 m->oflags &= ~VPO_WANTED;
  459                 wakeup(m);
  460         }
  461 }
  462 
  463 /*
  464  *      vm_page_wakeup:
  465  *
  466  *      clear the VPO_BUSY flag and wakeup anyone waiting for the
  467  *      page.
  468  *
  469  */
  470 void
  471 vm_page_wakeup(vm_page_t m)
  472 {
  473 
  474         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  475         KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!"));
  476         m->oflags &= ~VPO_BUSY;
  477         vm_page_flash(m);
  478 }
  479 
  480 void
  481 vm_page_io_start(vm_page_t m)
  482 {
  483 
  484         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  485         m->busy++;
  486 }
  487 
  488 void
  489 vm_page_io_finish(vm_page_t m)
  490 {
  491 
  492         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  493         m->busy--;
  494         if (m->busy == 0)
  495                 vm_page_flash(m);
  496 }
  497 
  498 /*
   499  * Keep a page from being freed by the page daemon.
   500  * This has much the same effect as wiring, but with much lower
   501  * overhead, and it should be used only for *very* temporary
   502  * holding ("wiring").
  503  */
  504 void
  505 vm_page_hold(vm_page_t mem)
  506 {
  507 
  508         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
  509         mem->hold_count++;
  510 }
  511 
  512 void
  513 vm_page_unhold(vm_page_t mem)
  514 {
  515 
  516         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
  517         --mem->hold_count;
  518         KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
  519         if (mem->hold_count == 0 && VM_PAGE_INQUEUE2(mem, PQ_HOLD))
  520                 vm_page_free_toq(mem);
  521 }
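
/*
 * Illustrative sketch, not from the original source: the hold count is
 * typically bumped around a short-lived access to a page.  Both routines
 * above assert the page queues lock, so a caller looks roughly like the
 * following ("m" is assumed to be a valid, resident vm_page_t):
 */
#if 0
	vm_page_lock_queues();
	vm_page_hold(m);
	vm_page_unlock_queues();

	/* ... briefly use the page without fear of it being freed ... */

	vm_page_lock_queues();
	vm_page_unhold(m);
	vm_page_unlock_queues();
#endif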
  522 
  523 /*
  524  *      vm_page_free:
  525  *
  526  *      Free a page.
  527  */
  528 void
  529 vm_page_free(vm_page_t m)
  530 {
  531 
  532         m->flags &= ~PG_ZERO;
  533         vm_page_free_toq(m);
  534 }
  535 
  536 /*
  537  *      vm_page_free_zero:
  538  *
   539  *      Free a page to the zeroed-pages queue
  540  */
  541 void
  542 vm_page_free_zero(vm_page_t m)
  543 {
  544 
  545         m->flags |= PG_ZERO;
  546         vm_page_free_toq(m);
  547 }
  548 
  549 /*
  550  *      vm_page_sleep:
  551  *
  552  *      Sleep and release the page queues lock.
  553  *
  554  *      The object containing the given page must be locked.
  555  */
  556 void
  557 vm_page_sleep(vm_page_t m, const char *msg)
  558 {
  559 
  560         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  561         if (!mtx_owned(&vm_page_queue_mtx))
  562                 vm_page_lock_queues();
  563         vm_page_flag_set(m, PG_REFERENCED);
  564         vm_page_unlock_queues();
  565 
  566         /*
  567          * It's possible that while we sleep, the page will get
  568          * unbusied and freed.  If we are holding the object
  569          * lock, we will assume we hold a reference to the object
  570          * such that even if m->object changes, we can re-lock
  571          * it.
  572          */
  573         m->oflags |= VPO_WANTED;
  574         msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
  575 }
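
/*
 * Illustrative sketch, not from the original source: a caller that must
 * wait for a busy page typically loops, re-doing the lookup after each
 * sleep because the page may have been freed while it slept.  "object",
 * "pindex", and the wait message are assumed/illustrative.
 */
#if 0
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_lookup(object, pindex)) != NULL &&
	    (m->oflags & VPO_BUSY) != 0)
		vm_page_sleep(m, "pgwait");	/* reacquires the object lock */
	if (m != NULL)
		vm_page_busy(m);
	VM_OBJECT_UNLOCK(object);
#endif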
  576 
  577 /*
  578  *      vm_page_dirty:
  579  *
  580  *      make page all dirty
  581  */
  582 void
  583 vm_page_dirty(vm_page_t m)
  584 {
  585 
  586         KASSERT((m->flags & PG_CACHED) == 0,
  587             ("vm_page_dirty: page in cache!"));
  588         KASSERT(!VM_PAGE_IS_FREE(m),
  589             ("vm_page_dirty: page is free!"));
  590         KASSERT(m->valid == VM_PAGE_BITS_ALL,
  591             ("vm_page_dirty: page is invalid!"));
  592         m->dirty = VM_PAGE_BITS_ALL;
  593 }
  594 
  595 /*
  596  *      vm_page_splay:
  597  *
  598  *      Implements Sleator and Tarjan's top-down splay algorithm.  Returns
  599  *      the vm_page containing the given pindex.  If, however, that
  600  *      pindex is not found in the vm_object, returns a vm_page that is
  601  *      adjacent to the pindex, coming before or after it.
  602  */
  603 vm_page_t
  604 vm_page_splay(vm_pindex_t pindex, vm_page_t root)
  605 {
  606         struct vm_page dummy;
  607         vm_page_t lefttreemax, righttreemin, y;
  608 
  609         if (root == NULL)
  610                 return (root);
  611         lefttreemax = righttreemin = &dummy;
  612         for (;; root = y) {
  613                 if (pindex < root->pindex) {
  614                         if ((y = root->left) == NULL)
  615                                 break;
  616                         if (pindex < y->pindex) {
  617                                 /* Rotate right. */
  618                                 root->left = y->right;
  619                                 y->right = root;
  620                                 root = y;
  621                                 if ((y = root->left) == NULL)
  622                                         break;
  623                         }
  624                         /* Link into the new root's right tree. */
  625                         righttreemin->left = root;
  626                         righttreemin = root;
  627                 } else if (pindex > root->pindex) {
  628                         if ((y = root->right) == NULL)
  629                                 break;
  630                         if (pindex > y->pindex) {
  631                                 /* Rotate left. */
  632                                 root->right = y->left;
  633                                 y->left = root;
  634                                 root = y;
  635                                 if ((y = root->right) == NULL)
  636                                         break;
  637                         }
  638                         /* Link into the new root's left tree. */
  639                         lefttreemax->right = root;
  640                         lefttreemax = root;
  641                 } else
  642                         break;
  643         }
  644         /* Assemble the new root. */
  645         lefttreemax->right = root->left;
  646         righttreemin->left = root->right;
  647         root->left = dummy.right;
  648         root->right = dummy.left;
  649         return (root);
  650 }
  651 
  652 /*
  653  *      vm_page_insert:         [ internal use only ]
  654  *
  655  *      Inserts the given mem entry into the object and object list.
  656  *
   657  *      The pagetables are not updated: the page will presumably be faulted
   658  *      in when necessary or, for a kernel page, the caller will at some
   659  *      point enter the page into the kernel's pmap.  We are not allowed to
   660  *      block here, so we could not update them anyway.
  661  *
  662  *      The object and page must be locked.
  663  *      This routine may not block.
  664  */
  665 void
  666 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
  667 {
  668         vm_page_t root;
  669 
  670         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  671         if (m->object != NULL)
  672                 panic("vm_page_insert: page already inserted");
  673 
  674         /*
  675          * Record the object/offset pair in this page
  676          */
  677         m->object = object;
  678         m->pindex = pindex;
  679 
  680         /*
  681          * Now link into the object's ordered list of backed pages.
  682          */
  683         root = object->root;
  684         if (root == NULL) {
  685                 m->left = NULL;
  686                 m->right = NULL;
  687                 TAILQ_INSERT_TAIL(&object->memq, m, listq);
  688         } else {
  689                 root = vm_page_splay(pindex, root);
  690                 if (pindex < root->pindex) {
  691                         m->left = root->left;
  692                         m->right = root;
  693                         root->left = NULL;
  694                         TAILQ_INSERT_BEFORE(root, m, listq);
  695                 } else if (pindex == root->pindex)
  696                         panic("vm_page_insert: offset already allocated");
  697                 else {
  698                         m->right = root->right;
  699                         m->left = root;
  700                         root->right = NULL;
  701                         TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
  702                 }
  703         }
  704         object->root = m;
  705 
  706         /*
  707          * show that the object has one more resident page.
  708          */
  709         object->resident_page_count++;
  710         /*
  711          * Hold the vnode until the last page is released.
  712          */
  713         if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
  714                 vhold((struct vnode *)object->handle);
  715 
  716         /*
  717          * Since we are inserting a new and possibly dirty page,
  718          * update the object's OBJ_MIGHTBEDIRTY flag.
  719          */
  720         if (m->flags & PG_WRITEABLE)
  721                 vm_object_set_writeable_dirty(object);
  722 }
  723 
  724 /*
  725  *      vm_page_remove:
  726  *                              NOTE: used by device pager as well -wfj
  727  *
  728  *      Removes the given mem entry from the object/offset-page
   729  *      table and the object page list, but does not invalidate/terminate
  730  *      the backing store.
  731  *
  732  *      The object and page must be locked.
  733  *      The underlying pmap entry (if any) is NOT removed here.
  734  *      This routine may not block.
  735  */
  736 void
  737 vm_page_remove(vm_page_t m)
  738 {
  739         vm_object_t object;
  740         vm_page_t next, prev, root;
  741 
  742         if ((object = m->object) == NULL)
  743                 return;
  744         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  745         if (m->oflags & VPO_BUSY) {
  746                 m->oflags &= ~VPO_BUSY;
  747                 vm_page_flash(m);
  748         }
  749         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
  750 
  751         /*
  752          * Now remove from the object's list of backed pages.
  753          */
  754         if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
  755                 /*
  756                  * Since the page's successor in the list is also its parent
  757                  * in the tree, its right subtree must be empty.
  758                  */
  759                 next->left = m->left;
  760                 KASSERT(m->right == NULL,
  761                     ("vm_page_remove: page %p has right child", m));
  762         } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
  763             prev->right == m) {
  764                 /*
  765                  * Since the page's predecessor in the list is also its parent
  766                  * in the tree, its left subtree must be empty.
  767                  */
  768                 KASSERT(m->left == NULL,
  769                     ("vm_page_remove: page %p has left child", m));
  770                 prev->right = m->right;
  771         } else {
  772                 if (m != object->root)
  773                         vm_page_splay(m->pindex, object->root);
  774                 if (m->left == NULL)
  775                         root = m->right;
  776                 else if (m->right == NULL)
  777                         root = m->left;
  778                 else {
  779                         /*
  780                          * Move the page's successor to the root, because
  781                          * pages are usually removed in ascending order.
  782                          */
  783                         if (m->right != next)
  784                                 vm_page_splay(m->pindex, m->right);
  785                         next->left = m->left;
  786                         root = next;
  787                 }
  788                 object->root = root;
  789         }
  790         TAILQ_REMOVE(&object->memq, m, listq);
  791 
  792         /*
  793          * And show that the object has one fewer resident page.
  794          */
  795         object->resident_page_count--;
  796         /*
  797          * The vnode may now be recycled.
  798          */
  799         if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
  800                 vdrop((struct vnode *)object->handle);
  801 
  802         m->object = NULL;
  803 }
  804 
  805 /*
  806  *      vm_page_lookup:
  807  *
  808  *      Returns the page associated with the object/offset
  809  *      pair specified; if none is found, NULL is returned.
  810  *
  811  *      The object must be locked.
  812  *      This routine may not block.
  813  *      This is a critical path routine
  814  */
  815 vm_page_t
  816 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
  817 {
  818         vm_page_t m;
  819 
  820         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  821         if ((m = object->root) != NULL && m->pindex != pindex) {
  822                 m = vm_page_splay(pindex, m);
  823                 if ((object->root = m)->pindex != pindex)
  824                         m = NULL;
  825         }
  826         return (m);
  827 }
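
/*
 * Illustrative sketch, not from the original source: the usual lock
 * ordering for a caller of vm_page_lookup() that goes on to touch the
 * page's queue state is object lock first, then the page queues lock.
 * "object" and "pindex" are assumed to be supplied by the caller.
 */
#if 0
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	if ((m = vm_page_lookup(object, pindex)) != NULL) {
		vm_page_lock_queues();
		vm_page_activate(m);	/* requires the page queues lock */
		vm_page_unlock_queues();
	}
	VM_OBJECT_UNLOCK(object);
#endif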
  828 
  829 /*
  830  *      vm_page_find_least:
  831  *
  832  *      Returns the page associated with the object with least pindex
  833  *      greater than or equal to the parameter pindex, or NULL.
  834  *
  835  *      The object must be locked.
  836  *      The routine may not block.
  837  */
  838 vm_page_t
  839 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
  840 {
  841         vm_page_t m;
  842 
  843         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
  844         if ((m = TAILQ_FIRST(&object->memq)) != NULL) {
  845                 if (m->pindex < pindex) {
  846                         m = vm_page_splay(pindex, object->root);
  847                         if ((object->root = m)->pindex < pindex)
  848                                 m = TAILQ_NEXT(m, listq);
  849                 }
  850         }
  851         return (m);
  852 }
  853 
  854 /*
  855  * Returns the given page's successor (by pindex) within the object if it is
  856  * resident; if none is found, NULL is returned.
  857  *
  858  * The object must be locked.
  859  */
  860 vm_page_t
  861 vm_page_next(vm_page_t m)
  862 {
  863         vm_page_t next;
  864 
  865         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  866         if ((next = TAILQ_NEXT(m, listq)) != NULL &&
  867             next->pindex != m->pindex + 1)
  868                 next = NULL;
  869         return (next);
  870 }
  871 
  872 /*
  873  * Returns the given page's predecessor (by pindex) within the object if it is
  874  * resident; if none is found, NULL is returned.
  875  *
  876  * The object must be locked.
  877  */
  878 vm_page_t
  879 vm_page_prev(vm_page_t m)
  880 {
  881         vm_page_t prev;
  882 
  883         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
  884         if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
  885             prev->pindex != m->pindex - 1)
  886                 prev = NULL;
  887         return (prev);
  888 }
  889 
  890 /*
  891  *      vm_page_rename:
  892  *
  893  *      Move the given memory entry from its
  894  *      current object to the specified target object/offset.
  895  *
  896  *      The object must be locked.
  897  *      This routine may not block.
  898  *
  899  *      Note: swap associated with the page must be invalidated by the move.  We
  900  *            have to do this for several reasons:  (1) we aren't freeing the
  901  *            page, (2) we are dirtying the page, (3) the VM system is probably
  902  *            moving the page from object A to B, and will then later move
  903  *            the backing store from A to B and we can't have a conflict.
  904  *
  905  *      Note: we *always* dirty the page.  It is necessary both for the
  906  *            fact that we moved it, and because we may be invalidating
  907  *            swap.  If the page is on the cache, we have to deactivate it
  908  *            or vm_page_dirty() will panic.  Dirty pages are not allowed
  909  *            on the cache.
  910  */
  911 void
  912 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
  913 {
  914 
  915         vm_page_remove(m);
  916         vm_page_insert(m, new_object, new_pindex);
  917         vm_page_dirty(m);
  918 }
  919 
  920 /*
  921  *      Convert all of the given object's cached pages that have a
  922  *      pindex within the given range into free pages.  If the value
  923  *      zero is given for "end", then the range's upper bound is
  924  *      infinity.  If the given object is backed by a vnode and it
  925  *      transitions from having one or more cached pages to none, the
  926  *      vnode's hold count is reduced. 
  927  */
  928 void
  929 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
  930 {
  931         vm_page_t m, m_next;
  932         boolean_t empty;
  933 
  934         mtx_lock(&vm_page_queue_free_mtx);
  935         if (__predict_false(object->cache == NULL)) {
  936                 mtx_unlock(&vm_page_queue_free_mtx);
  937                 return;
  938         }
  939         m = object->cache = vm_page_splay(start, object->cache);
  940         if (m->pindex < start) {
  941                 if (m->right == NULL)
  942                         m = NULL;
  943                 else {
  944                         m_next = vm_page_splay(start, m->right);
  945                         m_next->left = m;
  946                         m->right = NULL;
  947                         m = object->cache = m_next;
  948                 }
  949         }
  950 
  951         /*
  952          * At this point, "m" is either (1) a reference to the page
  953          * with the least pindex that is greater than or equal to
  954          * "start" or (2) NULL.
  955          */
  956         for (; m != NULL && (m->pindex < end || end == 0); m = m_next) {
  957                 /*
  958                  * Find "m"'s successor and remove "m" from the
  959                  * object's cache.
  960                  */
  961                 if (m->right == NULL) {
  962                         object->cache = m->left;
  963                         m_next = NULL;
  964                 } else {
  965                         m_next = vm_page_splay(start, m->right);
  966                         m_next->left = m->left;
  967                         object->cache = m_next;
  968                 }
  969                 /* Convert "m" to a free page. */
  970                 m->object = NULL;
  971                 m->valid = 0;
  972                 /* Clear PG_CACHED and set PG_FREE. */
  973                 m->flags ^= PG_CACHED | PG_FREE;
  974                 KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
  975                     ("vm_page_cache_free: page %p has inconsistent flags", m));
  976                 cnt.v_cache_count--;
  977                 cnt.v_free_count++;
  978         }
  979         empty = object->cache == NULL;
  980         mtx_unlock(&vm_page_queue_free_mtx);
  981         if (object->type == OBJT_VNODE && empty)
  982                 vdrop(object->handle);
  983 }
  984 
  985 /*
  986  *      Returns the cached page that is associated with the given
  987  *      object and offset.  If, however, none exists, returns NULL.
  988  *
  989  *      The free page queue must be locked.
  990  */
  991 static inline vm_page_t
  992 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
  993 {
  994         vm_page_t m;
  995 
  996         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
  997         if ((m = object->cache) != NULL && m->pindex != pindex) {
  998                 m = vm_page_splay(pindex, m);
  999                 if ((object->cache = m)->pindex != pindex)
 1000                         m = NULL;
 1001         }
 1002         return (m);
 1003 }
 1004 
 1005 /*
 1006  *      Remove the given cached page from its containing object's
 1007  *      collection of cached pages.
 1008  *
 1009  *      The free page queue must be locked.
 1010  */
 1011 void
 1012 vm_page_cache_remove(vm_page_t m)
 1013 {
 1014         vm_object_t object;
 1015         vm_page_t root;
 1016 
 1017         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 1018         KASSERT((m->flags & PG_CACHED) != 0,
 1019             ("vm_page_cache_remove: page %p is not cached", m));
 1020         object = m->object;
 1021         if (m != object->cache) {
 1022                 root = vm_page_splay(m->pindex, object->cache);
 1023                 KASSERT(root == m,
 1024                     ("vm_page_cache_remove: page %p is not cached in object %p",
 1025                     m, object));
 1026         }
 1027         if (m->left == NULL)
 1028                 root = m->right;
 1029         else if (m->right == NULL)
 1030                 root = m->left;
 1031         else {
 1032                 root = vm_page_splay(m->pindex, m->left);
 1033                 root->right = m->right;
 1034         }
 1035         object->cache = root;
 1036         m->object = NULL;
 1037         cnt.v_cache_count--;
 1038 }
 1039 
 1040 /*
 1041  *      Transfer all of the cached pages with offset greater than or
 1042  *      equal to 'offidxstart' from the original object's cache to the
 1043  *      new object's cache.  However, any cached pages with offset
 1044  *      greater than or equal to the new object's size are kept in the
 1045  *      original object.  Initially, the new object's cache must be
 1046  *      empty.  Offset 'offidxstart' in the original object must
 1047  *      correspond to offset zero in the new object.
 1048  *
 1049  *      The new object must be locked.
 1050  */
 1051 void
 1052 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
 1053     vm_object_t new_object)
 1054 {
 1055         vm_page_t m, m_next;
 1056 
 1057         /*
 1058          * Insertion into an object's collection of cached pages
 1059          * requires the object to be locked.  In contrast, removal does
 1060          * not.
 1061          */
 1062         VM_OBJECT_LOCK_ASSERT(new_object, MA_OWNED);
 1063         KASSERT(new_object->cache == NULL,
 1064             ("vm_page_cache_transfer: object %p has cached pages",
 1065             new_object));
 1066         mtx_lock(&vm_page_queue_free_mtx);
 1067         if ((m = orig_object->cache) != NULL) {
 1068                 /*
 1069                  * Transfer all of the pages with offset greater than or
 1070                  * equal to 'offidxstart' from the original object's
 1071                  * cache to the new object's cache.
 1072                  */
 1073                 m = vm_page_splay(offidxstart, m);
 1074                 if (m->pindex < offidxstart) {
 1075                         orig_object->cache = m;
 1076                         new_object->cache = m->right;
 1077                         m->right = NULL;
 1078                 } else {
 1079                         orig_object->cache = m->left;
 1080                         new_object->cache = m;
 1081                         m->left = NULL;
 1082                 }
 1083                 while ((m = new_object->cache) != NULL) {
 1084                         if ((m->pindex - offidxstart) >= new_object->size) {
 1085                                 /*
 1086                                  * Return all of the cached pages with
 1087                                  * offset greater than or equal to the
 1088                                  * new object's size to the original
 1089                                  * object's cache. 
 1090                                  */
 1091                                 new_object->cache = m->left;
 1092                                 m->left = orig_object->cache;
 1093                                 orig_object->cache = m;
 1094                                 break;
 1095                         }
 1096                         m_next = vm_page_splay(m->pindex, m->right);
 1097                         /* Update the page's object and offset. */
 1098                         m->object = new_object;
 1099                         m->pindex -= offidxstart;
 1100                         if (m_next == NULL)
 1101                                 break;
 1102                         m->right = NULL;
 1103                         m_next->left = m;
 1104                         new_object->cache = m_next;
 1105                 }
 1106                 KASSERT(new_object->cache == NULL ||
 1107                     new_object->type == OBJT_SWAP,
 1108                     ("vm_page_cache_transfer: object %p's type is incompatible"
 1109                     " with cached pages", new_object));
 1110         }
 1111         mtx_unlock(&vm_page_queue_free_mtx);
 1112 }
 1113 
 1114 /*
 1115  *      vm_page_alloc:
 1116  *
 1117  *      Allocate and return a memory cell associated
 1118  *      with this VM object/offset pair.
 1119  *
 1120  *      The caller must always specify an allocation class.
 1121  *
 1122  *      allocation classes:
 1123  *      VM_ALLOC_NORMAL         normal process request
 1124  *      VM_ALLOC_SYSTEM         system *really* needs a page
 1125  *      VM_ALLOC_INTERRUPT      interrupt time request
 1126  *
 1127  *      optional allocation flags:
 1128  *      VM_ALLOC_ZERO           prefer a zeroed page
 1129  *      VM_ALLOC_WIRED          wire the allocated page
 1130  *      VM_ALLOC_NOOBJ          page is not associated with a vm object
 1131  *      VM_ALLOC_NOBUSY         do not set the page busy
 1132  *      VM_ALLOC_IFCACHED       return page only if it is cached
 1133  *      VM_ALLOC_IFNOTCACHED    return NULL, do not reactivate if the page
 1134  *                              is cached
 1135  *
 1136  *      This routine may not sleep.
 1137  */
 1138 vm_page_t
 1139 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 1140 {
 1141         struct vnode *vp = NULL;
 1142         vm_object_t m_object;
 1143         vm_page_t m;
 1144         int flags, page_req;
 1145 
 1146         if ((req & VM_ALLOC_NOOBJ) == 0) {
 1147                 KASSERT(object != NULL,
 1148                     ("vm_page_alloc: NULL object."));
 1149                 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1150         }
 1151 
 1152         page_req = req & VM_ALLOC_CLASS_MASK;
 1153 
 1154         /*
 1155          * The pager is allowed to eat deeper into the free page list.
 1156          */
 1157         if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT))
 1158                 page_req = VM_ALLOC_SYSTEM;
 1159 
 1160         mtx_lock(&vm_page_queue_free_mtx);
 1161         if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
 1162             (page_req == VM_ALLOC_SYSTEM && 
 1163             cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
 1164             (page_req == VM_ALLOC_INTERRUPT &&
 1165             cnt.v_free_count + cnt.v_cache_count > 0)) {
 1166                 /*
 1167                  * Allocate from the free queue if the number of free pages
 1168                  * exceeds the minimum for the request class.
 1169                  */
 1170                 if (object != NULL &&
 1171                     (m = vm_page_cache_lookup(object, pindex)) != NULL) {
 1172                         if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
 1173                                 mtx_unlock(&vm_page_queue_free_mtx);
 1174                                 return (NULL);
 1175                         }
 1176                         if (vm_phys_unfree_page(m))
 1177                                 vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
 1178 #if VM_NRESERVLEVEL > 0
 1179                         else if (!vm_reserv_reactivate_page(m))
 1180 #else
 1181                         else
 1182 #endif
 1183                                 panic("vm_page_alloc: cache page %p is missing"
 1184                                     " from the free queue", m);
 1185                 } else if ((req & VM_ALLOC_IFCACHED) != 0) {
 1186                         mtx_unlock(&vm_page_queue_free_mtx);
 1187                         return (NULL);
 1188 #if VM_NRESERVLEVEL > 0
 1189                 } else if (object == NULL || object->type == OBJT_DEVICE ||
 1190                     object->type == OBJT_SG ||
 1191                     (object->flags & OBJ_COLORED) == 0 ||
 1192                     (m = vm_reserv_alloc_page(object, pindex)) == NULL) {
 1193 #else
 1194                 } else {
 1195 #endif
 1196                         m = vm_phys_alloc_pages(object != NULL ?
 1197                             VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 1198 #if VM_NRESERVLEVEL > 0
 1199                         if (m == NULL && vm_reserv_reclaim_inactive()) {
 1200                                 m = vm_phys_alloc_pages(object != NULL ?
 1201                                     VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
 1202                                     0);
 1203                         }
 1204 #endif
 1205                 }
 1206         } else {
 1207                 /*
 1208                  * Not allocatable, give up.
 1209                  */
 1210                 mtx_unlock(&vm_page_queue_free_mtx);
 1211                 atomic_add_int(&vm_pageout_deficit, 1);
 1212                 pagedaemon_wakeup();
 1213                 return (NULL);
 1214         }
 1215 
 1216         /*
 1217          *  At this point we had better have found a good page.
 1218          */
 1219 
 1220         KASSERT(m != NULL, ("vm_page_alloc: missing page"));
 1221         KASSERT(m->queue == PQ_NONE,
 1222             ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
 1223         KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
 1224         KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
 1225         KASSERT(m->busy == 0, ("vm_page_alloc: page %p is busy", m));
 1226         KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
 1227         KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 1228             ("vm_page_alloc: page %p has unexpected memattr %d", m,
 1229             pmap_page_get_memattr(m)));
 1230         if ((m->flags & PG_CACHED) != 0) {
 1231                 KASSERT(m->valid != 0,
 1232                     ("vm_page_alloc: cached page %p is invalid", m));
 1233                 if (m->object == object && m->pindex == pindex)
 1234                         cnt.v_reactivated++;
 1235                 else
 1236                         m->valid = 0;
 1237                 m_object = m->object;
 1238                 vm_page_cache_remove(m);
 1239                 if (m_object->type == OBJT_VNODE && m_object->cache == NULL)
 1240                         vp = m_object->handle;
 1241         } else {
 1242                 KASSERT(VM_PAGE_IS_FREE(m),
 1243                     ("vm_page_alloc: page %p is not free", m));
 1244                 KASSERT(m->valid == 0,
 1245                     ("vm_page_alloc: free page %p is valid", m));
 1246                 cnt.v_free_count--;
 1247         }
 1248 
 1249         /*
 1250          * Initialize structure.  Only the PG_ZERO flag is inherited.
 1251          */
 1252         flags = 0;
 1253         if (m->flags & PG_ZERO) {
 1254                 vm_page_zero_count--;
 1255                 if (req & VM_ALLOC_ZERO)
 1256                         flags = PG_ZERO;
 1257         }
 1258         if (object == NULL || object->type == OBJT_PHYS)
 1259                 flags |= PG_UNMANAGED;
 1260         m->flags = flags;
 1261         if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
 1262                 m->oflags = 0;
 1263         else
 1264                 m->oflags = VPO_BUSY;
 1265         if (req & VM_ALLOC_WIRED) {
 1266                 atomic_add_int(&cnt.v_wire_count, 1);
 1267                 m->wire_count = 1;
 1268         }
 1269         m->act_count = 0;
 1270         mtx_unlock(&vm_page_queue_free_mtx);
 1271 
 1272         if (object != NULL) {
 1273                 /* Ignore device objects; the pager sets "memattr" for them. */
 1274                 if (object->memattr != VM_MEMATTR_DEFAULT &&
 1275                     object->type != OBJT_DEVICE && object->type != OBJT_SG)
 1276                         pmap_page_set_memattr(m, object->memattr);
 1277                 vm_page_insert(m, object, pindex);
 1278         } else
 1279                 m->pindex = pindex;
 1280 
 1281         /*
 1282          * The following call to vdrop() must come after the above call
 1283          * to vm_page_insert() in case both affect the same object and
 1284          * vnode.  Otherwise, the affected vnode's hold count could
 1285          * temporarily become zero.
 1286          */
 1287         if (vp != NULL)
 1288                 vdrop(vp);
 1289 
 1290         /*
 1291          * Don't wakeup too often - wakeup the pageout daemon when
 1292          * we would be nearly out of memory.
 1293          */
 1294         if (vm_paging_needed())
 1295                 pagedaemon_wakeup();
 1296 
 1297         return (m);
 1298 }
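
/*
 * Illustrative sketch, not from the original source: the common retry
 * idiom used by callers of vm_page_alloc().  On failure the object lock
 * is dropped and the thread waits for the page daemon via the VM_WAIT
 * macro (see vm_wait() below).  "object" and "pindex" are assumed to be
 * supplied by the caller, with "object" already locked.
 */
#if 0
	vm_page_t m;

	while ((m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
		VM_OBJECT_UNLOCK(object);
		VM_WAIT;
		VM_OBJECT_LOCK(object);
	}
	if ((m->flags & PG_ZERO) == 0)	/* VM_ALLOC_ZERO is only a preference */
		pmap_zero_page(m);
	/* The page is returned VPO_BUSY; vm_page_wakeup() it when done. */
#endif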
 1299 
 1300 /*
 1301  * Initialize a page that has been freshly dequeued from a freelist.
 1302  * The caller has to drop the vnode returned, if it is not NULL.
 1303  *
 1304  * To be called with vm_page_queue_free_mtx held.
 1305  */
 1306 struct vnode *
 1307 vm_page_alloc_init(vm_page_t m)
 1308 {
 1309         struct vnode *drop;
 1310         vm_object_t m_object;
 1311 
 1312         KASSERT(m->queue == PQ_NONE,
 1313             ("vm_page_alloc_init: page %p has unexpected queue %d",
 1314             m, m->queue));
 1315         KASSERT(m->wire_count == 0,
 1316             ("vm_page_alloc_init: page %p is wired", m));
 1317         KASSERT(m->hold_count == 0,
 1318             ("vm_page_alloc_init: page %p is held", m));
 1319         KASSERT(m->busy == 0,
 1320             ("vm_page_alloc_init: page %p is busy", m));
 1321         KASSERT(m->dirty == 0,
 1322             ("vm_page_alloc_init: page %p is dirty", m));
 1323         KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 1324             ("vm_page_alloc_init: page %p has unexpected memattr %d",
 1325             m, pmap_page_get_memattr(m)));
 1326         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 1327         drop = NULL;
 1328         if ((m->flags & PG_CACHED) != 0) {
 1329                 m->valid = 0;
 1330                 m_object = m->object;
 1331                 vm_page_cache_remove(m);
 1332                 if (m_object->type == OBJT_VNODE &&
 1333                     m_object->cache == NULL)
 1334                         drop = m_object->handle;
 1335         } else {
 1336                 KASSERT(VM_PAGE_IS_FREE(m),
 1337                     ("vm_page_alloc_init: page %p is not free", m));
 1338                 KASSERT(m->valid == 0,
 1339                     ("vm_page_alloc_init: free page %p is valid", m));
 1340                 cnt.v_free_count--;
 1341         }
 1342         if (m->flags & PG_ZERO)
 1343                 vm_page_zero_count--;
 1344         /* Don't clear the PG_ZERO flag; we'll need it later. */
 1345         m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
 1346         m->oflags = 0;
 1347         /* Unmanaged pages don't use "act_count". */
 1348         return (drop);
 1349 }
 1350 
 1351 /*
 1352  *      vm_page_alloc_freelist:
 1353  * 
 1354  *      Allocate a page from the specified freelist.
  1355  *      Only the ALLOC_CLASS values in req are honored; other request flags
 1356  *      are ignored.
 1357  */
 1358 vm_page_t
 1359 vm_page_alloc_freelist(int flind, int req)
 1360 {
 1361         struct vnode *drop;
 1362         vm_page_t m;
 1363         int page_req;
 1364 
 1365         m = NULL;
 1366         page_req = req & VM_ALLOC_CLASS_MASK;
 1367         mtx_lock(&vm_page_queue_free_mtx);
 1368         /*
 1369          * Do not allocate reserved pages unless the req has asked for it.
 1370          */
 1371         if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
 1372             (page_req == VM_ALLOC_SYSTEM && 
 1373             cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
 1374             (page_req == VM_ALLOC_INTERRUPT &&
 1375             cnt.v_free_count + cnt.v_cache_count > 0)) {
 1376                 m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
 1377         }
 1378         if (m == NULL) {
 1379                 mtx_unlock(&vm_page_queue_free_mtx);
 1380                 return (NULL);
 1381         }
 1382         drop = vm_page_alloc_init(m);
 1383         mtx_unlock(&vm_page_queue_free_mtx);
 1384         if (drop)
 1385                 vdrop(drop);
 1386         return (m);
 1387 }
 1388 
 1389 /*
 1390  *      vm_wait:        (also see VM_WAIT macro)
 1391  *
 1392  *      Block until free pages are available for allocation
 1393  *      - Called in various places before memory allocations.
 1394  */
 1395 void
 1396 vm_wait(void)
 1397 {
 1398 
 1399         mtx_lock(&vm_page_queue_free_mtx);
 1400         if (curproc == pageproc) {
 1401                 vm_pageout_pages_needed = 1;
 1402                 msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 1403                     PDROP | PSWP, "VMWait", 0);
 1404         } else {
 1405                 if (!vm_pages_needed) {
 1406                         vm_pages_needed = 1;
 1407                         wakeup(&vm_pages_needed);
 1408                 }
 1409                 msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 1410                     "vmwait", 0);
 1411         }
 1412 }
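
/*
 * Illustrative sketch (editorial, not part of vm_page.c): the common
 * pattern around vm_wait()/VM_WAIT -- drop the object lock, sleep until
 * the pageout daemon frees memory, and retry the allocation.  "object"
 * and "pindex" are placeholders.
 */
#if 0
        vm_page_t m;

        while ((m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL)) == NULL) {
                VM_OBJECT_UNLOCK(object);
                VM_WAIT;                /* sleeps on cnt.v_free_count */
                VM_OBJECT_LOCK(object);
        }
#endif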
 1413 
 1414 /*
 1415  *      vm_waitpfault:  (also see VM_WAITPFAULT macro)
 1416  *
 1417  *      Block until free pages are available for allocation
 1418  *      - Called only in vm_fault so that processes page faulting
 1419  *        can be easily tracked.
 1420  *      - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
 1421  *        processes will be able to grab memory first.  Do not change
 1422  *        this balance without careful testing first.
 1423  */
 1424 void
 1425 vm_waitpfault(void)
 1426 {
 1427 
 1428         mtx_lock(&vm_page_queue_free_mtx);
 1429         if (!vm_pages_needed) {
 1430                 vm_pages_needed = 1;
 1431                 wakeup(&vm_pages_needed);
 1432         }
 1433         msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
 1434             "pfault", 0);
 1435 }
 1436 
 1437 /*
 1438  *      vm_page_requeue:
 1439  *
 1440  *      If the given page is contained within a page queue, move it to the tail
 1441  *      of that queue.
 1442  *
 1443  *      The page queues must be locked.
 1444  */
 1445 void
 1446 vm_page_requeue(vm_page_t m)
 1447 {
 1448         int queue = VM_PAGE_GETQUEUE(m);
 1449         struct vpgqueues *vpq;
 1450 
 1451         if (queue != PQ_NONE) {
 1452                 vpq = &vm_page_queues[queue];
 1453                 TAILQ_REMOVE(&vpq->pl, m, pageq);
 1454                 TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
 1455         }
 1456 }
 1457 
 1458 /*
 1459  *      vm_pageq_remove:
 1460  *
 1461  *      Remove a page from its queue.
 1462  *
 1463  *      The queue containing the given page must be locked.
 1464  *      This routine may not block.
 1465  */
 1466 void
 1467 vm_pageq_remove(vm_page_t m)
 1468 {
 1469         int queue = VM_PAGE_GETQUEUE(m);
 1470         struct vpgqueues *pq;
 1471 
 1472         if (queue != PQ_NONE) {
 1473                 VM_PAGE_SETQUEUE2(m, PQ_NONE);
 1474                 pq = &vm_page_queues[queue];
 1475                 TAILQ_REMOVE(&pq->pl, m, pageq);
 1476                 (*pq->cnt)--;
 1477         }
 1478 }
 1479 
 1480 /*
 1481  *      vm_page_enqueue:
 1482  *
 1483  *      Add the given page to the specified queue.
 1484  *
 1485  *      The page queues must be locked.
 1486  */
 1487 static void
 1488 vm_page_enqueue(int queue, vm_page_t m)
 1489 {
 1490         struct vpgqueues *vpq;
 1491 
 1492         vpq = &vm_page_queues[queue];
 1493         VM_PAGE_SETQUEUE2(m, queue);
 1494         TAILQ_INSERT_TAIL(&vpq->pl, m, pageq);
 1495         ++*vpq->cnt;
 1496 }
 1497 
 1498 /*
 1499  *      vm_page_activate:
 1500  *
 1501  *      Put the specified page on the active list (if appropriate).
 1502  *      Ensure that act_count is at least ACT_INIT but do not otherwise
 1503  *      mess with it.
 1504  *
 1505  *      The page queues must be locked.
 1506  *      This routine may not block.
 1507  */
 1508 void
 1509 vm_page_activate(vm_page_t m)
 1510 {
 1511 
 1512         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1513         if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
 1514                 vm_pageq_remove(m);
 1515                 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
 1516                         if (m->act_count < ACT_INIT)
 1517                                 m->act_count = ACT_INIT;
 1518                         vm_page_enqueue(PQ_ACTIVE, m);
 1519                 }
 1520         } else {
 1521                 if (m->act_count < ACT_INIT)
 1522                         m->act_count = ACT_INIT;
 1523         }
 1524 }
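
/*
 * Illustrative sketch (editorial, not part of vm_page.c): activating a
 * page after it has been referenced.  The page queues lock is required,
 * as asserted above.
 */
#if 0
        vm_page_lock_queues();
        vm_page_activate(m);    /* move to PQ_ACTIVE, act_count >= ACT_INIT */
        vm_page_unlock_queues();
#endif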
 1525 
 1526 /*
 1527  *      vm_page_free_wakeup:
 1528  *
 1529  *      Helper routine for vm_page_free_toq() and vm_page_cache().  This
 1530  *      routine is called when a page has been added to the cache or free
 1531  *      queues.
 1532  *
 1533  *      The page queues must be locked.
 1534  *      This routine may not block.
 1535  */
 1536 static inline void
 1537 vm_page_free_wakeup(void)
 1538 {
 1539 
 1540         mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 1541         /*
 1542          * If the pageout daemon needs pages, then tell it that there are
 1543          * some free.
 1544          */
 1545         if (vm_pageout_pages_needed &&
 1546             cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
 1547                 wakeup(&vm_pageout_pages_needed);
 1548                 vm_pageout_pages_needed = 0;
 1549         }
 1550         /*
 1551          * Wake up processes that are waiting on memory if we hit a
 1552          * high water mark, and wake up the scheduler process if we have
 1553          * lots of memory; that process will swap processes back in.
 1554          */
 1555         if (vm_pages_needed && !vm_page_count_min()) {
 1556                 vm_pages_needed = 0;
 1557                 wakeup(&cnt.v_free_count);
 1558         }
 1559 }
 1560 
 1561 /*
 1562  *      vm_page_free_toq:
 1563  *
 1564  *      Returns the given page to the free list,
 1565  *      disassociating it from any VM object.
 1566  *
 1567  *      Object and page must be locked prior to entry.
 1568  *      This routine may not block.
 1569  */
 1570 
 1571 void
 1572 vm_page_free_toq(vm_page_t m)
 1573 {
 1574 
 1575         if (VM_PAGE_GETQUEUE(m) != PQ_NONE)
 1576                 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1577         KASSERT(!pmap_page_is_mapped(m),
 1578             ("vm_page_free_toq: freeing mapped page %p", m));
 1579         PCPU_INC(cnt.v_tfree);
 1580 
 1581         if (m->busy || VM_PAGE_IS_FREE(m)) {
 1582                 printf(
 1583                 "vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n",
 1584                     (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0,
 1585                     m->hold_count);
 1586                 if (VM_PAGE_IS_FREE(m))
 1587                         panic("vm_page_free: freeing free page");
 1588                 else
 1589                         panic("vm_page_free: freeing busy page");
 1590         }
 1591 
 1592         /*
 1593          * unqueue, then remove page.  Note that we cannot destroy
 1594          * the page here because we do not want to call the pager's
 1595          * callback routine until after we've put the page on the
 1596          * appropriate free queue.
 1597          */
 1598         vm_pageq_remove(m);
 1599         vm_page_remove(m);
 1600 
 1601         /*
 1602          * If the page is fictitious, remove the object association and
 1603          * return; otherwise, delay removal of the object association.
 1604          */
 1605         if ((m->flags & PG_FICTITIOUS) != 0) {
 1606                 return;
 1607         }
 1608 
 1609         m->valid = 0;
 1610         vm_page_undirty(m);
 1611 
 1612         if (m->wire_count != 0) {
 1613                 if (m->wire_count > 1) {
 1614                         panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
 1615                                 m->wire_count, (long)m->pindex);
 1616                 }
 1617                 panic("vm_page_free: freeing wired page");
 1618         }
 1619         if (m->hold_count != 0) {
 1620                 m->flags &= ~PG_ZERO;
 1621                 vm_page_enqueue(PQ_HOLD, m);
 1622         } else {
 1623                 /*
 1624                  * Restore the default memory attribute to the page.
 1625                  */
 1626                 if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 1627                         pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 1628 
 1629                 /*
 1630                  * Insert the page into the physical memory allocator's
 1631                  * cache/free page queues.
 1632                  */
 1633                 mtx_lock(&vm_page_queue_free_mtx);
 1634                 m->flags |= PG_FREE;
 1635                 cnt.v_free_count++;
 1636 #if VM_NRESERVLEVEL > 0
 1637                 if (!vm_reserv_free_page(m))
 1638 #else
 1639                 if (TRUE)
 1640 #endif
 1641                         vm_phys_free_pages(m, 0);
 1642                 if ((m->flags & PG_ZERO) != 0)
 1643                         ++vm_page_zero_count;
 1644                 else
 1645                         vm_page_zero_idle_wakeup();
 1646                 vm_page_free_wakeup();
 1647                 mtx_unlock(&vm_page_queue_free_mtx);
 1648         }
 1649 }
 1650 
 1651 /*
 1652  *      vm_page_wire:
 1653  *
 1654  *      Mark this page as wired down by yet
 1655  *      another map, removing it from paging queues
 1656  *      as necessary.
 1657  *
 1658  *      The page queues must be locked.
 1659  *      This routine may not block.
 1660  */
 1661 void
 1662 vm_page_wire(vm_page_t m)
 1663 {
 1664 
 1665         /*
 1666          * Only bump the wire statistics if the page is not already wired,
 1667          * and only unqueue the page if it is on some queue (if it is unmanaged
 1668          * it is already off the queues).
 1669          */
 1670         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1671         if (m->flags & PG_FICTITIOUS)
 1672                 return;
 1673         if (m->wire_count == 0) {
 1674                 if ((m->flags & PG_UNMANAGED) == 0)
 1675                         vm_pageq_remove(m);
 1676                 atomic_add_int(&cnt.v_wire_count, 1);
 1677         }
 1678         m->wire_count++;
 1679         KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
 1680 }
 1681 
 1682 /*
 1683  *      vm_page_unwire:
 1684  *
 1685  *      Release one wiring of this page, potentially
 1686  *      enabling it to be paged again.
 1687  *
 1688  *      Many pages placed on the inactive queue should actually go
 1689  *      into the cache, but it is difficult to figure out which.  What
 1690  *      we do instead, if the inactive target is well met, is to put
 1691  *      clean pages at the head of the inactive queue instead of the tail.
 1692  *      This will cause them to be moved to the cache more quickly and
 1693  *      if not actively re-referenced, freed more quickly.  If we just
 1694  *      stick these pages at the end of the inactive queue, heavy filesystem
 1695  *      meta-data accesses can cause an unnecessary paging load on memory bound 
 1696  *      processes.  This optimization causes one-time-use metadata to be
 1697  *      reused more quickly.
 1698  *
 1699  *      BUT, if we are in a low-memory situation we have no choice but to
 1700  *      put clean pages on the cache queue.
 1701  *
 1702  *      A number of routines use vm_page_unwire() to guarantee that the page
 1703  *      will go into either the inactive or active queues, and will NEVER
 1704  *      be placed in the cache - for example, just after dirtying a page.
 1705  *      Dirty pages in the cache are not allowed.
 1706  *
 1707  *      The page queues must be locked.
 1708  *      This routine may not block.
 1709  */
 1710 void
 1711 vm_page_unwire(vm_page_t m, int activate)
 1712 {
 1713 
 1714         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1715         if (m->flags & PG_FICTITIOUS)
 1716                 return;
 1717         if (m->wire_count > 0) {
 1718                 m->wire_count--;
 1719                 if (m->wire_count == 0) {
 1720                         atomic_subtract_int(&cnt.v_wire_count, 1);
 1721                         if (m->flags & PG_UNMANAGED) {
 1722                                 ;
 1723                         } else if (activate)
 1724                                 vm_page_enqueue(PQ_ACTIVE, m);
 1725                         else {
 1726                                 vm_page_flag_clear(m, PG_WINATCFLS);
 1727                                 vm_page_enqueue(PQ_INACTIVE, m);
 1728                         }
 1729                 }
 1730         } else {
 1731                 panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
 1732         }
 1733 }
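
/*
 * Illustrative sketch (editorial, not part of vm_page.c): the usual
 * wire/unwire pairing for code that must keep a page resident across an
 * operation.  Passing 1 to vm_page_unwire() requeues the page on the
 * active queue; 0 places it on the inactive queue as described above.
 */
#if 0
        vm_page_lock_queues();
        vm_page_wire(m);                /* keep the page resident */
        vm_page_unlock_queues();

        /* ... operate on the page ... */

        vm_page_lock_queues();
        vm_page_unwire(m, 0);           /* allow it to be paged out again */
        vm_page_unlock_queues();
#endif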
 1734 
 1735 
 1736 /*
 1737  * Move the specified page to the inactive queue.  If the page has
 1738  * any associated swap, the swap is deallocated.
 1739  *
 1740  * Normally athead is 0 resulting in LRU operation.  athead is set
 1741  * to 1 if we want this page to be 'as if it were placed in the cache',
 1742  * except without unmapping it from the process address space.
 1743  *
 1744  * This routine may not block.
 1745  */
 1746 static inline void
 1747 _vm_page_deactivate(vm_page_t m, int athead)
 1748 {
 1749 
 1750         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1751 
 1752         /*
 1753          * Ignore if already inactive.
 1754          */
 1755         if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
 1756                 return;
 1757         if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
 1758                 vm_page_flag_clear(m, PG_WINATCFLS);
 1759                 vm_pageq_remove(m);
 1760                 if (athead)
 1761                         TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
 1762                 else
 1763                         TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
 1764                 VM_PAGE_SETQUEUE2(m, PQ_INACTIVE);
 1765                 cnt.v_inactive_count++;
 1766         }
 1767 }
 1768 
 1769 void
 1770 vm_page_deactivate(vm_page_t m)
 1771 {
 1772         _vm_page_deactivate(m, 0);
 1773 }
 1774 
 1775 /*
 1776  * vm_page_try_to_cache:
 1777  *
 1778  * Returns 0 on failure, 1 on success
 1779  */
 1780 int
 1781 vm_page_try_to_cache(vm_page_t m)
 1782 {
 1783 
 1784         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1785         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 1786         if (m->dirty || m->hold_count || m->busy || m->wire_count ||
 1787             (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
 1788                 return (0);
 1789         }
 1790         pmap_remove_all(m);
 1791         if (m->dirty)
 1792                 return (0);
 1793         vm_page_cache(m);
 1794         return (1);
 1795 }
 1796 
 1797 /*
 1798  * vm_page_try_to_free()
 1799  *
 1800  *      Attempt to free the page.  If we cannot free it, we do nothing.
 1801  *      1 is returned on success, 0 on failure.
 1802  */
 1803 int
 1804 vm_page_try_to_free(vm_page_t m)
 1805 {
 1806 
 1807         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1808         if (m->object != NULL)
 1809                 VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 1810         if (m->dirty || m->hold_count || m->busy || m->wire_count ||
 1811             (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) {
 1812                 return (0);
 1813         }
 1814         pmap_remove_all(m);
 1815         if (m->dirty)
 1816                 return (0);
 1817         vm_page_free(m);
 1818         return (1);
 1819 }
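
/*
 * Illustrative sketch (editorial, not part of vm_page.c): opportunistic
 * freeing of a page that may still be mapped.  Both locks asserted by
 * vm_page_try_to_free() are taken here; a zero return means the page was
 * dirty, busy, wired, or held and was left untouched.
 */
#if 0
        VM_OBJECT_LOCK(object);
        vm_page_lock_queues();
        if (vm_page_try_to_free(m) == 0) {
                /* Could not free it; fall back to some other strategy. */
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(object);
#endif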
 1820 
 1821 /*
 1822  * vm_page_cache
 1823  *
 1824  * Put the specified page onto the page cache queue (if appropriate).
 1825  *
 1826  * This routine may not block.
 1827  */
 1828 void
 1829 vm_page_cache(vm_page_t m)
 1830 {
 1831         vm_object_t object;
 1832         vm_page_t next, prev, root;
 1833 
 1834         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1835         object = m->object;
 1836         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 1837         if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy ||
 1838             m->hold_count || m->wire_count) {
 1839                 panic("vm_page_cache: attempting to cache busy page");
 1840         }
 1841         pmap_remove_all(m);
 1842         if (m->dirty != 0)
 1843                 panic("vm_page_cache: page %p is dirty", m);
 1844         if (m->valid == 0 || object->type == OBJT_DEFAULT ||
 1845             (object->type == OBJT_SWAP &&
 1846             !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
 1847                 /*
 1848                  * Hypothesis: A cache-eligible page belonging to a
 1849                  * default object or swap object but without a backing
 1850                  * store must be zero filled.
 1851                  */
 1852                 vm_page_free(m);
 1853                 return;
 1854         }
 1855         KASSERT((m->flags & PG_CACHED) == 0,
 1856             ("vm_page_cache: page %p is already cached", m));
 1857         cnt.v_tcached++;
 1858 
 1859         /*
 1860          * Remove the page from the paging queues.
 1861          */
 1862         vm_pageq_remove(m);
 1863 
 1864         /*
 1865          * Remove the page from the object's collection of resident
 1866          * pages. 
 1867          */
 1868         if ((next = TAILQ_NEXT(m, listq)) != NULL && next->left == m) {
 1869                 /*
 1870                  * Since the page's successor in the list is also its parent
 1871                  * in the tree, its right subtree must be empty.
 1872                  */
 1873                 next->left = m->left;
 1874                 KASSERT(m->right == NULL,
 1875                     ("vm_page_cache: page %p has right child", m));
 1876         } else if ((prev = TAILQ_PREV(m, pglist, listq)) != NULL &&
 1877             prev->right == m) {
 1878                 /*
 1879                  * Since the page's predecessor in the list is also its parent
 1880                  * in the tree, its left subtree must be empty.
 1881                  */
 1882                 KASSERT(m->left == NULL,
 1883                     ("vm_page_cache: page %p has left child", m));
 1884                 prev->right = m->right;
 1885         } else {
 1886                 if (m != object->root)
 1887                         vm_page_splay(m->pindex, object->root);
 1888                 if (m->left == NULL)
 1889                         root = m->right;
 1890                 else if (m->right == NULL)
 1891                         root = m->left;
 1892                 else {
 1893                         /*
 1894                          * Move the page's successor to the root, because
 1895                          * pages are usually removed in ascending order.
 1896                          */
 1897                         if (m->right != next)
 1898                                 vm_page_splay(m->pindex, m->right);
 1899                         next->left = m->left;
 1900                         root = next;
 1901                 }
 1902                 object->root = root;
 1903         }
 1904         TAILQ_REMOVE(&object->memq, m, listq);
 1905         object->resident_page_count--;
 1906 
 1907         /*
 1908          * Restore the default memory attribute to the page.
 1909          */
 1910         if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 1911                 pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 1912 
 1913         /*
 1914          * Insert the page into the object's collection of cached pages
 1915          * and the physical memory allocator's cache/free page queues.
 1916          */
 1917         vm_page_flag_clear(m, PG_ZERO);
 1918         mtx_lock(&vm_page_queue_free_mtx);
 1919         m->flags |= PG_CACHED;
 1920         cnt.v_cache_count++;
 1921         root = object->cache;
 1922         if (root == NULL) {
 1923                 m->left = NULL;
 1924                 m->right = NULL;
 1925         } else {
 1926                 root = vm_page_splay(m->pindex, root);
 1927                 if (m->pindex < root->pindex) {
 1928                         m->left = root->left;
 1929                         m->right = root;
 1930                         root->left = NULL;
 1931                 } else if (__predict_false(m->pindex == root->pindex))
 1932                         panic("vm_page_cache: offset already cached");
 1933                 else {
 1934                         m->right = root->right;
 1935                         m->left = root;
 1936                         root->right = NULL;
 1937                 }
 1938         }
 1939         object->cache = m;
 1940 #if VM_NRESERVLEVEL > 0
 1941         if (!vm_reserv_free_page(m)) {
 1942 #else
 1943         if (TRUE) {
 1944 #endif
 1945                 vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
 1946                 vm_phys_free_pages(m, 0);
 1947         }
 1948         vm_page_free_wakeup();
 1949         mtx_unlock(&vm_page_queue_free_mtx);
 1950 
 1951         /*
 1952          * Increment the vnode's hold count if this is the object's only
 1953          * cached page.  Decrement the vnode's hold count if this was
 1954          * the object's only resident page.
 1955          */
 1956         if (object->type == OBJT_VNODE) {
 1957                 if (root == NULL && object->resident_page_count != 0)
 1958                         vhold(object->handle);
 1959                 else if (root != NULL && object->resident_page_count == 0)
 1960                         vdrop(object->handle);
 1961         }
 1962 }
 1963 
 1964 /*
 1965  * vm_page_dontneed
 1966  *
 1967  *      Cache, deactivate, or do nothing as appropriate.  This routine
 1968  *      is typically used by madvise() MADV_DONTNEED.
 1969  *
 1970  *      Generally speaking we want to move the page into the cache so
 1971  *      it gets reused quickly.  However, this can result in a silly syndrome
 1972  *      due to the page recycling too quickly.  Small objects will not be
 1973  *      fully cached.  On the other hand, if we move the page to the inactive
 1974  *      queue we wind up with a problem whereby very large objects 
 1975  *      unnecessarily blow away our inactive and cache queues.
 1976  *
 1977  *      The solution is to move the pages based on a fixed weighting.  We
 1978  *      either leave them alone, deactivate them, or move them to the cache,
 1979  *      where moving them to the cache has the highest weighting.
 1980  *      By forcing some pages into other queues we eventually force the
 1981  *      system to balance the queues, potentially recovering other unrelated
 1982  *      space from active.  The idea is to not force this to happen too
 1983  *      often.
 1984  */
 1985 void
 1986 vm_page_dontneed(vm_page_t m)
 1987 {
 1988         static int dnweight;
 1989         int dnw;
 1990         int head;
 1991 
 1992         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 1993         dnw = ++dnweight;
 1994 
 1995         /*
 1996          * Occasionally leave the page alone.
 1997          */
 1998         if ((dnw & 0x01F0) == 0 ||
 1999             VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) {
 2000                 if (m->act_count >= ACT_INIT)
 2001                         --m->act_count;
 2002                 return;
 2003         }
 2004 
 2005         /*
 2006          * Clear any references to the page.  Otherwise, the page daemon will
 2007          * immediately reactivate the page.
 2008          */
 2009         vm_page_flag_clear(m, PG_REFERENCED);
 2010         pmap_clear_reference(m);
 2011 
 2012         if (m->dirty == 0 && pmap_is_modified(m))
 2013                 vm_page_dirty(m);
 2014 
 2015         if (m->dirty || (dnw & 0x0070) == 0) {
 2016                 /*
 2017                  * Deactivate the page 3 times out of 32.
 2018                  */
 2019                 head = 0;
 2020         } else {
 2021                 /*
 2022                  * Cache the page 28 times out of every 32.  Note that
 2023                  * the page is deactivated instead of cached, but placed
 2024                  * at the head of the queue instead of the tail.
 2025                  */
 2026                 head = 1;
 2027         }
 2028         _vm_page_deactivate(m, head);
 2029 }
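
/*
 * Editorial note on the weighting above: dnweight is a free-running
 * counter.  (dnw & 0x01F0) == 0 holds for 16 of every 512 values (1/32),
 * so about one call in 32 leaves the page alone.  (dnw & 0x0070) == 0
 * holds for 16 of every 128 values (4/32); excluding the 1/32 already
 * handled, roughly 3 calls in 32 deactivate at the tail, and the
 * remaining ~28 in 32 deactivate at the head of the inactive queue.
 */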
 2030 
 2031 /*
 2032  * Grab a page, waiting until we are woken up due to the page
 2033  * changing state.  We keep on waiting if the page continues
 2034  * to be in the object.  If the page doesn't exist, first allocate it
 2035  * and then conditionally zero it.
 2036  *
 2037  * This routine may block.
 2038  */
 2039 vm_page_t
 2040 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 2041 {
 2042         vm_page_t m;
 2043 
 2044         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 2045 retrylookup:
 2046         if ((m = vm_page_lookup(object, pindex)) != NULL) {
 2047                 if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) {
 2048                         if ((allocflags & VM_ALLOC_RETRY) == 0)
 2049                                 return (NULL);
 2050                         goto retrylookup;
 2051                 } else {
 2052                         if ((allocflags & VM_ALLOC_WIRED) != 0) {
 2053                                 vm_page_lock_queues();
 2054                                 vm_page_wire(m);
 2055                                 vm_page_unlock_queues();
 2056                         }
 2057                         if ((allocflags & VM_ALLOC_NOBUSY) == 0)
 2058                                 vm_page_busy(m);
 2059                         return (m);
 2060                 }
 2061         }
 2062         m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
 2063         if (m == NULL) {
 2064                 VM_OBJECT_UNLOCK(object);
 2065                 VM_WAIT;
 2066                 VM_OBJECT_LOCK(object);
 2067                 if ((allocflags & VM_ALLOC_RETRY) == 0)
 2068                         return (NULL);
 2069                 goto retrylookup;
 2070         } else if (m->valid != 0)
 2071                 return (m);
 2072         if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 2073                 pmap_zero_page(m);
 2074         return (m);
 2075 }
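
/*
 * Illustrative sketch (editorial, not part of vm_page.c): grabbing a
 * page that is busied on return and zero-filled if newly allocated.
 * The flag combination is one common choice; "object" and "pindex" are
 * placeholders, and the busy state must be cleared by the caller when
 * it is done with the page.
 */
#if 0
        vm_page_t m;

        VM_OBJECT_LOCK(object);
        m = vm_page_grab(object, pindex,
            VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
        /* With VM_ALLOC_RETRY set, vm_page_grab() never returns NULL. */
        /* ... fill or map the page, then clear its busy state ... */
        VM_OBJECT_UNLOCK(object);
#endif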
 2076 
 2077 /*
 2078  * Mapping function for valid bits or for dirty bits in
 2079  * a page.  May not block.
 2080  *
 2081  * Inputs are required to range within a page.
 2082  */
 2083 int
 2084 vm_page_bits(int base, int size)
 2085 {
 2086         int first_bit;
 2087         int last_bit;
 2088 
 2089         KASSERT(
 2090             base + size <= PAGE_SIZE,
 2091             ("vm_page_bits: illegal base/size %d/%d", base, size)
 2092         );
 2093 
 2094         if (size == 0)          /* handle degenerate case */
 2095                 return (0);
 2096 
 2097         first_bit = base >> DEV_BSHIFT;
 2098         last_bit = (base + size - 1) >> DEV_BSHIFT;
 2099 
 2100         return ((2 << last_bit) - (1 << first_bit));
 2101 }
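
/*
 * Worked example (editorial, assuming DEV_BSIZE is 512 and PAGE_SIZE is
 * 4096, i.e. eight chunks per page): for base = 512 and size = 1024,
 * first_bit = 1 and last_bit = 2, so the result is
 * (2 << 2) - (1 << 1) = 8 - 2 = 0x06, i.e. bits 1 and 2 -- the second
 * and third 512-byte chunks of the page.
 */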
 2102 
 2103 /*
 2104  *      vm_page_set_valid:
 2105  *
 2106  *      Sets portions of a page valid.  The arguments are expected
 2107  *      to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 2108  *      of any partial chunks touched by the range.  The invalid portion of
 2109  *      such chunks will be zeroed.
 2110  *
 2111  *      (base + size) must be less than or equal to PAGE_SIZE.
 2112  */
 2113 void
 2114 vm_page_set_valid(vm_page_t m, int base, int size)
 2115 {
 2116         int endoff, frag;
 2117 
 2118         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 2119         if (size == 0)  /* handle degenerate case */
 2120                 return;
 2121 
 2122         /*
 2123          * If the base is not DEV_BSIZE aligned and the valid
 2124          * bit is clear, we have to zero out a portion of the
 2125          * first block.
 2126          */
 2127         if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
 2128             (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 2129                 pmap_zero_page_area(m, frag, base - frag);
 2130 
 2131         /*
 2132          * If the ending offset is not DEV_BSIZE aligned and the 
 2133          * valid bit is clear, we have to zero out a portion of
 2134          * the last block.
 2135          */
 2136         endoff = base + size;
 2137         if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
 2138             (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 2139                 pmap_zero_page_area(m, endoff,
 2140                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 2141 
 2142         /*
 2143          * Assert that no previously invalid block that is now being validated
 2144          * is already dirty. 
 2145          */
 2146         KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 2147             ("vm_page_set_valid: page %p is dirty", m)); 
 2148 
 2149         /*
 2150          * Set valid bits inclusive of any overlap.
 2151          */
 2152         m->valid |= vm_page_bits(base, size);
 2153 }
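
/*
 * Illustrative sketch (editorial, not part of vm_page.c): marking the
 * first 1 KB of a page valid after filling it, assuming DEV_BSIZE is
 * 512 so that the range covers exactly two chunks.
 */
#if 0
        VM_OBJECT_LOCK(m->object);
        vm_page_set_valid(m, 0, 1024);
        /* m->valid now includes vm_page_bits(0, 1024) == 0x03. */
        VM_OBJECT_UNLOCK(m->object);
#endif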
 2154 
 2155 /*
 2156  *      vm_page_set_validclean:
 2157  *
 2158  *      Sets portions of a page valid and clean.  The arguments are expected
 2159  *      to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
 2160  *      of any partial chunks touched by the range.  The invalid portion of
 2161  *      such chunks will be zeroed.
 2162  *
 2163  *      This routine may not block.
 2164  *
 2165  *      (base + size) must be less than or equal to PAGE_SIZE.
 2166  */
 2167 void
 2168 vm_page_set_validclean(vm_page_t m, int base, int size)
 2169 {
 2170         int pagebits;
 2171         int frag;
 2172         int endoff;
 2173 
 2174         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2175         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 2176         if (size == 0)  /* handle degenerate case */
 2177                 return;
 2178 
 2179         /*
 2180          * If the base is not DEV_BSIZE aligned and the valid
 2181          * bit is clear, we have to zero out a portion of the
 2182          * first block.
 2183          */
 2184         if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
 2185             (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 2186                 pmap_zero_page_area(m, frag, base - frag);
 2187 
 2188         /*
 2189          * If the ending offset is not DEV_BSIZE aligned and the 
 2190          * valid bit is clear, we have to zero out a portion of
 2191          * the last block.
 2192          */
 2193         endoff = base + size;
 2194         if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
 2195             (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 2196                 pmap_zero_page_area(m, endoff,
 2197                     DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 2198 
 2199         /*
 2200          * Set valid, clear dirty bits.  If validating the entire
 2201          * page we can safely clear the pmap modify bit.  We also
 2202          * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 2203          * takes a write fault on a MAP_NOSYNC memory area the flag will
 2204          * be set again.
 2205          *
 2206          * We set valid bits inclusive of any overlap, but we can only
 2207          * clear dirty bits for DEV_BSIZE chunks that are fully within
 2208          * the range.
 2209          */
 2210         pagebits = vm_page_bits(base, size);
 2211         m->valid |= pagebits;
 2212 #if 0   /* NOT YET */
 2213         if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 2214                 frag = DEV_BSIZE - frag;
 2215                 base += frag;
 2216                 size -= frag;
 2217                 if (size < 0)
 2218                         size = 0;
 2219         }
 2220         pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 2221 #endif
 2222         m->dirty &= ~pagebits;
 2223         if (base == 0 && size == PAGE_SIZE) {
 2224                 pmap_clear_modify(m);
 2225                 m->oflags &= ~VPO_NOSYNC;
 2226         }
 2227 }
 2228 
 2229 void
 2230 vm_page_clear_dirty(vm_page_t m, int base, int size)
 2231 {
 2232 
 2233         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2234         m->dirty &= ~vm_page_bits(base, size);
 2235 }
 2236 
 2237 /*
 2238  *      vm_page_set_invalid:
 2239  *
 2240  *      Invalidates DEV_BSIZE'd chunks within a page.  Both the
 2241  *      valid and dirty bits for the affected areas are cleared.
 2242  *
 2243  *      May not block.
 2244  */
 2245 void
 2246 vm_page_set_invalid(vm_page_t m, int base, int size)
 2247 {
 2248         int bits;
 2249 
 2250         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 2251         bits = vm_page_bits(base, size);
 2252         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2253         if (m->valid == VM_PAGE_BITS_ALL && bits != 0)
 2254                 pmap_remove_all(m);
 2255         m->valid &= ~bits;
 2256         m->dirty &= ~bits;
 2257 }
 2258 
 2259 /*
 2260  * vm_page_zero_invalid()
 2261  *
 2262  *      The kernel assumes that the invalid portions of a page contain 
 2263  *      garbage, but such pages can be mapped into memory by user code.
 2264  *      When this occurs, we must zero out the non-valid portions of the
 2265  *      page so user code sees what it expects.
 2266  *
 2267  *      Pages are most often semi-valid when the end of a file is mapped 
 2268  *      into memory and the file's size is not page aligned.
 2269  */
 2270 void
 2271 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 2272 {
 2273         int b;
 2274         int i;
 2275 
 2276         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 2277         /*
 2278          * Scan the valid bits looking for invalid sections that
 2279          * must be zeroed.  Invalid sub-DEV_BSIZE'd areas (where the
 2280          * valid bit may be set) have already been zeroed by
 2281          * vm_page_set_validclean().
 2282          */
 2283         for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
 2284                 if (i == (PAGE_SIZE / DEV_BSIZE) || 
 2285                     (m->valid & (1 << i))
 2286                 ) {
 2287                         if (i > b) {
 2288                                 pmap_zero_page_area(m, 
 2289                                     b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 2290                         }
 2291                         b = i + 1;
 2292                 }
 2293         }
 2294 
 2295         /*
 2296          * setvalid is TRUE when we can safely set the zero'd areas
 2297          * as being valid.  We can do this if there are no cache consistency
 2298          * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
 2299          */
 2300         if (setvalid)
 2301                 m->valid = VM_PAGE_BITS_ALL;
 2302 }
 2303 
 2304 /*
 2305  *      vm_page_is_valid:
 2306  *
 2307  *      Is (partial) page valid?  Note that the case where size == 0
 2308  *      will return FALSE in the degenerate case where the page is
 2309  *      entirely invalid, and TRUE otherwise.
 2310  *
 2311  *      May not block.
 2312  */
 2313 int
 2314 vm_page_is_valid(vm_page_t m, int base, int size)
 2315 {
 2316         int bits = vm_page_bits(base, size);
 2317 
 2318         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 2319         if (m->valid && ((m->valid & bits) == bits))
 2320                 return (1);
 2321         else
 2322                 return (0);
 2323 }
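
/*
 * Editorial note: with size == 0, vm_page_bits() returns 0 and the test
 * above reduces to "m->valid != 0" -- FALSE only for an entirely invalid
 * page, which matches the degenerate case described in the header
 * comment.
 */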
 2324 
 2325 /*
 2326  * update dirty bits from pmap/mmu.  May not block.
 2327  */
 2328 void
 2329 vm_page_test_dirty(vm_page_t m)
 2330 {
 2331         if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
 2332                 vm_page_dirty(m);
 2333         }
 2334 }
 2335 
 2336 int so_zerocp_fullpage = 0;
 2337 
 2338 /*
 2339  *      Replace the given page with a copy.  The copied page assumes
 2340  *      the portion of the given page's "wire_count" that is not the
 2341  *      responsibility of this copy-on-write mechanism.
 2342  *
 2343  *      The object containing the given page must have a non-zero
 2344  *      paging-in-progress count and be locked.
 2345  */
 2346 void
 2347 vm_page_cowfault(vm_page_t m)
 2348 {
 2349         vm_page_t mnew;
 2350         vm_object_t object;
 2351         vm_pindex_t pindex;
 2352 
 2353         object = m->object;
 2354         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 2355         KASSERT(object->paging_in_progress != 0,
 2356             ("vm_page_cowfault: object %p's paging-in-progress count is zero.",
 2357             object)); 
 2358         pindex = m->pindex;
 2359 
 2360  retry_alloc:
 2361         pmap_remove_all(m);
 2362         vm_page_remove(m);
 2363         mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 2364         if (mnew == NULL) {
 2365                 vm_page_insert(m, object, pindex);
 2366                 vm_page_unlock_queues();
 2367                 VM_OBJECT_UNLOCK(object);
 2368                 VM_WAIT;
 2369                 VM_OBJECT_LOCK(object);
 2370                 if (m == vm_page_lookup(object, pindex)) {
 2371                         vm_page_lock_queues();
 2372                         goto retry_alloc;
 2373                 } else {
 2374                         /*
 2375                          * Page disappeared during the wait.
 2376                          */
 2377                         vm_page_lock_queues();
 2378                         return;
 2379                 }
 2380         }
 2381 
 2382         if (m->cow == 0) {
 2383                 /* 
 2384                  * check to see if we raced with an xmit complete when 
 2385                  * waiting to allocate a page.  If so, put things back 
 2386                  * the way they were 
 2387                  */
 2388                 vm_page_free(mnew);
 2389                 vm_page_insert(m, object, pindex);
 2390         } else { /* clear COW & copy page */
 2391                 if (!so_zerocp_fullpage)
 2392                         pmap_copy_page(m, mnew);
 2393                 mnew->valid = VM_PAGE_BITS_ALL;
 2394                 vm_page_dirty(mnew);
 2395                 mnew->wire_count = m->wire_count - m->cow;
 2396                 m->wire_count = m->cow;
 2397         }
 2398 }
 2399 
 2400 void 
 2401 vm_page_cowclear(vm_page_t m)
 2402 {
 2403 
 2404         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2405         if (m->cow) {
 2406                 m->cow--;
 2407                 /* 
 2408                  * let vm_fault add back write permission  lazily
 2409                  */
 2410         } 
 2411         /*
 2412          *  sf_buf_free() will free the page, so we needn't do it here
 2413          */ 
 2414 }
 2415 
 2416 int
 2417 vm_page_cowsetup(vm_page_t m)
 2418 {
 2419 
 2420         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 2421         if (m->cow == USHRT_MAX - 1)
 2422                 return (EBUSY);
 2423         m->cow++;
 2424         pmap_remove_write(m);
 2425         return (0);
 2426 }
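
/*
 * Illustrative sketch (editorial, not part of vm_page.c): the
 * copy-on-write pairing used around zero-copy transmits.  Setup
 * write-protects the page and bumps its COW count; clear drops the
 * count once the data has been sent.
 */
#if 0
        vm_page_lock_queues();
        if (vm_page_cowsetup(m) != 0) {
                /* COW count would overflow (EBUSY); copy the data instead. */
        }
        vm_page_unlock_queues();

        /* ... hand the now read-only page to the consumer ... */

        vm_page_lock_queues();
        vm_page_cowclear(m);            /* transmit complete */
        vm_page_unlock_queues();
#endif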
 2427 
 2428 #include "opt_ddb.h"
 2429 #ifdef DDB
 2430 #include <sys/kernel.h>
 2431 
 2432 #include <ddb/ddb.h>
 2433 
 2434 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 2435 {
 2436         db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
 2437         db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
 2438         db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
 2439         db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
 2440         db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
 2441         db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
 2442         db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
 2443         db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
 2444         db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
 2445         db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
 2446 }
 2447 
 2448 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 2449 {
 2450                 
 2451         db_printf("PQ_FREE:");
 2452         db_printf(" %d", cnt.v_free_count);
 2453         db_printf("\n");
 2454                 
 2455         db_printf("PQ_CACHE:");
 2456         db_printf(" %d", cnt.v_cache_count);
 2457         db_printf("\n");
 2458 
 2459         db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
 2460                 *vm_page_queues[PQ_ACTIVE].cnt,
 2461                 *vm_page_queues[PQ_INACTIVE].cnt);
 2462 }
 2463 #endif /* DDB */
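
/*
 * Editorial note: with DDB compiled in, the commands above are invoked
 * from the debugger prompt as "show page" and "show pageq"; they print
 * the global page counters and the per-queue counts, respectively.
 */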
