The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_glue.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * (MPSAFE)
    3  *
    4  * Copyright (c) 1991, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  *
    7  * This code is derived from software contributed to Berkeley by
    8  * The Mach Operating System project at Carnegie-Mellon University.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      from: @(#)vm_glue.c     8.6 (Berkeley) 1/5/94
   35  *
   36  *
   37  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   38  * All rights reserved.
   39  *
   40  * Permission to use, copy, modify and distribute this software and
   41  * its documentation is hereby granted, provided that both the copyright
   42  * notice and this permission notice appear in all copies of the
   43  * software, derivative works or modified versions, and any portions
   44  * thereof, and that both notices appear in supporting documentation.
   45  *
   46  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   47  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   48  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   49  *
   50  * Carnegie Mellon requests users of this software to return to
   51  *
   52  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   53  *  School of Computer Science
   54  *  Carnegie Mellon University
   55  *  Pittsburgh PA 15213-3890
   56  *
   57  * any improvements or extensions that they make and grant Carnegie the
   58  * rights to redistribute these changes.
   59  *
   60  * $FreeBSD: src/sys/vm/vm_glue.c,v 1.94.2.4 2003/01/13 22:51:17 dillon Exp $
   61  */
   62 
   63 #include "opt_vm.h"
   64 
   65 #include <sys/param.h>
   66 #include <sys/systm.h>
   67 #include <sys/proc.h>
   68 #include <sys/resourcevar.h>
   69 #include <sys/buf.h>
   70 #include <sys/shm.h>
   71 #include <sys/vmmeter.h>
   72 #include <sys/sysctl.h>
   73 
   74 #include <sys/kernel.h>
   75 #include <sys/unistd.h>
   76 
   77 #include <machine/limits.h>
   78 #include <machine/vmm.h>
   79 
   80 #include <vm/vm.h>
   81 #include <vm/vm_param.h>
   82 #include <sys/lock.h>
   83 #include <vm/pmap.h>
   84 #include <vm/vm_map.h>
   85 #include <vm/vm_page.h>
   86 #include <vm/vm_pageout.h>
   87 #include <vm/vm_kern.h>
   88 #include <vm/vm_extern.h>
   89 
   90 #include <sys/user.h>
   91 #include <vm/vm_page2.h>
   92 #include <sys/thread2.h>
   93 #include <sys/sysref2.h>
   94 
   95 /*
   96  * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
   97  *
   98  * Note: run scheduling should be divorced from the vm system.
   99  */
  100 static void scheduler (void *);
  101 SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL)
  102 
  103 #ifdef INVARIANTS
  104 
  105 static int swap_debug = 0;
  106 SYSCTL_INT(_vm, OID_AUTO, swap_debug,
  107         CTLFLAG_RW, &swap_debug, 0, "");
  108 
  109 #endif
  110 
  111 static int scheduler_notify;
  112 
  113 static void swapout (struct proc *);
  114 
  115 /*
  116  * No requirements.
  117  */
  118 int
  119 kernacc(c_caddr_t addr, int len, int rw)
  120 {
  121         boolean_t rv;
  122         vm_offset_t saddr, eaddr;
  123         vm_prot_t prot;
  124 
  125         KASSERT((rw & (~VM_PROT_ALL)) == 0,
  126             ("illegal ``rw'' argument to kernacc (%x)", rw));
  127 
  128         /*
  129          * The globaldata space is not part of the kernel_map proper,
  130          * check access separately.
  131          */
  132         if (is_globaldata_space((vm_offset_t)addr, (vm_offset_t)(addr + len)))
  133                 return (TRUE);
  134 
  135         /*
  136          * Nominal kernel memory access - check access via kernel_map.
  137          */
  138         if ((vm_offset_t)addr + len > kernel_map.max_offset ||
  139             (vm_offset_t)addr + len < (vm_offset_t)addr) {
  140                 return (FALSE);
  141         }
  142         prot = rw;
  143         saddr = trunc_page((vm_offset_t)addr);
  144         eaddr = round_page((vm_offset_t)addr + len);
  145         rv = vm_map_check_protection(&kernel_map, saddr, eaddr, prot, FALSE);
  146 
  147         return (rv == TRUE);
  148 }
  149 
  150 /*
  151  * No requirements.
  152  */
  153 int
  154 useracc(c_caddr_t addr, int len, int rw)
  155 {
  156         boolean_t rv;
  157         vm_prot_t prot;
  158         vm_map_t map;
  159         vm_map_entry_t save_hint;
  160         vm_offset_t wrap;
  161         vm_offset_t gpa;
  162 
  163         KASSERT((rw & (~VM_PROT_ALL)) == 0,
  164             ("illegal ``rw'' argument to useracc (%x)", rw));
  165         prot = rw;
  166 
  167         if (curthread->td_vmm) {
  168                 if (vmm_vm_get_gpa(curproc, (register_t *)&gpa, (register_t) addr))
  169                         panic("%s: could not get GPA\n", __func__);
  170                 addr = (c_caddr_t) gpa;
  171         }
  172 
  173         /*
  174          * XXX - check separately to disallow access to user area and user
  175          * page tables - they are in the map.
  176          */
  177         wrap = (vm_offset_t)addr + len;
  178         if (wrap > VM_MAX_USER_ADDRESS || wrap < (vm_offset_t)addr) {
  179                 return (FALSE);
  180         }
  181         map = &curproc->p_vmspace->vm_map;
  182         vm_map_lock_read(map);
  183         /*
  184          * We save the map hint, and restore it.  Useracc appears to distort
  185          * the map hint unnecessarily.
  186          */
  187         save_hint = map->hint;
  188         rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
  189                                      round_page(wrap), prot, TRUE);
  190         map->hint = save_hint;
  191         vm_map_unlock_read(map);
  192         
  193         return (rv == TRUE);
  194 }
  195 
  196 /*
  197  * No requirements.
  198  */
  199 void
  200 vslock(caddr_t addr, u_int len)
  201 {
  202         if (len) {
  203                 vm_map_wire(&curproc->p_vmspace->vm_map,
  204                             trunc_page((vm_offset_t)addr),
  205                             round_page((vm_offset_t)addr + len), 0);
  206         }
  207 }
  208 
  209 /*
  210  * No requirements.
  211  */
  212 void
  213 vsunlock(caddr_t addr, u_int len)
  214 {
  215         if (len) {
  216                 vm_map_wire(&curproc->p_vmspace->vm_map,
  217                             trunc_page((vm_offset_t)addr),
  218                             round_page((vm_offset_t)addr + len),
  219                             KM_PAGEABLE);
  220         }
  221 }
  222 
  223 /*
  224  * Implement fork's actions on an address space.
  225  * Here we arrange for the address space to be copied or referenced,
  226  * allocate a user struct (pcb and kernel stack), then call the
  227  * machine-dependent layer to fill those in and make the new process
  228  * ready to run.  The new process is set up so that it returns directly
  229  * to user mode to avoid stack copying and relocation problems.
  230  *
  231  * No requirements.
  232  */
  233 void
  234 vm_fork(struct proc *p1, struct proc *p2, int flags)
  235 {
  236         if ((flags & RFPROC) == 0) {
  237                 /*
  238                  * Divorce the memory, if it is shared, essentially
  239                  * this changes shared memory amongst threads, into
  240                  * COW locally.
  241                  */
  242                 if ((flags & RFMEM) == 0) {
  243                         if (p1->p_vmspace->vm_sysref.refcnt > 1) {
  244                                 vmspace_unshare(p1);
  245                         }
  246                 }
  247                 cpu_fork(ONLY_LWP_IN_PROC(p1), NULL, flags);
  248                 return;
  249         }
  250 
  251         if (flags & RFMEM) {
  252                 vmspace_ref(p1->p_vmspace);
  253                 p2->p_vmspace = p1->p_vmspace;
  254         }
  255 
  256         while (vm_page_count_severe()) {
  257                 vm_wait(0);
  258         }
  259 
  260         if ((flags & RFMEM) == 0) {
  261                 p2->p_vmspace = vmspace_fork(p1->p_vmspace);
  262 
  263                 pmap_pinit2(vmspace_pmap(p2->p_vmspace));
  264 
  265                 if (p1->p_vmspace->vm_shm)
  266                         shmfork(p1, p2);
  267         }
  268 
  269         pmap_init_proc(p2);
  270 }
  271 
  272 /*
  273  * Set default limits for VM system.  Call during proc0's initialization.
  274  *
  275  * Called from the low level boot code only.
  276  */
  277 void
  278 vm_init_limits(struct proc *p)
  279 {
  280         int rss_limit;
  281 
  282         /*
  283          * Set up the initial limits on process VM. Set the maximum resident
  284          * set size to be half of (reasonably) available memory.  Since this
  285          * is a soft limit, it comes into effect only when the system is out
  286          * of memory - half of main memory helps to favor smaller processes,
  287          * and reduces thrashing of the object cache.
  288          */
  289         p->p_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
  290         p->p_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
  291         p->p_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
  292         p->p_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
  293         /* limit the limit to no less than 2MB */
  294         rss_limit = max(vmstats.v_free_count, 512);
  295         p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
  296         p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
  297 }
  298 
  299 /*
  300  * Faultin the specified process.  Note that the process can be in any
  301  * state.  Just clear P_SWAPPEDOUT and call wakeup in case the process is
  302  * sleeping.
  303  *
  304  * No requirements.
  305  */
  306 void
  307 faultin(struct proc *p)
  308 {
  309         if (p->p_flags & P_SWAPPEDOUT) {
  310                 /*
  311                  * The process is waiting in the kernel to return to user
  312                  * mode but cannot until P_SWAPPEDOUT gets cleared.
  313                  */
  314                 lwkt_gettoken(&p->p_token);
  315                 p->p_flags &= ~(P_SWAPPEDOUT | P_SWAPWAIT);
  316 #ifdef INVARIANTS
  317                 if (swap_debug)
  318                         kprintf("swapping in %d (%s)\n", p->p_pid, p->p_comm);
  319 #endif
  320                 wakeup(p);
  321                 lwkt_reltoken(&p->p_token);
  322         }
  323 }
  324 
  325 /*
  326  * Kernel initialization eventually falls through to this function,
  327  * which is process 0.
  328  *
  329  * This swapin algorithm attempts to swap-in processes only if there
  330  * is enough space for them.  Of course, if a process waits for a long
  331  * time, it will be swapped in anyway.
  332  */
  333 struct scheduler_info {
  334         struct proc *pp;
  335         int ppri;
  336 };
  337 
  338 static int scheduler_callback(struct proc *p, void *data);
  339 
  340 static void
  341 scheduler(void *dummy)
  342 {
  343         struct scheduler_info info;
  344         struct proc *p;
  345 
  346         KKASSERT(!IN_CRITICAL_SECT(curthread));
  347 loop:
  348         scheduler_notify = 0;
  349         /*
  350          * Don't try to swap anything in if we are low on memory.
  351          */
  352         if (vm_page_count_severe()) {
  353                 vm_wait(0);
  354                 goto loop;
  355         }
  356 
  357         /*
  358          * Look for a good candidate to wake up
  359          */
  360         info.pp = NULL;
  361         info.ppri = INT_MIN;
  362         allproc_scan(scheduler_callback, &info);
  363 
  364         /*
  365          * Nothing to do, back to sleep for at least 1/10 of a second.  If
  366          * we are woken up, immediately process the next request.  If
  367          * multiple requests have built up the first is processed 
  368          * immediately and the rest are staggered.
  369          */
  370         if ((p = info.pp) == NULL) {
  371                 tsleep(&proc0, 0, "nowork", hz / 10);
  372                 if (scheduler_notify == 0)
  373                         tsleep(&scheduler_notify, 0, "nowork", 0);
  374                 goto loop;
  375         }
  376 
  377         /*
  378          * Fault the selected process in, then wait for a short period of
  379          * time and loop up.
  380          *
  381          * XXX we need a heuristic to get a measure of system stress and
  382          * then adjust our stagger wakeup delay accordingly.
  383          */
  384         lwkt_gettoken(&p->p_token);
  385         faultin(p);
  386         p->p_swtime = 0;
  387         lwkt_reltoken(&p->p_token);
  388         PRELE(p);
  389         tsleep(&proc0, 0, "swapin", hz / 10);
  390         goto loop;
  391 }
  392 
  393 static int
  394 scheduler_callback(struct proc *p, void *data)
  395 {
  396         struct scheduler_info *info = data;
  397         struct lwp *lp;
  398         segsz_t pgs;
  399         int pri;
  400 
  401         if (p->p_flags & P_SWAPWAIT) {
  402                 pri = 0;
  403                 FOREACH_LWP_IN_PROC(lp, p) {
  404                         /* XXX lwp might need a different metric */
  405                         pri += lp->lwp_slptime;
  406                 }
  407                 pri += p->p_swtime - p->p_nice * 8;
  408 
  409                 /*
  410                  * The more pages paged out while we were swapped,
  411                  * the more work we have to do to get up and running
  412                  * again and the lower our wakeup priority.
  413                  *
  414                  * Each second of sleep time is worth ~1MB
  415                  */
  416                 lwkt_gettoken(&p->p_vmspace->vm_map.token);
  417                 pgs = vmspace_resident_count(p->p_vmspace);
  418                 if (pgs < p->p_vmspace->vm_swrss) {
  419                         pri -= (p->p_vmspace->vm_swrss - pgs) /
  420                                 (1024 * 1024 / PAGE_SIZE);
  421                 }
  422                 lwkt_reltoken(&p->p_vmspace->vm_map.token);
  423 
  424                 /*
  425                  * If this process is higher priority and there is
  426                  * enough space, then select this process instead of
  427                  * the previous selection.
  428                  */
  429                 if (pri > info->ppri) {
  430                         if (info->pp)
  431                                 PRELE(info->pp);
  432                         PHOLD(p);
  433                         info->pp = p;
  434                         info->ppri = pri;
  435                 }
  436         }
  437         return(0);
  438 }
  439 
  440 /*
  441  * SMP races ok.
  442  * No requirements.
  443  */
  444 void
  445 swapin_request(void)
  446 {
  447         if (scheduler_notify == 0) {
  448                 scheduler_notify = 1;
  449                 wakeup(&scheduler_notify);
  450         }
  451 }
  452 
  453 #ifndef NO_SWAPPING
  454 
  455 #define swappable(p) \
  456         (((p)->p_lock == 0) && \
  457         ((p)->p_flags & (P_TRACED|P_SYSTEM|P_SWAPPEDOUT|P_WEXIT)) == 0)
  458 
  459 
  460 /*
  461  * Swap_idle_threshold1 is the guaranteed swapped in time for a process
  462  */
  463 static int swap_idle_threshold1 = 15;
  464 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1,
  465         CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed process resident time (sec)");
  466 
  467 /*
  468  * Swap_idle_threshold2 is the time that a process can be idle before
  469  * it will be swapped out, if idle swapping is enabled.  Default is
  470  * one minute.
  471  */
  472 static int swap_idle_threshold2 = 60;
  473 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2,
  474         CTLFLAG_RW, &swap_idle_threshold2, 0, "Time (sec) a process can idle before being swapped");
  475 
  476 /*
  477  * Swapout is driven by the pageout daemon.  Very simple, we find eligible
  478  * procs and mark them as being swapped out.  This will cause the kernel
  479  * to prefer to pageout those proc's pages first and the procs in question 
  480  * will not return to user mode until the swapper tells them they can.
  481  *
  482  * If any procs have been sleeping/stopped for at least maxslp seconds,
  483  * they are swapped.  Else, we swap the longest-sleeping or stopped process,
  484  * if any, otherwise the longest-resident process.
  485  */
  486 
  487 static int swapout_procs_callback(struct proc *p, void *data);
  488 
  489 /*
  490  * No requirements.
  491  */
  492 void
  493 swapout_procs(int action)
  494 {
  495         allproc_scan(swapout_procs_callback, &action);
  496 }
  497 
  498 static int
  499 swapout_procs_callback(struct proc *p, void *data)
  500 {
  501         struct lwp *lp;
  502         int action = *(int *)data;
  503         int minslp = -1;
  504 
  505         if (!swappable(p))
  506                 return(0);
  507 
  508         lwkt_gettoken(&p->p_token);
  509 
  510         /*
  511          * We only consider active processes.
  512          */
  513         if (p->p_stat != SACTIVE && p->p_stat != SSTOP) {
  514                 lwkt_reltoken(&p->p_token);
  515                 return(0);
  516         }
  517 
  518         FOREACH_LWP_IN_PROC(lp, p) {
  519                 /*
  520                  * do not swap out a realtime process
  521                  */
  522                 if (RTP_PRIO_IS_REALTIME(lp->lwp_rtprio.type)) {
  523                         lwkt_reltoken(&p->p_token);
  524                         return(0);
  525                 }
  526 
  527                 /*
  528                  * Guarentee swap_idle_threshold time in memory
  529                  */
  530                 if (lp->lwp_slptime < swap_idle_threshold1) {
  531                         lwkt_reltoken(&p->p_token);
  532                         return(0);
  533                 }
  534 
  535                 /*
  536                  * If the system is under memory stress, or if we
  537                  * are swapping idle processes >= swap_idle_threshold2,
  538                  * then swap the process out.
  539                  */
  540                 if (((action & VM_SWAP_NORMAL) == 0) &&
  541                     (((action & VM_SWAP_IDLE) == 0) ||
  542                      (lp->lwp_slptime < swap_idle_threshold2))) {
  543                         lwkt_reltoken(&p->p_token);
  544                         return(0);
  545                 }
  546 
  547                 if (minslp == -1 || lp->lwp_slptime < minslp)
  548                         minslp = lp->lwp_slptime;
  549         }
  550 
  551         /*
  552          * If the process has been asleep for awhile, swap
  553          * it out.
  554          */
  555         if ((action & VM_SWAP_NORMAL) ||
  556             ((action & VM_SWAP_IDLE) &&
  557              (minslp > swap_idle_threshold2))) {
  558                 swapout(p);
  559         }
  560 
  561         /*
  562          * cleanup our reference
  563          */
  564         lwkt_reltoken(&p->p_token);
  565 
  566         return(0);
  567 }
  568 
  569 /*
  570  * The caller must hold p->p_token
  571  */
  572 static void
  573 swapout(struct proc *p)
  574 {
  575 #ifdef INVARIANTS
  576         if (swap_debug)
  577                 kprintf("swapping out %d (%s)\n", p->p_pid, p->p_comm);
  578 #endif
  579         ++p->p_ru.ru_nswap;
  580 
  581         /*
  582          * remember the process resident count
  583          */
  584         p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
  585         p->p_flags |= P_SWAPPEDOUT;
  586         p->p_swtime = 0;
  587 }
  588 
  589 #endif /* !NO_SWAPPING */
  590 

Cache object: 75193970304a5c0b27f6274f4fd131dd


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.