The Design and Implementation of the FreeBSD Operating System, Second Edition
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/common/os/lgrp.c

Version: -  FREEBSD  -  FREEBSD10  -  FREEBSD9  -  FREEBSD92  -  FREEBSD91  -  FREEBSD90  -  FREEBSD8  -  FREEBSD82  -  FREEBSD81  -  FREEBSD80  -  FREEBSD7  -  FREEBSD74  -  FREEBSD73  -  FREEBSD72  -  FREEBSD71  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  cheribsd  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1  -  FREEBSD-LIBC  -  FREEBSD8-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
   23  * Use is subject to license terms.
   24  */
   25 
   26 /*
   27  * Basic NUMA support in terms of locality groups
   28  *
   29  * Solaris needs to know which CPUs, memory, etc. are near each other to
   30  * provide good performance on NUMA machines by optimizing for locality.
   31  * In order to do this, a new abstraction called a "locality group (lgroup)"
   32  * has been introduced to keep track of which CPU-like and memory-like hardware
   33  * resources are close to each other.  Currently, latency is the only measure
   34  * used to determine how to group hardware resources into lgroups, but this
   35  * does not limit the groupings to be based solely on latency.  Other factors
   36  * may be used to determine the groupings in the future.
   37  *
   38  * Lgroups are organized into a hierarchy or topology that represents the
   39  * latency topology of the machine.  There is always at least a root lgroup in
   40  * the system.  It represents all the hardware resources in the machine at a
   41  * latency big enough that any hardware resource can at least access any other
   42  * hardware resource within that latency.  A Uniform Memory Access (UMA)
   43  * machine is represented with one lgroup (the root).  In contrast, a NUMA
   44  * machine is represented at least by the root lgroup and some number of leaf
   45  * lgroups where the leaf lgroups contain the hardware resources within the
   46  * least latency of each other and the root lgroup still contains all the
   47  * resources in the machine.  Some number of intermediate lgroups may exist
   48  * which represent more levels of locality than just the local latency of the
   49  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
   50  * (e.g. root and intermediate lgroups) contain the next nearest resources to
   51  * their children lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
   52  * to the root lgroup shows the hardware resources from closest to farthest
   53  * from the leaf lgroup such that each successive ancestor lgroup contains
   54  * the next nearest resources at the next level of locality from the previous.
   55  *
   56  * The kernel uses the lgroup abstraction to know how to allocate resources
   57  * near a given process/thread.  At fork() and lwp/thread_create() time, a
   58  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
   59  * with the lowest load average.  Binding to a processor or processor set will
   60  * change the home lgroup for a thread.  The scheduler has been modified to try
   61  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
   62  * allocation is lgroup aware too, so memory will be allocated from the current
   63  * thread's home lgroup if possible.  If the desired resources are not
   64  * available, the kernel traverses the lgroup hierarchy going to the parent
   65  * lgroup to find resources at the next level of locality until it reaches the
   66  * root lgroup.
   67  */
   68 
   69 #include <sys/lgrp.h>
   70 #include <sys/lgrp_user.h>
   71 #include <sys/types.h>
   72 #include <sys/mman.h>
   73 #include <sys/param.h>
   74 #include <sys/var.h>
   75 #include <sys/thread.h>
   76 #include <sys/cpuvar.h>
   77 #include <sys/cpupart.h>
   78 #include <sys/kmem.h>
   79 #include <vm/seg.h>
   80 #include <vm/seg_kmem.h>
   81 #include <vm/seg_spt.h>
   82 #include <vm/seg_vn.h>
   83 #include <vm/as.h>
   84 #include <sys/atomic.h>
   85 #include <sys/systm.h>
   86 #include <sys/errno.h>
   87 #include <sys/cmn_err.h>
   88 #include <sys/kstat.h>
   89 #include <sys/sysmacros.h>
   90 #include <sys/pg.h>
   91 #include <sys/promif.h>
   92 #include <sys/sdt.h>
   93 
   94 lgrp_gen_t      lgrp_gen = 0;           /* generation of lgroup hierarchy */
   95 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
   96                                 /* indexed by lgrp_id */
   97 int     nlgrps;                 /* number of lgroups in machine */
   98 int     lgrp_alloc_hint = -1;   /* hint for where to try to allocate next */
   99 int     lgrp_alloc_max = 0;     /* max lgroup ID allocated so far */
  100 
  101 /*
  102  * Kstat data for lgroups.
  103  *
  104  * Actual kstat data is collected in the lgrp_stats array.
  105  * The lgrp_kstat_data array of named kstats is used to extract data from
  106  * lgrp_stats and present it to the kstat framework. It is protected from
  107  * parallel modifications by lgrp_kstat_mutex. This may cause some contention
  108  * when several kstat commands run in parallel but this is not the
  109  * performance-critical path.
  110  */
  111 extern struct lgrp_stats lgrp_stats[];  /* table of per-lgrp stats */
  112 
  113 /*
  114  * Declare kstat names statically for enums as defined in the header file.
  115  */
  116 LGRP_KSTAT_NAMES;
  117 
  118 static void     lgrp_kstat_init(void);
  119 static int      lgrp_kstat_extract(kstat_t *, int);
  120 static void     lgrp_kstat_reset(lgrp_id_t);
  121 
  122 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
  123 static kmutex_t lgrp_kstat_mutex;
  124 
  125 
  126 /*
  127  * Max number of lgroups supported by the platform (set by lgrp_init())
  128  */
  129 int     nlgrpsmax = 0;
  130 
  131 /*
  132  * The root lgroup. Represents the set of resources at the system wide
  133  * level of locality.
  134  */
  135 lgrp_t          *lgrp_root = NULL;
  136 
  137 /*
  138  * During system bootstrap cp_default does not contain the list of lgrp load
  139  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
  140  * on-line when cp_default is initialized by cpupart_initialize_default().
  141  * Configuring CPU0 may create a two-level topology with root and one leaf node
  142  * containing CPU0. This topology is initially constructed in a special
  143  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
  144  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
  145  * for all lpl operations until cp_default is fully constructed.
  146  *
  147  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
  148  * consumer who needs the default lpl should use lpl_bootstrap, which is a
  149  * pointer to the first element of lpl_bootstrap_list.
  150  *
  151  * CPUs that are added to the system, but have not yet been assigned to an
  152  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
  153  * on some architectures (x86) it's possible for the slave CPU startup thread
  154  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
  155  */
  156 #define LPL_BOOTSTRAP_SIZE 2    /* root lpl + one leaf lpl */
  157 static lpl_t    lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
  158 lpl_t           *lpl_bootstrap;
  159 static lpl_t    *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
  160 static int      lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
  161 
  162 /*
  163  * If cp still references the bootstrap lpl, it has not yet been added to
  164  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
  165  * a thread is trying to allocate memory close to a CPU that has no lgrp.
  166  */
  167 #define LGRP_CPU_HAS_NO_LGRP(cp)        ((cp)->cpu_lpl == lpl_bootstrap)
  168 
  169 static lgrp_t   lroot;          /* storage that lgrp_root points at */
  170 
  171 /*
  172  * Size, in bytes, beyond which random memory allocation policy is applied
  173  * to non-shared memory.  Default is the maximum size, so random memory
  174  * allocation won't be used for non-shared memory by default.
  175  */
  176 size_t  lgrp_privm_random_thresh = (size_t)(-1);
  177 
  178 /* the maximum effect that a single thread can have on its lgroup's load */
  179 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
  180         ((lgrp_loadavg_max_effect) / (ncpu))
  181 uint32_t        lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
  182 
  183 
  184 /*
  185  * Size, in bytes, beyond which random memory allocation policy is applied to
  186  * shared memory.  Default is 8MB (2 ISM pages).
  187  */
  188 size_t  lgrp_shm_random_thresh = 8*1024*1024;
  189 
  190 /*
  191  * Whether to do processor set aware memory allocation by default
  192  */
  193 int     lgrp_mem_pset_aware = 0;
  194 
  195 /*
  196  * Set the default memory allocation policy for root lgroup
  197  */
  198 lgrp_mem_policy_t       lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
  199 
  200 /*
  201  * Set the default memory allocation policy.  For most platforms,
  202  * next touch is sufficient, but some platforms may wish to override
  203  * this.  lgrp_main_init() resets an invalid setting back to next touch.
  204  */
  205 lgrp_mem_policy_t       lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
  206 
  207 
  208 /*
  209  * lgroup CPU event handlers
  210  */
  211 static void     lgrp_cpu_init(struct cpu *);
  212 static void     lgrp_cpu_fini(struct cpu *, lgrp_id_t);
  213 static lgrp_t   *lgrp_cpu_to_lgrp(struct cpu *);
  214 
  215 /*
  216  * lgroup memory event handlers
  217  */
  218 static void     lgrp_mem_init(int, lgrp_handle_t, boolean_t);
  219 static void     lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
  220 static void     lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
  221 
  222 /*
  223  * lgroup CPU partition event handlers
  224  */
  225 static void     lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
  226 static void     lgrp_part_del_cpu(struct cpu *);
  227 
  228 /*
  229  * lgroup framework initialization
  230  */
  231 static void     lgrp_main_init(void);
  232 static void     lgrp_main_mp_init(void);
  233 static void     lgrp_root_init(void);
  234 static void     lgrp_setup(void);
  235 
  236 /*
  237  * lpl topology
  238  */
  239 static void     lpl_init(lpl_t *, lpl_t *, lgrp_t *);
  240 static void     lpl_clear(lpl_t *);
  241 static void     lpl_leaf_insert(lpl_t *, struct cpupart *);
  242 static void     lpl_leaf_remove(lpl_t *, struct cpupart *);
  243 static void     lpl_rset_add(lpl_t *, lpl_t *);
  244 static void     lpl_rset_del(lpl_t *, lpl_t *);
  245 static int      lpl_rset_contains(lpl_t *, lpl_t *);
  246 static void     lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
  247 static void     lpl_child_update(lpl_t *, struct cpupart *);
  248 static int      lpl_pick(lpl_t *, lpl_t *);
  249 static void     lpl_verify_wrapper(struct cpupart *);
  250 
  251 /*
  252  * Return codes of the lpl topology verifier (LPL_TOPO_CORRECT means success)
  253  */
  254 
  255 #define LPL_TOPO_CORRECT                        0
  256 #define LPL_TOPO_PART_HAS_NO_LPL                -1
  257 #define LPL_TOPO_CPUS_NOT_EMPTY                 -2
  258 #define LPL_TOPO_LGRP_MISMATCH                  -3
  259 #define LPL_TOPO_MISSING_PARENT                 -4
  260 #define LPL_TOPO_PARENT_MISMATCH                -5
  261 #define LPL_TOPO_BAD_CPUCNT                     -6
  262 #define LPL_TOPO_RSET_MISMATCH                  -7
  263 #define LPL_TOPO_LPL_ORPHANED                   -8
  264 #define LPL_TOPO_LPL_BAD_NCPU                   -9
  265 #define LPL_TOPO_RSET_MSSNG_LF                  -10
  266 #define LPL_TOPO_CPU_HAS_BAD_LPL                -11
  267 #define LPL_TOPO_NONLEAF_HAS_CPUS               -12
  268 #define LPL_TOPO_LGRP_NOT_LEAF                  -13
  269 #define LPL_TOPO_BAD_RSETCNT                    -14
  270 
  271 /*
  272  * Report whether lgroup optimizations are worth enabling on this system:
  273  * returns 1 if they are and 0 otherwise.
  274  */
  275 int
  276 lgrp_optimizations(void)
  277 {
  278         /*
  279          * Only systems with more than 2 lgroups qualify.
  280          *
  281          * XXX The assumption is that a 2 lgroup system consists of an
  282          * empty root lgroup and a single child lgroup that holds every
  283          * resource.  Should a 2 lgroup machine ever have CPUs or memory
  284          * directly in its root lgroup, it might want lgroup optimizations
  285          * with its child lgroup, but no such machine exists for now....
  286          */
  287         int     enable = (nlgrps > 2);
  288 
  289         return (enable ? 1 : 0);
  290 }
  291 
  292 /*
  293  * Set up the root lgroup and the bootstrap lpls used by t0 and cp_default
  294  */
  295 static void
  296 lgrp_root_init(void)
  297 {
  298         lgrp_handle_t   hand;
  299         int             i;
  300         lgrp_id_t       id;
  301 
  302         /*
  303          * Create the "root" lgroup; no lgroup may exist yet, so it gets ID 0
  304          */
  305         ASSERT(nlgrps == 0);
  306         id = nlgrps++;
  307 
  308         lgrp_root = &lroot;
  309 
  310         lgrp_root->lgrp_cpu = NULL;
  311         lgrp_root->lgrp_mnodes = 0;
  312         lgrp_root->lgrp_nmnodes = 0;
  313         hand = lgrp_plat_root_hand();
  314         lgrp_root->lgrp_plathand = hand;
  315 
  316         lgrp_root->lgrp_id = id;
  317         lgrp_root->lgrp_cpucnt = 0;
  318         lgrp_root->lgrp_childcnt = 0;
  319         klgrpset_clear(lgrp_root->lgrp_children);
  320         klgrpset_clear(lgrp_root->lgrp_leaves);
  321         lgrp_root->lgrp_parent = NULL;
  322         lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
  323 
  324         for (i = 0; i < LGRP_RSRC_COUNT; i++)
  325                 klgrpset_clear(lgrp_root->lgrp_set[i]);
  326 
  327         lgrp_root->lgrp_kstat = NULL;
  328 
  329         lgrp_table[id] = lgrp_root;
  330 
  331         /*
  332          * Set up the initial lpl list for CPU0 and the initial t0 home.
  333          * The only lpl space we have so far is lpl_bootstrap. It is used for
  334          * all topology operations until cp_default is initialized, at which
  335          * point t0.t_lpl will be updated.
  336          */
  337         lpl_bootstrap = lpl_bootstrap_list;
  338         t0.t_lpl = lpl_bootstrap;
  339         cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
  340         lpl_bootstrap_list[1].lpl_lgrpid = 1;
  341 
  342         /*
  343          * Set up the bootstrap rset
  344          * Since the bootstrap topology has just the root, and a leaf,
  345          * the rset contains just the leaf, and both lpls can use the same rset
  346          */
  347         lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
  348         lpl_bootstrap_list[0].lpl_rset_sz = 1;
  349         lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
  350         lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
  351 
  352         lpl_bootstrap_list[1].lpl_rset_sz = 1;
  353         lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
  354         lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
  355 
  356         cp_default.cp_lgrploads = lpl_bootstrap;
  357 }
  358 
  359 /*
  360  * Drive initialization of the lgroup framework, giving the platform a
  361  * chance to initialize itself at every stage.
  362  *
  363  * Boot funnels each stage through this one routine; the definition of
  364  * lgrp_init_stages_t describes what each stage does and when it runs.
  365  */
  366 void
  367 lgrp_init(lgrp_init_stages_t stage)
  368 {
  369         /* Platform initialization always runs first */
  370         lgrp_plat_init(stage);
  371 
  372         if (stage == LGRP_INIT_STAGE1) {
  373                 /*
  374                  * Record how many lgroups this platform supports.  The
  375                  * common framework sizes lgrp_table[] (etc.) by NLGRPS_MAX,
  376                  * so the platform's limit must not exceed it.
  377                  */
  378                 nlgrpsmax = lgrp_plat_max_lgrps();
  379                 ASSERT(nlgrpsmax <= NLGRPS_MAX);
  380         } else if (stage == LGRP_INIT_STAGE2) {
  381                 /*
  382                  * Build the root lgroup and put cpu0 into an lgroup
  383                  */
  384                 lgrp_setup();
  385         } else if (stage == LGRP_INIT_STAGE4) {
  386                 /*
  387                  * Settle cpu0's lgroup and bring up the lgroup kstats
  388                  */
  389                 lgrp_main_init();
  390         } else if (stage == LGRP_INIT_STAGE5) {
  391                 /*
  392                  * Update the lgroup topology and mark it initialized
  393                  */
  394                 lgrp_main_mp_init();
  395         } else {
  396                 /*
  397                  * For any other stage only lgrp_plat_init() above has
  398                  * work to do; the common framework does nothing.
  399                  */
  400         }
  401 }
  402 
  403 /*
  404  * Create the root and cpu0's lgroup, and set t0's home (LGRP_INIT_STAGE2).
  405  */
  406 static void
  407 lgrp_setup(void)
  408 {
  409         /*
  410          * Set up the root lgroup (this also points t0 at the bootstrap lpl)
  411          */
  412         lgrp_root_init();
  413 
  414         /*
  415          * Add cpu0 to an lgroup: ADD seeds its lpl links, ONLINE places it
  416          */
  417         lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
  418         lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
  419 }
  420 
  421 /*
  422  * True once lgrp initialization has completed (set by lgrp_main_init()).
  423  */
  424 int     lgrp_initialized = 0;
  425 
  426 /*
  427  * True once lgrp topology is constructed (set by lgrp_main_mp_init()).
  428  */
  429 int     lgrp_topo_initialized = 0;
  430 
  431 /*
  432  * Init routine called after startup(), /etc/system has been processed,
  433  * and cpu0 has been added to an lgroup.  Sets lgrp_initialized when done.
  434  */
  435 static void
  436 lgrp_main_init(void)
  437 {
  438         cpu_t           *cp = CPU;
  439         lgrp_id_t       lgrpid;
  440         int             i;
  441         extern void     pg_cpu0_reinit();
  442 
  443         /*
  444          * Enforce a valid lgrp_mem_default_policy (fall back to next touch)
  445          */
  446         if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
  447             (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
  448             (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
  449                 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
  450 
  451         /*
  452          * See if mpo should be disabled.
  453          * This may happen in the case of null proc LPA on Starcat.
  454          * The platform won't be able to detect null proc LPA until after
  455          * cpu0 and memory have already been added to lgroups.
  456          * When and if it is detected, the Starcat platform will return
  457          * a different platform handle for cpu0 which is what we check for
  458          * here. If mpo should be disabled move cpu0 to its rightful place
  459          * (the root), and destroy the remaining lgroups. This effectively
  460          * provides an UMA lgroup topology.
  461          */
  462         lgrpid = cp->cpu_lpl->lpl_lgrpid;
  463         if (lgrp_table[lgrpid]->lgrp_plathand !=
  464             lgrp_plat_cpu_to_hand(cp->cpu_id)) {
  465                 lgrp_part_del_cpu(cp);
  466                 lgrp_cpu_fini(cp, lgrpid);
  467 
  468                 lgrp_cpu_init(cp);
  469                 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
  470 
  471                 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
  472 
  473                 /*
  474                  * Notify the PG subsystem that the CPU's lgrp
  475                  * association has changed
  476                  */
  477                 pg_cpu0_reinit();
  478 
  479                 /*
  480                  * Destroy all lgroups except for root
  481                  */
  482                 for (i = 0; i <= lgrp_alloc_max; i++) {
  483                         if (LGRP_EXISTS(lgrp_table[i]) &&
  484                             lgrp_table[i] != lgrp_root)
  485                                 lgrp_destroy(lgrp_table[i]);
  486                 }
  487 
  488                 /*
  489                  * Fix up root to point at itself for leaves and resources
  490                  * and not have any children
  491                  */
  492                 lgrp_root->lgrp_childcnt = 0;
  493                 klgrpset_clear(lgrp_root->lgrp_children);
  494                 klgrpset_clear(lgrp_root->lgrp_leaves);
  495                 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
  496                 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
  497                 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
  498         }
  499 
  500         /*
  501          * Initialize kstats framework.
  502          */
  503         lgrp_kstat_init();
  504         /*
  505          * cpu0 is finally where it should be, so create its lgroup's kstats
  506          */
  507         mutex_enter(&cpu_lock);
  508         lgrp_kstat_create(cp);
  509         mutex_exit(&cpu_lock);
  510 
  511         lgrp_initialized = 1;
  512 }
  513 
  514 /*
  515  * Final lgrp initialization step, run once all CPUs have been brought
  516  * on-line (i.e. after start_other_cpus()).
  517  */
  518 static void
  519 lgrp_main_mp_init(void)
  520 {
  521         klgrpset_t      touched;
  522         int             nslots = lgrp_alloc_max + 1;
  523 
  524         /* Bring the lgroup topology up to date (if necessary) */
  525         klgrpset_clear(touched);
  526         (void) lgrp_topo_update(lgrp_table, nslots, &touched);
  527 
  528         lgrp_topo_initialized = 1;
  529 }
  530 
  531 /*
  532  * Set a new latency on lgroups: given a platform handle, the lgroup with that
  533  * handle is changed; given LGRP_NULL_HANDLE, every lgroup at oldtime is.
  534  */
  535 void
  536 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
  537     u_longlong_t newtime)
  538 {
  539         int     id;
  540 
  541         for (id = 0; id <= lgrp_alloc_max; id++) {
  542                 lgrp_t  *lg = lgrp_table[id];
  543                 int     match;
  544 
  545                 if (!LGRP_EXISTS(lg))
  546                         continue;
  547 
  548                 match = (hand == LGRP_NULL_HANDLE) ?
  549                     (lg->lgrp_latency == oldtime) : (lg->lgrp_plathand == hand);
  550                 if (match)
  551                         lg->lgrp_latency = (int)newtime;
  552         }
  553 }
  554 
  555 /*
  556  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
  557  *
  558  * How "resource" and "where" are interpreted depends on the event; each
  559  * case below casts them to what it expects.
  560  */
  561 void
  562 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
  563 {
  564         klgrpset_t      changed;
  565         cpu_t           *cp;
  566         lgrp_id_t       id;
  567         int             rc;
  568 
  569         switch (event) {
  570         /*
  571          * These events originate in common code, so lgrp_plat_config() is
  572          * called here to tell the platform about each of them.
  573          */
  574         case LGRP_CONFIG_CPU_ADD:
  575                 cp = (cpu_t *)resource;
  576 
  577                 /*
  578                  * Initialize the new CPU's lgrp related next/prev
  579                  * links, and give it a bootstrap lpl so that it can
  580                  * survive should it need to enter the dispatcher.
  581                  */
  582                 cp->cpu_next_lpl = cp;
  583                 cp->cpu_prev_lpl = cp;
  584                 cp->cpu_next_lgrp = cp;
  585                 cp->cpu_prev_lgrp = cp;
  586                 cp->cpu_lpl = lpl_bootstrap;
  587 
  588                 lgrp_plat_config(event, resource);
  589                 atomic_add_32(&lgrp_gen, 1);
  590 
  591                 break;
  592         case LGRP_CONFIG_CPU_DEL:
  593                 lgrp_plat_config(event, resource);
  594                 atomic_add_32(&lgrp_gen, 1);
  595 
  596                 break;
  597         case LGRP_CONFIG_CPU_ONLINE:
  598                 cp = (cpu_t *)resource;
  599                 lgrp_cpu_init(cp);
  600                 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
  601                 rc = lpl_topo_verify(cp->cpu_part);
  602                 if (rc != LPL_TOPO_CORRECT) {
  603                         panic("lpl_topo_verify failed: %d", rc);
  604                 }
  605                 lgrp_plat_config(event, resource);
  606                 atomic_add_32(&lgrp_gen, 1);
  607 
  608                 break;
  609         case LGRP_CONFIG_CPU_OFFLINE:
  610                 cp = (cpu_t *)resource;
  611                 id = cp->cpu_lpl->lpl_lgrpid;
  612                 lgrp_part_del_cpu(cp);
  613                 lgrp_cpu_fini(cp, id);
  614                 rc = lpl_topo_verify(cp->cpu_part);
  615                 if (rc != LPL_TOPO_CORRECT) {
  616                         panic("lpl_topo_verify failed: %d", rc);
  617                 }
  618                 lgrp_plat_config(event, resource);
  619                 atomic_add_32(&lgrp_gen, 1);
  620 
  621                 break;
  622         case LGRP_CONFIG_CPUPART_ADD:
  623                 cp = (cpu_t *)resource;
  624                 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
  625                 rc = lpl_topo_verify(cp->cpu_part);
  626                 if (rc != LPL_TOPO_CORRECT) {
  627                         panic("lpl_topo_verify failed: %d", rc);
  628                 }
  629                 lgrp_plat_config(event, resource);
  630 
  631                 break;
  632         case LGRP_CONFIG_CPUPART_DEL:
  633                 cp = (cpu_t *)resource;
  634                 lgrp_part_del_cpu((cpu_t *)resource);
  635                 rc = lpl_topo_verify(cp->cpu_part);
  636                 if (rc != LPL_TOPO_CORRECT) {
  637                         panic("lpl_topo_verify failed: %d", rc);
  638                 }
  639                 lgrp_plat_config(event, resource);
  640 
  641                 break;
  642         /*
  643          * The following events are initiated by the memnode subsystem.
  644          */
  645         case LGRP_CONFIG_MEM_ADD:
  646                 lgrp_mem_init((int)resource, where, B_FALSE);
  647                 atomic_add_32(&lgrp_gen, 1);
  648 
  649                 break;
  650         case LGRP_CONFIG_MEM_DEL:
  651                 lgrp_mem_fini((int)resource, where, B_FALSE);
  652                 atomic_add_32(&lgrp_gen, 1);
  653 
  654                 break;
  655         case LGRP_CONFIG_MEM_RENAME: {
  656                 lgrp_config_mem_rename_t *ren_arg =
  657                     (lgrp_config_mem_rename_t *)where;
  658 
  659                 lgrp_mem_rename((int)resource,
  660                     ren_arg->lmem_rename_from,
  661                     ren_arg->lmem_rename_to);
  662                 atomic_add_32(&lgrp_gen, 1);
  663 
  664                 break;
  665         }
  666         case LGRP_CONFIG_GEN_UPDATE:
  667                 atomic_add_32(&lgrp_gen, 1);
  668 
  669                 break;
  670         case LGRP_CONFIG_FLATTEN:
  671                 if (where == 0) {
  672                         lgrp_topo_levels = (int)resource;
  673                 } else {
  674                         /* never pass an uninitialized set to the callee */
  675                         klgrpset_clear(changed);
  676                         (void) lgrp_topo_flatten(resource,
  677                             lgrp_table, lgrp_alloc_max, &changed);
  678                 }
  679 
  680                 break;
  681         /*
  682          * Update any lgroups with old latency to new latency
  683          */
  684         case LGRP_CONFIG_LAT_CHANGE_ALL:
  685                 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
  686                     (u_longlong_t)where);
  687 
  688                 break;
  689         /*
  690          * Update the lgroup with the given platform handle to the new latency
  691          */
  692         case LGRP_CONFIG_LAT_CHANGE:
  693                 lgrp_latency_change((lgrp_handle_t)resource, 0,
  694                     (u_longlong_t)where);
  695 
  696                 break;
  697         case LGRP_CONFIG_NOP:
  698         default:
  699                 break;
  700         }
  701 }
  702 
  703 /*
  704  * Called to add lgrp info into cpu structure from cpu_add_unit;
  705  * do not assume cpu is in cpu[] yet!
  706  *
  707  * CPUs are brought online with all other CPUs paused so we can't
  708  * allocate memory or we could deadlock the system, so we rely on
  709  * the platform to statically allocate as much space as we need
  710  * for the lgrp structs and stats.
  711  */
  712 static void
  713 lgrp_cpu_init(struct cpu *cp)
  714 {
  715         klgrpset_t      changed;
  716         int             count;
  717         lgrp_handle_t   hand;
  718         int             first_cpu;
  719         lgrp_t          *my_lgrp;
  720         lgrp_id_t       lgrpid;
  721         struct cpu      *cptr;
  722 
  723         /*
  724          * This is the first time through if the resource set
  725          * for the root lgroup is empty. After cpu0 has been
  726          * initially added to an lgroup, the root's CPU resource
  727          * set can never be empty, since the system's last CPU
  728          * cannot be offlined.
  729          */
  730         if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
  731                 /*
  732                  * First time through.
  733                  */
  734                 first_cpu = 1;
  735         } else {
  736                 /*
  737                  * If cpu0 needs to move lgroups, we may come
  738                  * through here again, at which time cpu_lock won't
  739                  * be held, and lgrp_initialized will be false.
  740                  */
  741                 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
  742                 ASSERT(cp->cpu_part != NULL);
  743                 first_cpu = 0;
  744         }
  745 
  746         hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
  747         my_lgrp = lgrp_hand_to_lgrp(hand);
  748 
  749         if (my_lgrp == NULL) {
  750                 /*
  751                  * Create new lgrp and add it to lgroup topology
  752                  */
  753                 my_lgrp = lgrp_create();
  754                 my_lgrp->lgrp_plathand = hand;
  755                 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
  756                 lgrpid = my_lgrp->lgrp_id;
  757                 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
  758                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
  759 
  760                 count = 0;
  761                 klgrpset_clear(changed);
  762                 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
  763                     &changed);
  764                 /*
  765                  * May have added new intermediate lgroups, so need to add
  766                  * resources other than CPUs which are added below
  767                  */
  768                 (void) lgrp_mnode_update(changed, NULL);
  769         } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
  770             > 0) {
  771                 /*
  772                  * Leaf lgroup was created, but latency wasn't available
  773                  * then.  So, set latency for it and fill in rest of lgroup
  774                  * topology  now that we know how far it is from other leaf
  775                  * lgroups.
  776                  */
  777                 lgrpid = my_lgrp->lgrp_id;
  778                 klgrpset_clear(changed);
  779                 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
  780                     lgrpid))
  781                         klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
  782                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
  783                     &changed);
  784 
  785                 /*
  786                  * May have added new intermediate lgroups, so need to add
  787                  * resources other than CPUs which are added below
  788                  */
  789                 (void) lgrp_mnode_update(changed, NULL);
  790         } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
  791             my_lgrp->lgrp_id)) {
  792                 int     i;
  793 
  794                 /*
  795                  * Update existing lgroup and lgroups containing it with CPU
  796                  * resource
  797                  */
  798                 lgrpid = my_lgrp->lgrp_id;
  799                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
  800                 for (i = 0; i <= lgrp_alloc_max; i++) {
  801                         lgrp_t          *lgrp;
  802 
  803                         lgrp = lgrp_table[i];
  804                         if (!LGRP_EXISTS(lgrp) ||
  805                             !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
  806                                 continue;
  807 
  808                         klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
  809                 }
  810         }
  811 
  812         lgrpid = my_lgrp->lgrp_id;
  813         cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
  814 
  815         /*
  816          * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
  817          * end up in lpl for lgroup 0 whether it is supposed to be in there or
  818          * not since none of lgroup IDs in the lpl's have been set yet.
  819          */
  820         if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
  821                 cp->cpu_lpl->lpl_lgrpid = lgrpid;
  822 
  823         /*
  824          * link the CPU into the lgrp's CPU list
  825          */
  826         if (my_lgrp->lgrp_cpucnt == 0) {
  827                 my_lgrp->lgrp_cpu = cp;
  828                 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
  829         } else {
  830                 cptr = my_lgrp->lgrp_cpu;
  831                 cp->cpu_next_lgrp = cptr;
  832                 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
  833                 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
  834                 cptr->cpu_prev_lgrp = cp;
  835         }
  836         my_lgrp->lgrp_cpucnt++;
  837 }
  838 
  839 lgrp_t *
  840 lgrp_create(void)
  841 {
  842         lgrp_t          *my_lgrp;
  843         lgrp_id_t       lgrpid;
  844         int             i;
  845 
  846         ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
  847 
  848         /*
  849          * Find an open slot in the lgroup table and recycle unused lgroup
  850          * left there if any
  851          */
  852         my_lgrp = NULL;
  853         if (lgrp_alloc_hint == -1)
  854                 /*
  855                  * Allocate from end when hint not set yet because no lgroups
  856                  * have been deleted yet
  857                  */
  858                 lgrpid = nlgrps++;
  859         else {
  860                 /*
  861                  * Start looking for next open slot from hint and leave hint
  862                  * at slot allocated
  863                  */
  864                 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
  865                         my_lgrp = lgrp_table[i];
  866                         if (!LGRP_EXISTS(my_lgrp)) {
  867                                 lgrpid = i;
  868                                 nlgrps++;
  869                                 break;
  870                         }
  871                 }
  872                 lgrp_alloc_hint = lgrpid;
  873         }
  874 
  875         /*
  876          * Keep track of max lgroup ID allocated so far to cut down on searches
  877          */
  878         if (lgrpid > lgrp_alloc_max)
  879                 lgrp_alloc_max = lgrpid;
  880 
  881         /*
  882          * Need to allocate new lgroup if next open slot didn't have one
  883          * for recycling
  884          */
  885         if (my_lgrp == NULL)
  886                 my_lgrp = lgrp_plat_alloc(lgrpid);
  887 
  888         if (nlgrps > nlgrpsmax || my_lgrp == NULL)
  889                 panic("Too many lgrps for platform (%d)", nlgrps);
  890 
  891         my_lgrp->lgrp_id = lgrpid;
  892         my_lgrp->lgrp_latency = 0;
  893         my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
  894         my_lgrp->lgrp_parent = NULL;
  895         my_lgrp->lgrp_childcnt = 0;
  896         my_lgrp->lgrp_mnodes = (mnodeset_t)0;
  897         my_lgrp->lgrp_nmnodes = 0;
  898         klgrpset_clear(my_lgrp->lgrp_children);
  899         klgrpset_clear(my_lgrp->lgrp_leaves);
  900         for (i = 0; i < LGRP_RSRC_COUNT; i++)
  901                 klgrpset_clear(my_lgrp->lgrp_set[i]);
  902 
  903         my_lgrp->lgrp_cpu = NULL;
  904         my_lgrp->lgrp_cpucnt = 0;
  905 
  906         if (my_lgrp->lgrp_kstat != NULL)
  907                 lgrp_kstat_reset(lgrpid);
  908 
  909         lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
  910 
  911         return (my_lgrp);
  912 }
  913 
  914 void
  915 lgrp_destroy(lgrp_t *lgrp)
  916 {
  917         int             i;
  918 
  919         /*
  920          * Unless this lgroup is being destroyed on behalf of
  921          * the boot CPU, cpu_lock must be held
  922          */
  923         ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
  924 
  925         if (nlgrps == 1)
  926                 cmn_err(CE_PANIC, "Can't destroy only lgroup!");
  927 
  928         if (!LGRP_EXISTS(lgrp))
  929                 return;
  930 
  931         /*
  932          * Set hint to lgroup being deleted and try to keep lower numbered
  933          * hints to facilitate finding empty slots
  934          */
  935         if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
  936                 lgrp_alloc_hint = lgrp->lgrp_id;
  937 
  938         /*
  939          * Mark this lgroup to be recycled by setting its lgroup ID to
  940          * LGRP_NONE and clear relevant fields
  941          */
  942         lgrp->lgrp_id = LGRP_NONE;
  943         lgrp->lgrp_latency = 0;
  944         lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
  945         lgrp->lgrp_parent = NULL;
  946         lgrp->lgrp_childcnt = 0;
  947 
  948         klgrpset_clear(lgrp->lgrp_children);
  949         klgrpset_clear(lgrp->lgrp_leaves);
  950         for (i = 0; i < LGRP_RSRC_COUNT; i++)
  951                 klgrpset_clear(lgrp->lgrp_set[i]);
  952 
  953         lgrp->lgrp_mnodes = (mnodeset_t)0;
  954         lgrp->lgrp_nmnodes = 0;
  955 
  956         lgrp->lgrp_cpu = NULL;
  957         lgrp->lgrp_cpucnt = 0;
  958 
  959         nlgrps--;
  960 }
  961 
  962 /*
  963  * Initialize kstat data. Called from lgrp intialization code.
  964  */
  965 static void
  966 lgrp_kstat_init(void)
  967 {
  968         lgrp_stat_t     stat;
  969 
  970         mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
  971 
  972         for (stat = 0; stat < LGRP_NUM_STATS; stat++)
  973                 kstat_named_init(&lgrp_kstat_data[stat],
  974                     lgrp_kstat_names[stat], KSTAT_DATA_INT64);
  975 }
  976 
  977 /*
  978  * initialize an lgrp's kstats if needed
  979  * called with cpu_lock held but not with cpus paused.
  980  * we don't tear these down now because we don't know about
  981  * memory leaving the lgrp yet...
  982  */
  983 
  984 void
  985 lgrp_kstat_create(cpu_t *cp)
  986 {
  987         kstat_t         *lgrp_kstat;
  988         lgrp_id_t       lgrpid;
  989         lgrp_t          *my_lgrp;
  990 
  991         ASSERT(MUTEX_HELD(&cpu_lock));
  992 
  993         lgrpid = cp->cpu_lpl->lpl_lgrpid;
  994         my_lgrp = lgrp_table[lgrpid];
  995 
  996         if (my_lgrp->lgrp_kstat != NULL)
  997                 return; /* already initialized */
  998 
  999         lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
 1000             KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
 1001             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
 1002 
 1003         if (lgrp_kstat != NULL) {
 1004                 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
 1005                 lgrp_kstat->ks_private = my_lgrp;
 1006                 lgrp_kstat->ks_data = &lgrp_kstat_data;
 1007                 lgrp_kstat->ks_update = lgrp_kstat_extract;
 1008                 my_lgrp->lgrp_kstat = lgrp_kstat;
 1009                 kstat_install(lgrp_kstat);
 1010         }
 1011 }
 1012 
 1013 /*
 1014  * this will do something when we manage to remove now unused lgrps
 1015  */
 1016 
 1017 /* ARGSUSED */
 1018 void
 1019 lgrp_kstat_destroy(cpu_t *cp)
 1020 {
 1021         ASSERT(MUTEX_HELD(&cpu_lock));
 1022 }
 1023 
 1024 /*
 1025  * Called when a CPU is off-lined.
 1026  */
 1027 static void
 1028 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
 1029 {
 1030         lgrp_t *my_lgrp;
 1031         struct cpu *prev;
 1032         struct cpu *next;
 1033 
 1034         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
 1035 
 1036         prev = cp->cpu_prev_lgrp;
 1037         next = cp->cpu_next_lgrp;
 1038 
 1039         prev->cpu_next_lgrp = next;
 1040         next->cpu_prev_lgrp = prev;
 1041 
 1042         /*
 1043          * just because I'm paranoid doesn't mean...
 1044          */
 1045 
 1046         cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
 1047 
 1048         my_lgrp = lgrp_table[lgrpid];
 1049         my_lgrp->lgrp_cpucnt--;
 1050 
 1051         /*
 1052          * Removing last CPU in lgroup, so update lgroup topology
 1053          */
 1054         if (my_lgrp->lgrp_cpucnt == 0) {
 1055                 klgrpset_t      changed;
 1056                 int             count;
 1057                 int             i;
 1058 
 1059                 my_lgrp->lgrp_cpu = NULL;
 1060 
 1061                 /*
 1062                  * Remove this lgroup from its lgroup CPU resources and remove
 1063                  * lgroup from lgroup topology if it doesn't have any more
 1064                  * resources in it now
 1065                  */
 1066                 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 1067                 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
 1068                         count = 0;
 1069                         klgrpset_clear(changed);
 1070                         count += lgrp_leaf_delete(my_lgrp, lgrp_table,
 1071                             lgrp_alloc_max + 1, &changed);
 1072                         return;
 1073                 }
 1074 
 1075                 /*
 1076                  * This lgroup isn't empty, so just remove it from CPU
 1077                  * resources of any lgroups that contain it as such
 1078                  */
 1079                 for (i = 0; i <= lgrp_alloc_max; i++) {
 1080                         lgrp_t          *lgrp;
 1081 
 1082                         lgrp = lgrp_table[i];
 1083                         if (!LGRP_EXISTS(lgrp) ||
 1084                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
 1085                             lgrpid))
 1086                                 continue;
 1087 
 1088                         klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
 1089                 }
 1090                 return;
 1091         }
 1092 
 1093         if (my_lgrp->lgrp_cpu == cp)
 1094                 my_lgrp->lgrp_cpu = next;
 1095 
 1096 }
 1097 
 1098 /*
 1099  * Update memory nodes in target lgroups and return ones that get changed
 1100  */
 1101 int
 1102 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
 1103 {
 1104         int     count;
 1105         int     i;
 1106         int     j;
 1107         lgrp_t  *lgrp;
 1108         lgrp_t  *lgrp_rsrc;
 1109 
 1110         count = 0;
 1111         if (changed)
 1112                 klgrpset_clear(*changed);
 1113 
 1114         if (klgrpset_isempty(target))
 1115                 return (0);
 1116 
 1117         /*
 1118          * Find each lgroup in target lgroups
 1119          */
 1120         for (i = 0; i <= lgrp_alloc_max; i++) {
 1121                 /*
 1122                  * Skip any lgroups that don't exist or aren't in target group
 1123                  */
 1124                 lgrp = lgrp_table[i];
 1125                 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
 1126                         continue;
 1127                 }
 1128 
 1129                 /*
 1130                  * Initialize memnodes for intermediate lgroups to 0
 1131                  * and update them from scratch since they may have completely
 1132                  * changed
 1133                  */
 1134                 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
 1135                         lgrp->lgrp_mnodes = (mnodeset_t)0;
 1136                         lgrp->lgrp_nmnodes = 0;
 1137                 }
 1138 
 1139                 /*
 1140                  * Update memory nodes of of target lgroup with memory nodes
 1141                  * from each lgroup in its lgroup memory resource set
 1142                  */
 1143                 for (j = 0; j <= lgrp_alloc_max; j++) {
 1144                         int     k;
 1145 
 1146                         /*
 1147                          * Skip any lgroups that don't exist or aren't in
 1148                          * memory resources of target lgroup
 1149                          */
 1150                         lgrp_rsrc = lgrp_table[j];
 1151                         if (!LGRP_EXISTS(lgrp_rsrc) ||
 1152                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
 1153                             j))
 1154                                 continue;
 1155 
 1156                         /*
 1157                          * Update target lgroup's memnodes to include memnodes
 1158                          * of this lgroup
 1159                          */
 1160                         for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
 1161                                 mnodeset_t      mnode_mask;
 1162 
 1163                                 mnode_mask = (mnodeset_t)1 << k;
 1164                                 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
 1165                                     !(lgrp->lgrp_mnodes & mnode_mask)) {
 1166                                         lgrp->lgrp_mnodes |= mnode_mask;
 1167                                         lgrp->lgrp_nmnodes++;
 1168                                 }
 1169                         }
 1170                         count++;
 1171                         if (changed)
 1172                                 klgrpset_add(*changed, lgrp->lgrp_id);
 1173                 }
 1174         }
 1175 
 1176         return (count);
 1177 }
 1178 
 1179 /*
 1180  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 1181  * is moved from one board to another. The "from" and "to" arguments specify the
 1182  * source and the destination of the move.
 1183  *
 1184  * See plat_lgrp_config() for a detailed description of the copy-rename
 1185  * semantics.
 1186  *
 1187  * The lgrp_mem_rename() is called by the platform copy-rename code to update
 1188  * the lgroup topology which is changing as memory moves from one lgroup to
 1189  * another. It removes the mnode from the source lgroup and re-inserts it in the
 1190  * target lgroup.
 1191  *
 1192  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 1193  * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR
 1194  * copy-rename operation.
 1195  *
 1196  * There is one case which requires special handling. If the system contains
 1197  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 1198  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 1199  * lgrp_mem_init), but there is a window when the system has no memory in the
 1200  * lgroup hierarchy. If another thread tries to allocate memory during this
 1201  * window, the allocation will fail, although the system has physical memory.
 1202  * This may cause a system panic or a deadlock (some sleeping memory allocations
 1203  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 1204  * the mnode back).
 1205  *
 1206  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 1207  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 1208  * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes,
 1209  * but it updates the rest of the lgroup topology as if the mnode was actually
 1210  * removed. The lgrp_mem_init() function recognizes that the mnode being
 1211  * inserted represents such a special case and updates the topology
 1212  * appropriately.
 1213  */
 1214 void
 1215 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
 1216 {
 1217         /*
 1218          * Remove the memory from the source node and add it to the destination
 1219          * node.
 1220          */
 1221         lgrp_mem_fini(mnode, from, B_TRUE);
 1222         lgrp_mem_init(mnode, to, B_TRUE);
 1223 }
 1224 
 1225 /*
 1226  * Called to indicate that the lgrp with platform handle "hand" now
 1227  * contains the memory identified by "mnode".
 1228  *
 1229  * LOCKING for this routine is a bit tricky. Usually it is called without
 1230  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
 1231  * callers. During DR of the board containing the caged memory it may be called
 1232  * with cpu_lock already held and CPUs paused.
 1233  *
 1234  * If the insertion is part of the DR copy-rename and the inserted mnode (and
 1235  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 1236  * dealing with the special case of DR copy-rename described in
 1237  * lgrp_mem_rename().
 1238  */
 1239 void
 1240 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
 1241 {
 1242         klgrpset_t      changed;
 1243         int             count;
 1244         int             i;
 1245         lgrp_t          *my_lgrp;
 1246         lgrp_id_t       lgrpid;
 1247         mnodeset_t      mnodes_mask = ((mnodeset_t)1 << mnode);
 1248         boolean_t       drop_lock = B_FALSE;
 1249         boolean_t       need_synch = B_FALSE;
 1250 
 1251         /*
 1252          * Grab CPU lock (if we haven't already)
 1253          */
 1254         if (!MUTEX_HELD(&cpu_lock)) {
 1255                 mutex_enter(&cpu_lock);
 1256                 drop_lock = B_TRUE;
 1257         }
 1258 
 1259         /*
 1260          * This routine may be called from a context where we already
 1261          * hold cpu_lock, and have already paused cpus.
 1262          */
 1263         if (!cpus_paused())
 1264                 need_synch = B_TRUE;
 1265 
 1266         /*
 1267          * Check if this mnode is already configured and return immediately if
 1268          * it is.
 1269          *
 1270          * NOTE: in special case of copy-rename of the only remaining mnode,
 1271          * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
 1272          * recognize this case and continue as usual, but skip the update to
 1273          * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
 1274          * in topology, temporarily introduced by lgrp_mem_fini().
 1275          */
 1276         if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
 1277             lgrp_root->lgrp_mnodes & mnodes_mask) {
 1278                 if (drop_lock)
 1279                         mutex_exit(&cpu_lock);
 1280                 return;
 1281         }
 1282 
 1283         /*
 1284          * Update lgroup topology with new memory resources, keeping track of
 1285          * which lgroups change
 1286          */
 1287         count = 0;
 1288         klgrpset_clear(changed);
 1289         my_lgrp = lgrp_hand_to_lgrp(hand);
 1290         if (my_lgrp == NULL) {
 1291                 /* new lgrp */
 1292                 my_lgrp = lgrp_create();
 1293                 lgrpid = my_lgrp->lgrp_id;
 1294                 my_lgrp->lgrp_plathand = hand;
 1295                 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
 1296                 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
 1297                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
 1298 
 1299                 if (need_synch)
 1300                         pause_cpus(NULL);
 1301                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
 1302                     &changed);
 1303                 if (need_synch)
 1304                         start_cpus();
 1305         } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
 1306             > 0) {
 1307                 /*
 1308                  * Leaf lgroup was created, but latency wasn't available
 1309                  * then.  So, set latency for it and fill in rest of lgroup
 1310                  * topology  now that we know how far it is from other leaf
 1311                  * lgroups.
 1312                  */
 1313                 klgrpset_clear(changed);
 1314                 lgrpid = my_lgrp->lgrp_id;
 1315                 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
 1316                     lgrpid))
 1317                         klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
 1318                 if (need_synch)
 1319                         pause_cpus(NULL);
 1320                 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
 1321                     &changed);
 1322                 if (need_synch)
 1323                         start_cpus();
 1324         } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
 1325             my_lgrp->lgrp_id)) {
 1326                 /*
 1327                  * Add new lgroup memory resource to existing lgroup
 1328                  */
 1329                 lgrpid = my_lgrp->lgrp_id;
 1330                 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
 1331                 klgrpset_add(changed, lgrpid);
 1332                 count++;
 1333                 for (i = 0; i <= lgrp_alloc_max; i++) {
 1334                         lgrp_t          *lgrp;
 1335 
 1336                         lgrp = lgrp_table[i];
 1337                         if (!LGRP_EXISTS(lgrp) ||
 1338                             !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
 1339                                 continue;
 1340 
 1341                         klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
 1342                         klgrpset_add(changed, lgrp->lgrp_id);
 1343                         count++;
 1344                 }
 1345         }
 1346 
 1347         /*
 1348          * Add memory node to lgroup and remove lgroup from ones that need
 1349          * to be updated
 1350          */
 1351         if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
 1352                 my_lgrp->lgrp_mnodes |= mnodes_mask;
 1353                 my_lgrp->lgrp_nmnodes++;
 1354         }
 1355         klgrpset_del(changed, lgrpid);
 1356 
 1357         /*
 1358          * Update memory node information for all lgroups that changed and
 1359          * contain new memory node as a resource
 1360          */
 1361         if (count)
 1362                 (void) lgrp_mnode_update(changed, NULL);
 1363 
 1364         if (drop_lock)
 1365                 mutex_exit(&cpu_lock);
 1366 }
 1367 
 1368 /*
 1369  * Called to indicate that the lgroup associated with the platform
 1370  * handle "hand" no longer contains given memory node
 1371  *
 1372  * LOCKING for this routine is a bit tricky. Usually it is called without
 1373  * cpu_lock and it must must grab cpu_lock here to prevent racing with other
 1374  * callers. During DR of the board containing the caged memory it may be called
 1375  * with cpu_lock already held and CPUs paused.
 1376  *
 1377  * If the deletion is part of the DR copy-rename and the deleted mnode is the
 1378  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 1379  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 1380  * the same mnode back into the topology. See lgrp_mem_rename() and
 1381  * lgrp_mem_init() for additional details.
 1382  */
 1383 void
 1384 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
 1385 {
 1386         klgrpset_t      changed;
 1387         int             count;
 1388         int             i;
 1389         lgrp_t          *my_lgrp;
 1390         lgrp_id_t       lgrpid;
 1391         mnodeset_t      mnodes_mask;
 1392         boolean_t       drop_lock = B_FALSE;
 1393         boolean_t       need_synch = B_FALSE;
 1394 
 1395         /*
 1396          * Grab CPU lock (if we haven't already)
 1397          */
 1398         if (!MUTEX_HELD(&cpu_lock)) {
 1399                 mutex_enter(&cpu_lock);
 1400                 drop_lock = B_TRUE;
 1401         }
 1402 
 1403         /*
 1404          * This routine may be called from a context where we already
 1405          * hold cpu_lock and have already paused cpus.
 1406          */
 1407         if (!cpus_paused())
 1408                 need_synch = B_TRUE;
 1409 
 1410         my_lgrp = lgrp_hand_to_lgrp(hand);
 1411 
 1412         /*
 1413          * The lgrp *must* be pre-existing
 1414          */
 1415         ASSERT(my_lgrp != NULL);
 1416 
 1417         /*
 1418          * Delete memory node from lgroups which contain it
 1419          */
 1420         mnodes_mask = ((mnodeset_t)1 << mnode);
 1421         for (i = 0; i <= lgrp_alloc_max; i++) {
 1422                 lgrp_t *lgrp = lgrp_table[i];
 1423                 /*
 1424                  * Skip any non-existent lgroups and any lgroups that don't
 1425                  * contain leaf lgroup of memory as a memory resource
 1426                  */
 1427                 if (!LGRP_EXISTS(lgrp) ||
 1428                     !(lgrp->lgrp_mnodes & mnodes_mask))
 1429                         continue;
 1430 
 1431                 /*
 1432                  * Avoid removing the last mnode from the root in the DR
 1433                  * copy-rename case. See lgrp_mem_rename() for details.
 1434                  */
 1435                 if (is_copy_rename &&
 1436                     (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
 1437                         continue;
 1438 
 1439                 /*
 1440                  * Remove memory node from lgroup.
 1441                  */
 1442                 lgrp->lgrp_mnodes &= ~mnodes_mask;
 1443                 lgrp->lgrp_nmnodes--;
 1444                 ASSERT(lgrp->lgrp_nmnodes >= 0);
 1445         }
 1446         ASSERT(lgrp_root->lgrp_nmnodes > 0);
 1447 
 1448         /*
 1449          * Don't need to update lgroup topology if this lgroup still has memory.
 1450          *
 1451          * In the special case of DR copy-rename with the only mnode being
 1452          * removed, the lgrp_mnodes for the root is always non-zero, but we
 1453          * still need to update the lgroup topology.
 1454          */
 1455         if ((my_lgrp->lgrp_nmnodes > 0) &&
 1456             !(is_copy_rename && (my_lgrp == lgrp_root) &&
 1457             (my_lgrp->lgrp_mnodes == mnodes_mask))) {
 1458                 if (drop_lock)
 1459                         mutex_exit(&cpu_lock);
 1460                 return;
 1461         }
 1462 
 1463         /*
 1464          * This lgroup does not contain any memory now
 1465          */
 1466         klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
 1467 
 1468         /*
 1469          * Remove this lgroup from lgroup topology if it does not contain any
 1470          * resources now
 1471          */
 1472         lgrpid = my_lgrp->lgrp_id;
 1473         count = 0;
 1474         klgrpset_clear(changed);
 1475         if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
 1476                 /*
 1477                  * Delete lgroup when no more resources
 1478                  */
 1479                 if (need_synch)
 1480                         pause_cpus(NULL);
 1481                 count = lgrp_leaf_delete(my_lgrp, lgrp_table,
 1482                     lgrp_alloc_max + 1, &changed);
 1483                 ASSERT(count > 0);
 1484                 if (need_synch)
 1485                         start_cpus();
 1486         } else {
 1487                 /*
 1488                  * Remove lgroup from memory resources of any lgroups that
 1489                  * contain it as such
 1490                  */
 1491                 for (i = 0; i <= lgrp_alloc_max; i++) {
 1492                         lgrp_t          *lgrp;
 1493 
 1494                         lgrp = lgrp_table[i];
 1495                         if (!LGRP_EXISTS(lgrp) ||
 1496                             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
 1497                             lgrpid))
 1498                                 continue;
 1499 
 1500                         klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
 1501                 }
 1502         }
 1503         if (drop_lock)
 1504                 mutex_exit(&cpu_lock);
 1505 }
 1506 
 1507 /*
 1508  * Return lgroup with given platform handle
 1509  */
 1510 lgrp_t *
 1511 lgrp_hand_to_lgrp(lgrp_handle_t hand)
 1512 {
 1513         int     i;
 1514         lgrp_t  *lgrp;
 1515 
 1516         if (hand == LGRP_NULL_HANDLE)
 1517                 return (NULL);
 1518 
 1519         for (i = 0; i <= lgrp_alloc_max; i++) {
 1520                 lgrp = lgrp_table[i];
 1521                 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
 1522                         return (lgrp);
 1523         }
 1524         return (NULL);
 1525 }
 1526 
 1527 /*
 1528  * Return the home lgroup of the current thread.
 1529  * We must do this with kernel preemption disabled, since we don't want our
 1530  * thread to be re-homed while we're poking around with its lpl, and the lpl
 1531  * should never be NULL.
 1532  *
 1533  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
 1534  * is enabled because of DR.  Callers can use disable kernel preemption
 1535  * around this call to guarantee that the lgroup will be valid beyond this
 1536  * routine, since kernel preemption can be recursive.
 1537  */
 1538 lgrp_t *
 1539 lgrp_home_lgrp(void)
 1540 {
 1541         lgrp_t  *lgrp;
 1542         lpl_t   *lpl;
 1543 
 1544         kpreempt_disable();
 1545 
 1546         lpl = curthread->t_lpl;
 1547         ASSERT(lpl != NULL);
 1548         ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
 1549         ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
 1550         lgrp = lgrp_table[lpl->lpl_lgrpid];
 1551 
 1552         kpreempt_enable();
 1553 
 1554         return (lgrp);
 1555 }
 1556 
 1557 /*
 1558  * Return ID of home lgroup for given thread
 1559  * (See comments for lgrp_home_lgrp() for special care and handling
 1560  * instructions)
 1561  */
 1562 lgrp_id_t
 1563 lgrp_home_id(kthread_t *t)
 1564 {
 1565         lgrp_id_t       lgrp;
 1566         lpl_t           *lpl;
 1567 
 1568         ASSERT(t != NULL);
 1569         /*
 1570          * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
 1571          * cannot since the HAT layer can call into this routine to
 1572          * determine the locality for its data structures in the context
 1573          * of a page fault.
 1574          */
 1575 
 1576         kpreempt_disable();
 1577 
 1578         lpl = t->t_lpl;
 1579         ASSERT(lpl != NULL);
 1580         ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
 1581         lgrp = lpl->lpl_lgrpid;
 1582 
 1583         kpreempt_enable();
 1584 
 1585         return (lgrp);
 1586 }
 1587 
 1588 /*
 1589  * Return lgroup containing the physical memory for the given page frame number
 1590  */
 1591 lgrp_t *
 1592 lgrp_pfn_to_lgrp(pfn_t pfn)
 1593 {
 1594         lgrp_handle_t   hand;
 1595         int             i;
 1596         lgrp_t          *lgrp;
 1597 
 1598         hand = lgrp_plat_pfn_to_hand(pfn);
 1599         if (hand != LGRP_NULL_HANDLE)
 1600                 for (i = 0; i <= lgrp_alloc_max; i++) {
 1601                         lgrp = lgrp_table[i];
 1602                         if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
 1603                                 return (lgrp);
 1604                 }
 1605         return (NULL);
 1606 }
 1607 
 1608 /*
 1609  * Return lgroup containing the physical memory for the given page frame number
 1610  */
 1611 lgrp_t *
 1612 lgrp_phys_to_lgrp(u_longlong_t physaddr)
 1613 {
 1614         lgrp_handle_t   hand;
 1615         int             i;
 1616         lgrp_t          *lgrp;
 1617         pfn_t           pfn;
 1618 
 1619         pfn = btop(physaddr);
 1620         hand = lgrp_plat_pfn_to_hand(pfn);
 1621         if (hand != LGRP_NULL_HANDLE)
 1622                 for (i = 0; i <= lgrp_alloc_max; i++) {
 1623                         lgrp = lgrp_table[i];
 1624                         if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
 1625                                 return (lgrp);
 1626                 }
 1627         return (NULL);
 1628 }
 1629 
 1630 /*
 1631  * Return the leaf lgroup containing the given CPU
 1632  *
 1633  * The caller needs to take precautions necessary to prevent
 1634  * "cpu", and it's lpl from going away across a call to this function.
 1635  * hint: kpreempt_disable()/kpreempt_enable()
 1636  */
 1637 static lgrp_t *
 1638 lgrp_cpu_to_lgrp(cpu_t *cpu)
 1639 {
 1640         return (cpu->cpu_lpl->lpl_lgrp);
 1641 }
 1642 
 1643 /*
 1644  * Return the sum of the partition loads in an lgrp divided by
 1645  * the number of CPUs in the lgrp.  This is our best approximation
 1646  * of an 'lgroup load average' for a useful per-lgroup kstat.
 1647  */
 1648 static uint64_t
 1649 lgrp_sum_loadavgs(lgrp_t *lgrp)
 1650 {
 1651         cpu_t *cpu;
 1652         int ncpu;
 1653         uint64_t loads = 0;
 1654 
 1655         mutex_enter(&cpu_lock);
 1656 
 1657         cpu = lgrp->lgrp_cpu;
 1658         ncpu = lgrp->lgrp_cpucnt;
 1659 
 1660         if (cpu == NULL || ncpu == 0) {
 1661                 mutex_exit(&cpu_lock);
 1662                 return (0ull);
 1663         }
 1664 
 1665         do {
 1666                 loads += cpu->cpu_lpl->lpl_loadavg;
 1667                 cpu = cpu->cpu_next_lgrp;
 1668         } while (cpu != lgrp->lgrp_cpu);
 1669 
 1670         mutex_exit(&cpu_lock);
 1671 
 1672         return (loads / ncpu);
 1673 }
 1674 
 1675 void
 1676 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
 1677 {
 1678         struct lgrp_stats *pstats;
 1679 
 1680         /*
 1681          * Verify that the caller isn't trying to add to
 1682          * a statistic for an lgroup that has gone away
 1683          */
 1684         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
 1685                 return;
 1686 
 1687         pstats = &lgrp_stats[lgrpid];
 1688         atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
 1689 }
 1690 
 1691 int64_t
 1692 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
 1693 {
 1694         uint64_t val;
 1695         struct lgrp_stats *pstats;
 1696 
 1697         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
 1698                 return ((int64_t)0);
 1699 
 1700         pstats = &lgrp_stats[lgrpid];
 1701         LGRP_STAT_READ(pstats, stat, val);
 1702         return (val);
 1703 }
 1704 
 1705 /*
 1706  * Reset all kstats for lgrp specified by its lgrpid.
 1707  */
 1708 static void
 1709 lgrp_kstat_reset(lgrp_id_t lgrpid)
 1710 {
 1711         lgrp_stat_t stat;
 1712 
 1713         if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
 1714                 return;
 1715 
 1716         for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
 1717                 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
 1718         }
 1719 }
 1720 
 1721 /*
 1722  * Collect all per-lgrp statistics for the lgrp associated with this
 1723  * kstat, and store them in the ks_data array.
 1724  *
 1725  * The superuser can reset all the running counter statistics for an
 1726  * lgrp by writing to any of the lgrp's stats.
 1727  */
 1728 static int
 1729 lgrp_kstat_extract(kstat_t *ksp, int rw)
 1730 {
 1731         lgrp_stat_t             stat;
 1732         struct kstat_named      *ksd;
 1733         lgrp_t                  *lgrp;
 1734         lgrp_id_t               lgrpid;
 1735 
 1736         lgrp = (lgrp_t *)ksp->ks_private;
 1737 
 1738         ksd = (struct kstat_named *)ksp->ks_data;
 1739         ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
 1740 
 1741         lgrpid = lgrp->lgrp_id;
 1742 
 1743         if (lgrpid == LGRP_NONE) {
 1744                 /*
 1745                  * Return all zeroes as stats for freed lgrp.
 1746                  */
 1747                 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
 1748                         ksd[stat].value.i64 = 0;
 1749                 }
 1750                 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
 1751                 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
 1752                 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
 1753                 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
 1754                 ksd[stat + LGRP_LOADAVG].value.i64 = 0;
 1755         } else if (rw != KSTAT_WRITE) {
 1756                 /*
 1757                  * Handle counter stats
 1758                  */
 1759                 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
 1760                         ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
 1761                 }
 1762 
 1763                 /*
 1764                  * Handle kernel data snapshot stats
 1765                  */
 1766                 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
 1767                 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
 1768                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
 1769                 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
 1770                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
 1771                 ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
 1772                     lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 1773                 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
 1774                 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
 1775                     lgrp_loadavg_max_effect;
 1776         } else {
 1777                 lgrp_kstat_reset(lgrpid);
 1778         }
 1779 
 1780         return (0);
 1781 }
 1782 
 1783 int
 1784 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
 1785 {
 1786         cpu_t   *cp;
 1787 
 1788         mutex_enter(&cpu_lock);
 1789 
 1790         if ((cp = cpu_get(id)) == NULL) {
 1791                 mutex_exit(&cpu_lock);
 1792                 return (EINVAL);
 1793         }
 1794 
 1795         if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
 1796                 mutex_exit(&cpu_lock);
 1797                 return (EINVAL);
 1798         }
 1799 
 1800         ASSERT(cp->cpu_lpl != NULL);
 1801 
 1802         *lp = cp->cpu_lpl->lpl_lgrpid;
 1803 
 1804         mutex_exit(&cpu_lock);
 1805 
 1806         return (0);
 1807 }
 1808 
 1809 int
 1810 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
 1811 {
 1812         cpu_t *cp;
 1813 
 1814         mutex_enter(&cpu_lock);
 1815 
 1816         if ((cp = cpu_get(id)) == NULL) {
 1817                 mutex_exit(&cpu_lock);
 1818                 return (EINVAL);
 1819         }
 1820 
 1821         ASSERT(cp->cpu_lpl != NULL);
 1822 
 1823         *lp = cp->cpu_lpl->lpl_loadavg;
 1824 
 1825         mutex_exit(&cpu_lock);
 1826 
 1827         return (0);
 1828 }
 1829 
 1830 /*
 1831  * Add a resource named by lpl_leaf to rset of lpl_target
 1832  *
 1833  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 1834  * resource. It is adjusted here, as this is presently the only place that we
 1835  * can be certain a resource addition has succeeded.
 1836  *
 1837  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 1838  * list in order until it reaches a NULL.  (This list is required to be NULL
 1839  * terminated, too).  This is done so that we can mark start pos + 1, so that
 1840  * each lpl is traversed sequentially, but in a different order.  We hope this
 1841  * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
 1842  */
 1843 
 1844 void
 1845 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
 1846 {
 1847         int             i;
 1848         int             entry_slot = 0;
 1849 
 1850         /* return if leaf is already present */
 1851         for (i = 0; i < lpl_target->lpl_nrset; i++) {
 1852                 if (lpl_target->lpl_rset[i] == lpl_leaf) {
 1853                         return;
 1854                 }
 1855 
 1856                 if (lpl_target->lpl_rset[i]->lpl_lgrpid >
 1857                     lpl_leaf->lpl_lgrpid) {
 1858                         break;
 1859                 }
 1860         }
 1861 
 1862         /* insert leaf, update counts */
 1863         entry_slot = i;
 1864         i = lpl_target->lpl_nrset++;
 1865 
 1866         /*
 1867          * Start at the end of the rset array and work backwards towards the
 1868          * slot into which the new lpl will be inserted. This effectively
 1869          * preserves the current ordering by scooting everybody over one entry,
 1870          * and placing the new entry into the space created.
 1871          */
 1872         while (i-- > entry_slot) {
 1873                 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
 1874                 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
 1875                     i + 1;
 1876         }
 1877 
 1878         lpl_target->lpl_rset[entry_slot] = lpl_leaf;
 1879         lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;
 1880 
 1881         lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
 1882 }
 1883 
 1884 /*
 1885  * Update each of lpl_parent's children with a reference to their parent.
 1886  * The lgrp topology is used as the reference since it is fully
 1887  * consistent and correct at this point.
 1888  * This should be called after any potential change in lpl_parent's
 1889  * rset.
 1890  */
 1891 static void
 1892 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
 1893 {
 1894         klgrpset_t      children;
 1895         int             i;
 1896 
 1897         children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
 1898         if (klgrpset_isempty(children))
 1899                 return; /* nothing to do */
 1900 
 1901         for (i = 0; i <= lgrp_alloc_max; i++) {
 1902                 if (klgrpset_ismember(children, i)) {
 1903                         /*
 1904                          * (Re)set the parent. It may be incorrect if
 1905                          * lpl_parent is new in the topology.
 1906                          */
 1907                         cp->cp_lgrploads[i].lpl_parent = lpl_parent;
 1908                 }
 1909         }
 1910 }
 1911 
 1912 /*
 1913  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 1914  *
 1915  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 1916  * resource. The values are adjusted here, as this is the only place that we can
 1917  * be certain a resource was successfully deleted.
 1918  */
 1919 void
 1920 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
 1921 {
 1922         int i;
 1923         lpl_t *leaf;
 1924 
 1925         if (lpl_target->lpl_nrset == 0)
 1926                 return;
 1927 
 1928         /* find leaf in intermediate node */
 1929         for (i = 0; i < lpl_target->lpl_nrset; i++) {
 1930                 if (lpl_target->lpl_rset[i] == lpl_leaf)
 1931                         break;
 1932         }
 1933 
 1934         /* return if leaf not found */
 1935         if (lpl_target->lpl_rset[i] != lpl_leaf)
 1936                 return;
 1937 
 1938         /* prune leaf, compress array */
 1939         lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
 1940         lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
 1941         lpl_target->lpl_ncpu--;
 1942         do {
 1943                 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
 1944                 /*
 1945                  * Update the lgrp id <=> rset mapping
 1946                  */
 1947                 if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
 1948                         lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
 1949                 }
 1950         } while (i++ < lpl_target->lpl_nrset);
 1951 }
 1952 
 1953 /*
 1954  * Check to see if the resource set of the target lpl contains the
 1955  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
 1956  */
 1957 
 1958 int
 1959 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
 1960 {
 1961         int i;
 1962 
 1963         for (i = 0; i < lpl_target->lpl_nrset; i++) {
 1964                 if (lpl_target->lpl_rset[i] == lpl_leaf)
 1965                         return (1);
 1966         }
 1967 
 1968         return (0);
 1969 }
 1970 
 1971 /*
 1972  * Called when we change cpu lpl membership.  This increments or decrements the
 1973  * per-cpu counter in every lpl in which our leaf appears.
 1974  */
 1975 void
 1976 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
 1977 {
 1978         cpupart_t       *cpupart;
 1979         lgrp_t          *lgrp_leaf;
 1980         lgrp_t          *lgrp_cur;
 1981         lpl_t           *lpl_leaf;
 1982         lpl_t           *lpl_cur;
 1983         int             i;
 1984 
 1985         ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
 1986 
 1987         cpupart = cp->cpu_part;
 1988         lpl_leaf = cp->cpu_lpl;
 1989         lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
 1990 
 1991         for (i = 0; i <= lgrp_alloc_max; i++) {
 1992                 lgrp_cur = lgrp_table[i];
 1993 
 1994                 /*
 1995                  * Don't adjust if the lgrp isn't there, if we're the leaf lpl
 1996                  * for the cpu in question, or if the current lgrp and leaf
 1997                  * don't share the same resources.
 1998                  */
 1999 
 2000                 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
 2001                     !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
 2002                     lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
 2003                         continue;
 2004 
 2005 
 2006                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
 2007 
 2008                 if (lpl_cur->lpl_nrset > 0) {
 2009                         if (act == LPL_INCREMENT) {
 2010                                 lpl_cur->lpl_ncpu++;
 2011                         } else if (act == LPL_DECREMENT) {
 2012                                 lpl_cur->lpl_ncpu--;
 2013                         }
 2014                 }
 2015         }
 2016 }
 2017 
 2018 /*
 2019  * Initialize lpl with given resources and specified lgrp
 2020  */
 2021 void
 2022 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
 2023 {
 2024         lpl->lpl_lgrpid = lgrp->lgrp_id;
 2025         lpl->lpl_loadavg = 0;
 2026         if (lpl == lpl_leaf)
 2027                 lpl->lpl_ncpu = 1;
 2028         else
 2029                 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
 2030         lpl->lpl_nrset = 1;
 2031         lpl->lpl_rset[0] = lpl_leaf;
 2032         lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
 2033         lpl->lpl_lgrp = lgrp;
 2034         lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
 2035         lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
 2036 }
 2037 
 2038 /*
 2039  * Clear an unused lpl
 2040  */
 2041 void
 2042 lpl_clear(lpl_t *lpl)
 2043 {
 2044         /*
 2045          * Clear out all fields in the lpl except:
 2046          *    lpl_lgrpid - to facilitate debugging
 2047          *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
 2048          *
 2049          * Note that the lpl's rset and id2rset mapping are cleared as well.
 2050          */
 2051         lpl->lpl_loadavg = 0;
 2052         lpl->lpl_ncpu = 0;
 2053         lpl->lpl_lgrp = NULL;
 2054         lpl->lpl_parent = NULL;
 2055         lpl->lpl_cpus = NULL;
 2056         lpl->lpl_nrset = 0;
 2057         lpl->lpl_homed_time = 0;
 2058         bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
 2059         bzero(lpl->lpl_id2rset,
 2060             sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
 2061 }
 2062 
 2063 /*
 2064  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 2065  * is in sync with the lgroup toplogy in the system.  The lpl topology may not
 2066  * make full use of all of the lgroup topology, but this checks to make sure
 2067  * that for the parts that it does use, it has correctly understood the
 2068  * relationships that exist. This function returns
 2069  * 0 if the topology is correct, and a non-zero error code, for non-debug
 2070  * kernels if incorrect.  Asserts are spread throughout the code to aid in
 2071  * debugging on a DEBUG kernel.
 2072  */
 2073 int
 2074 lpl_topo_verify(cpupart_t *cpupart)
 2075 {
 2076         lgrp_t          *lgrp;
 2077         lpl_t           *lpl;
 2078         klgrpset_t      rset;
 2079         klgrpset_t      cset;
 2080         cpu_t           *cpu;
 2081         cpu_t           *cp_start;
 2082         int             i;
 2083         int             j;
 2084         int             sum;
 2085 
 2086         /* topology can't be incorrect if it doesn't exist */
 2087         if (!lgrp_topo_initialized || !lgrp_initialized)
 2088                 return (LPL_TOPO_CORRECT);
 2089 
 2090         ASSERT(cpupart != NULL);
 2091 
 2092         for (i = 0; i <= lgrp_alloc_max; i++) {
 2093                 lgrp = lgrp_table[i];
 2094                 lpl = NULL;
 2095                 /* make sure lpls are allocated */
 2096                 ASSERT(cpupart->cp_lgrploads);
 2097                 if (!cpupart->cp_lgrploads)
 2098                         return (LPL_TOPO_PART_HAS_NO_LPL);
 2099 
 2100                 lpl = &cpupart->cp_lgrploads[i];
 2101                 /* make sure our index is good */
 2102                 ASSERT(i < cpupart->cp_nlgrploads);
 2103 
 2104                 /* if lgroup doesn't exist, make sure lpl is empty */
 2105                 if (!LGRP_EXISTS(lgrp)) {
 2106                         ASSERT(lpl->lpl_ncpu == 0);
 2107                         if (lpl->lpl_ncpu > 0) {
 2108                                 return (LPL_TOPO_CPUS_NOT_EMPTY);
 2109                         } else {
 2110                                 continue;
 2111                         }
 2112                 }
 2113 
 2114                 /* verify that lgroup and lpl are identically numbered */
 2115                 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
 2116 
 2117                 /* if lgroup isn't in our partition, make sure lpl is empty */
 2118                 if (!klgrpset_intersects(lgrp->lgrp_leaves,
 2119                     cpupart->cp_lgrpset)) {
 2120                         ASSERT(lpl->lpl_ncpu == 0);
 2121                         if (lpl->lpl_ncpu > 0) {
 2122                                 return (LPL_TOPO_CPUS_NOT_EMPTY);
 2123                         }
 2124                         /*
 2125                          * lpl is empty, and lgroup isn't in partition.  verify
 2126                          * that lpl doesn't show up in anyone else's rsets (in
 2127                          * this partition, anyway)
 2128                          */
 2129                         for (j = 0; j < cpupart->cp_nlgrploads; j++) {
 2130                                 lpl_t *i_lpl; /* lpl we're iterating over */
 2131 
 2132                                 i_lpl = &cpupart->cp_lgrploads[j];
 2133 
 2134                                 ASSERT(!lpl_rset_contains(i_lpl, lpl));
 2135                                 if (lpl_rset_contains(i_lpl, lpl)) {
 2136                                         return (LPL_TOPO_LPL_ORPHANED);
 2137                                 }
 2138                         }
 2139                         /* lgroup is empty, and everything is ok. continue */
 2140                         continue;
 2141                 }
 2142 
 2143 
 2144                 /* lgroup is in this partition, now check it against lpl */
 2145 
 2146                 /* do both have matching lgrps? */
 2147                 ASSERT(lgrp == lpl->lpl_lgrp);
 2148                 if (lgrp != lpl->lpl_lgrp) {
 2149                         return (LPL_TOPO_LGRP_MISMATCH);
 2150                 }
 2151 
 2152                 /* do the parent lgroups exist and do they match? */
 2153                 if (lgrp->lgrp_parent) {
 2154                         ASSERT(lpl->lpl_parent);
 2155                         ASSERT(lgrp->lgrp_parent->lgrp_id ==
 2156                             lpl->lpl_parent->lpl_lgrpid);
 2157 
 2158                         if (!lpl->lpl_parent) {
 2159                                 return (LPL_TOPO_MISSING_PARENT);
 2160                         } else if (lgrp->lgrp_parent->lgrp_id !=
 2161                             lpl->lpl_parent->lpl_lgrpid) {
 2162                                 return (LPL_TOPO_PARENT_MISMATCH);
 2163                         }
 2164                 }
 2165 
 2166                 /* only leaf lgroups keep a cpucnt, only check leaves */
 2167                 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
 2168 
 2169                         /* verify that lgrp is also a leaf */
 2170                         ASSERT((lgrp->lgrp_childcnt == 0) &&
 2171                             (klgrpset_ismember(lgrp->lgrp_leaves,
 2172                             lpl->lpl_lgrpid)));
 2173 
 2174                         if ((lgrp->lgrp_childcnt > 0) ||
 2175                             (!klgrpset_ismember(lgrp->lgrp_leaves,
 2176                             lpl->lpl_lgrpid))) {
 2177                                 return (LPL_TOPO_LGRP_NOT_LEAF);
 2178                         }
 2179 
 2180                         ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
 2181                             (lpl->lpl_ncpu > 0));
 2182                         if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
 2183                             (lpl->lpl_ncpu <= 0)) {
 2184                                 return (LPL_TOPO_BAD_CPUCNT);
 2185                         }
 2186 
 2187                         /*
 2188                          * Check that lpl_ncpu also matches the number of
 2189                          * cpus in the lpl's linked list.  This only exists in
 2190                          * leaves, but they should always match.
 2191                          */
 2192                         j = 0;
 2193                         cpu = cp_start = lpl->lpl_cpus;
 2194                         while (cpu != NULL) {
 2195                                 j++;
 2196 
 2197                                 /* check to make sure cpu's lpl is leaf lpl */
 2198                                 ASSERT(cpu->cpu_lpl == lpl);
 2199                                 if (cpu->cpu_lpl != lpl) {
 2200                                         return (LPL_TOPO_CPU_HAS_BAD_LPL);
 2201                                 }
 2202 
 2203                                 /* check next cpu */
 2204                                 if ((cpu = cpu->cpu_next_lpl) != cp_start) {
 2205                                         continue;
 2206                                 } else {
 2207                                         cpu = NULL;
 2208                                 }
 2209                         }
 2210 
 2211                         ASSERT(j == lpl->lpl_ncpu);
 2212                         if (j != lpl->lpl_ncpu) {
 2213                                 return (LPL_TOPO_LPL_BAD_NCPU);
 2214                         }
 2215 
 2216                         /*
 2217                          * Also, check that leaf lpl is contained in all
 2218                          * intermediate lpls that name the leaf as a descendant
 2219                          */
 2220                         for (j = 0; j <= lgrp_alloc_max; j++) {
 2221                                 klgrpset_t intersect;
 2222                                 lgrp_t *lgrp_cand;
 2223                                 lpl_t *lpl_cand;
 2224 
 2225                                 lgrp_cand = lgrp_table[j];
 2226                                 intersect = klgrpset_intersects(
 2227                                     lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
 2228                                     cpupart->cp_lgrpset);
 2229 
 2230                                 if (!LGRP_EXISTS(lgrp_cand) ||
 2231                                     !klgrpset_intersects(lgrp_cand->lgrp_leaves,
 2232                                     cpupart->cp_lgrpset) ||
 2233                                     (intersect == 0))
 2234                                         continue;
 2235 
 2236                                 lpl_cand =
 2237                                     &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
 2238 
 2239                                 if (klgrpset_ismember(intersect,
 2240                                     lgrp->lgrp_id)) {
 2241                                         ASSERT(lpl_rset_contains(lpl_cand,
 2242                                             lpl));
 2243 
 2244                                         if (!lpl_rset_contains(lpl_cand, lpl)) {
 2245                                                 return (LPL_TOPO_RSET_MSSNG_LF);
 2246                                         }
 2247                                 }
 2248                         }
 2249 
 2250                 } else { /* non-leaf specific checks */
 2251 
 2252                         /*
 2253                          * Non-leaf lpls should have lpl_cpus == NULL
 2254                          * verify that this is so
 2255                          */
 2256                         ASSERT(lpl->lpl_cpus == NULL);
 2257                         if (lpl->lpl_cpus != NULL) {
 2258                                 return (LPL_TOPO_NONLEAF_HAS_CPUS);
 2259                         }
 2260 
 2261                         /*
 2262                          * verify that the sum of the cpus in the leaf resources
 2263                          * is equal to the total ncpu in the intermediate
 2264                          */
 2265                         for (j = sum = 0; j < lpl->lpl_nrset; j++) {
 2266                                 sum += lpl->lpl_rset[j]->lpl_ncpu;
 2267                         }
 2268 
 2269                         ASSERT(sum == lpl->lpl_ncpu);
 2270                         if (sum != lpl->lpl_ncpu) {
 2271                                 return (LPL_TOPO_LPL_BAD_NCPU);
 2272                         }
 2273                 }
 2274 
 2275                 /*
 2276                  * Check the rset of the lpl in question.  Make sure that each
 2277                  * rset contains a subset of the resources in
 2278                  * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
 2279                  * sure that each rset doesn't include resources that are
 2280                  * outside of that set.  (Which would be resources somehow not
 2281                  * accounted for).
 2282                  */
 2283                 klgrpset_clear(rset);
 2284                 for (j = 0; j < lpl->lpl_nrset; j++) {
 2285                         klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
 2286                 }
 2287                 klgrpset_copy(cset, rset);
 2288                 /* make sure lpl rset matches lgrp rset */
 2289                 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
 2290                 /* make sure rset is contained with in partition, too */
 2291                 klgrpset_diff(cset, cpupart->cp_lgrpset);
 2292 
 2293                 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
 2294                 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
 2295                         return (LPL_TOPO_RSET_MISMATCH);
 2296                 }
 2297 
 2298                 /*
 2299                  * check to make sure lpl_nrset matches the number of rsets
 2300                  * contained in the lpl
 2301                  */
 2302                 for (j = 0; j < lpl->lpl_nrset; j++) {
 2303                         if (lpl->lpl_rset[j] == NULL)
 2304                                 break;
 2305                 }
 2306 
 2307                 ASSERT(j == lpl->lpl_nrset);
 2308                 if (j != lpl->lpl_nrset) {
 2309                         return (LPL_TOPO_BAD_RSETCNT);
 2310                 }
 2311 
 2312         }
 2313         return (LPL_TOPO_CORRECT);
 2314 }
 2315 
 2316 /*
 2317  * Flatten lpl topology to given number of levels.  This is presently only
 2318  * implemented for a flatten to 2 levels, which will prune out the intermediates
 2319  * and home the leaf lpls to the root lpl.
 2320  */
 2321 int
 2322 lpl_topo_flatten(int levels)
 2323 {
 2324         int             i;
 2325         uint_t          sum;
 2326         lgrp_t          *lgrp_cur;
 2327         lpl_t           *lpl_cur;
 2328         lpl_t           *lpl_root;
 2329         cpupart_t       *cp;
 2330 
 2331         if (levels != 2)
 2332                 return (0);
 2333 
 2334         /* called w/ cpus paused - grab no locks! */
 2335         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
 2336             !lgrp_initialized);
 2337 
 2338         cp = cp_list_head;
 2339         do {
 2340                 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
 2341                 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
 2342 
 2343                 for (i = 0; i <= lgrp_alloc_max; i++) {
 2344                         lgrp_cur = lgrp_table[i];
 2345                         lpl_cur = &cp->cp_lgrploads[i];
 2346 
 2347                         if ((lgrp_cur == lgrp_root) ||
 2348                             (!LGRP_EXISTS(lgrp_cur) &&
 2349                             (lpl_cur->lpl_ncpu == 0)))
 2350                                 continue;
 2351 
 2352                         if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
 2353                                 /*
 2354                                  * this should be a deleted intermediate, so
 2355                                  * clear it
 2356                                  */
 2357                                 lpl_clear(lpl_cur);
 2358                         } else if ((lpl_cur->lpl_nrset == 1) &&
 2359                             (lpl_cur->lpl_rset[0] == lpl_cur) &&
 2360                             ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
 2361                             (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
 2362                                 /*
 2363                                  * this is a leaf whose parent was deleted, or
 2364                                  * whose parent had their lgrp deleted.  (And
 2365                                  * whose parent will soon be deleted).  Point
 2366                                  * this guy back to the root lpl.
 2367                                  */
 2368                                 lpl_cur->lpl_parent = lpl_root;
 2369                                 lpl_rset_add(lpl_root, lpl_cur);
 2370                         }
 2371 
 2372                 }
 2373 
 2374                 /*
 2375                  * Now that we're done, make sure the count on the root lpl is
 2376                  * correct, and update the hints of the children for the sake of
 2377                  * thoroughness
 2378                  */
 2379                 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
 2380                         sum += lpl_root->lpl_rset[i]->lpl_ncpu;
 2381                 }
 2382                 lpl_root->lpl_ncpu = sum;
 2383                 lpl_child_update(lpl_root, cp);
 2384 
 2385                 cp = cp->cp_next;
 2386         } while (cp != cp_list_head);
 2387 
 2388         return (levels);
 2389 }
 2390 
 2391 /*
 2392  * Insert a lpl into the resource hierarchy and create any additional lpls that
 2393  * are necessary to represent the varying states of locality for the cpu
 2394  * resoruces newly added to the partition.
 2395  *
 2396  * This routine is clever enough that it can correctly add resources from the
 2397  * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
 2398  * those for which the lpl is a leaf as opposed to simply a named equally local
 2399  * resource).  The one special case that needs additional processing is when a
 2400  * new intermediate lpl is introduced.  Since the main loop only traverses
 2401  * looking to add the leaf resource where it does not yet exist, additional work
 2402  * is necessary to add other leaf resources that may need to exist in the newly
 2403  * created intermediate.  This is performed by the second inner loop, and is
 2404  * only done when the check for more than one overlapping resource succeeds.
 2405  */
 2406 
 2407 void
 2408 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
 2409 {
 2410         int             i;
 2411         int             j;
 2412         int             rset_num_intersect;
 2413         lgrp_t          *lgrp_cur;
 2414         lpl_t           *lpl_cur;
 2415         lpl_t           *lpl_parent;
 2416         lgrp_id_t       parent_id;
 2417         klgrpset_t      rset_intersect; /* resources in cpupart and lgrp */
 2418 
 2419         for (i = 0; i <= lgrp_alloc_max; i++) {
 2420                 lgrp_cur = lgrp_table[i];
 2421 
 2422                 /*
 2423                  * Don't insert if the lgrp isn't there, if the leaf isn't
 2424                  * contained within the current lgrp, or if the current lgrp has
 2425                  * no leaves in this partition
 2426                  */
 2427 
 2428                 if (!LGRP_EXISTS(lgrp_cur) ||
 2429                     !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
 2430                     lpl_leaf->lpl_lgrpid) ||
 2431                     !klgrpset_intersects(lgrp_cur->lgrp_leaves,
 2432                     cpupart->cp_lgrpset))
 2433                         continue;
 2434 
 2435                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
 2436                 if (lgrp_cur->lgrp_parent != NULL) {
 2437                         /* if lgrp has a parent, assign it properly */
 2438                         parent_id = lgrp_cur->lgrp_parent->lgrp_id;
 2439                         lpl_parent = &cpupart->cp_lgrploads[parent_id];
 2440                 } else {
 2441                         /* if not, make sure parent ptr gets set to null */
 2442                         lpl_parent = NULL;
 2443                 }
 2444 
 2445                 if (lpl_cur == lpl_leaf) {
 2446                         /*
 2447                          * Almost all leaf state was initialized elsewhere.  The
 2448                          * only thing left to do is to set the parent.
 2449                          */
 2450                         lpl_cur->lpl_parent = lpl_parent;
 2451                         continue;
 2452                 }
 2453 
 2454                 lpl_clear(lpl_cur);
 2455                 lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
 2456 
 2457                 lpl_cur->lpl_parent = lpl_parent;
 2458 
 2459                 /* does new lpl need to be populated with other resources? */
 2460                 rset_intersect =
 2461                     klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
 2462                     cpupart->cp_lgrpset);
 2463                 klgrpset_nlgrps(rset_intersect, rset_num_intersect);
 2464 
 2465                 if (rset_num_intersect > 1) {
 2466                         /*
 2467                          * If so, figure out what lpls have resources that
 2468                          * intersect this one, and add them.
 2469                          */
 2470                         for (j = 0; j <= lgrp_alloc_max; j++) {
 2471                                 lgrp_t  *lgrp_cand;     /* candidate lgrp */
 2472                                 lpl_t   *lpl_cand;      /* candidate lpl */
 2473 
 2474                                 lgrp_cand = lgrp_table[j];
 2475                                 if (!LGRP_EXISTS(lgrp_cand) ||
 2476                                     !klgrpset_ismember(rset_intersect,
 2477                                     lgrp_cand->lgrp_id))
 2478                                         continue;
 2479                                 lpl_cand =
 2480                                     &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
 2481                                 lpl_rset_add(lpl_cur, lpl_cand);
 2482                         }
 2483                 }
 2484                 /*
 2485                  * This lpl's rset has changed. Update the hint in it's
 2486                  * children.
 2487                  */
 2488                 lpl_child_update(lpl_cur, cpupart);
 2489         }
 2490 }
 2491 
 2492 /*
 2493  * remove a lpl from the hierarchy of resources, clearing its state when
 2494  * finished.  If the lpls at the intermediate levels of the hierarchy have no
 2495  * remaining resources, or no longer name a leaf resource in the cpu-partition,
 2496  * delete them as well.
 2497  */
 2498 
 2499 void
 2500 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
 2501 {
 2502         int             i;
 2503         lgrp_t          *lgrp_cur;
 2504         lpl_t           *lpl_cur;
 2505         klgrpset_t      leaf_intersect; /* intersection of leaves */
 2506 
 2507         for (i = 0; i <= lgrp_alloc_max; i++) {
 2508                 lgrp_cur = lgrp_table[i];
 2509 
 2510                 /*
 2511                  * Don't attempt to remove from lgrps that aren't there, that
 2512                  * don't contain our leaf, or from the leaf itself. (We do that
 2513                  * later)
 2514                  */
 2515 
 2516                 if (!LGRP_EXISTS(lgrp_cur))
 2517                         continue;
 2518 
 2519                 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
 2520 
 2521                 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
 2522                     lpl_leaf->lpl_lgrpid) ||
 2523                     (lpl_cur == lpl_leaf)) {
 2524                         continue;
 2525                 }
 2526 
 2527                 /*
 2528                  * This is a slightly sleazy simplification in that we have
 2529                  * already marked the cp_lgrpset as no longer containing the
 2530                  * leaf we've deleted.  Any lpls that pass the above checks
 2531                  * based upon lgrp membership but not necessarily cpu-part
 2532                  * membership also get cleared by the checks below.  Currently
 2533                  * this is harmless, as the lpls should be empty anyway.
 2534                  *
 2535                  * In particular, we want to preserve lpls that have additional
 2536                  * leaf resources, even though we don't yet have a processor
 2537                  * architecture that represents resources this way.
 2538                  */
 2539 
 2540                 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
 2541                     cpupart->cp_lgrpset);
 2542 
 2543                 lpl_rset_del(lpl_cur, lpl_leaf);
 2544                 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
 2545                         lpl_clear(lpl_cur);
 2546                 } else {
 2547                         /*
 2548                          * Update this lpl's children
 2549                          */
 2550                         lpl_child_update(lpl_cur, cpupart);
 2551                 }
 2552         }
 2553         lpl_clear(lpl_leaf);
 2554 }
 2555 
 2556 /*
 2557  * add a cpu to a partition in terms of lgrp load avg bookeeping
 2558  *
 2559  * The lpl (cpu partition load average information) is now arranged in a
 2560  * hierarchical fashion whereby resources that are closest, ie. most local, to
 2561  * the cpu in question are considered to be leaves in a tree of resources.
 2562  * There are two general cases for cpu additon:
 2563  *
 2564  * 1. A lpl structure that contains resources already in the hierarchy tree.
 2565  * In this case, all of the associated lpl relationships have been defined, and
 2566  * all that is necessary is that we link the new cpu into the per-lpl list of
 2567  * cpus, and increment the ncpu count of all places where this cpu resource will
 2568  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
 2569  * pushing is accomplished by this routine.
 2570  *
 2571  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
 2572  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
 2573  * construct the hierarchy of state necessary to name it's more distant
 2574  * resources, if they should exist.  The leaf structure is initialized by this
 2575  * routine, as is the cpu-partition state for the lgrp membership.  This routine
 2576  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 2577  * and builds all of the "ancestoral" state necessary to identify resources at
 2578  * differing levels of locality.
 2579  */
 2580 void
 2581 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
 2582 {
 2583         cpupart_t       *cpupart;
 2584         lgrp_t          *lgrp_leaf;
 2585         lpl_t           *lpl_leaf;
 2586 
 2587         /* called sometimes w/ cpus paused - grab no locks */
 2588         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
 2589 
 2590         cpupart = cp->cpu_part;
 2591         lgrp_leaf = lgrp_table[lgrpid];
 2592 
 2593         /* don't add non-existent lgrp */
 2594         ASSERT(LGRP_EXISTS(lgrp_leaf));
 2595         lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
 2596         cp->cpu_lpl = lpl_leaf;
 2597 
 2598         /* only leaf lpls contain cpus */
 2599 
 2600         if (lpl_leaf->lpl_ncpu++ == 0) {
 2601                 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
 2602                 klgrpset_add(cpupart->cp_lgrpset, lgrpid);
 2603                 lpl_leaf_insert(lpl_leaf, cpupart);
 2604         } else {
 2605                 /*
 2606                  * the lpl should already exist in the parent, so just update
 2607                  * the count of available CPUs
 2608                  */
 2609                 lpl_cpu_adjcnt(LPL_INCREMENT, cp);
 2610         }
 2611 
 2612         /* link cpu into list of cpus in lpl */
 2613 
 2614         if (lpl_leaf->lpl_cpus) {
 2615                 cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
 2616                 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
 2617                 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
 2618                 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
 2619         } else {
 2620                 /*
 2621                  * We increment ncpu immediately after we create a new leaf
 2622                  * lpl, so assert that ncpu == 1 for the case where we don't
 2623                  * have any cpu pointers yet.
 2624                  */
 2625                 ASSERT(lpl_leaf->lpl_ncpu == 1);
 2626                 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
 2627         }
 2628 
 2629 }
 2630 
 2631 
 2632 /*
 2633  * remove a cpu from a partition in terms of lgrp load avg bookeeping
 2634  *
 2635  * The lpl (cpu partition load average information) is now arranged in a
 2636  * hierarchical fashion whereby resources that are closest, ie. most local, to
 2637  * the cpu in question are considered to be leaves in a tree of resources.
 2638  * There are two removal cases in question:
 2639  *
 2640  * 1. Removal of the resource in the leaf leaves other resources remaining in
 2641  * that leaf.  (Another cpu still exists at this level of locality).  In this
 2642  * case, the count of available cpus is decremented in all assocated lpls by
 2643  * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned
 2644  * from the per-cpu lpl list.
 2645  *
 2646  * 2. Removal of the resource results in the lpl containing no resources.  (It's
 2647  * empty)  In this case, all of what has occurred for the first step must take
 2648  * place; however, additionally we must remove the lpl structure itself, prune
 2649  * out any stranded lpls that do not directly name a leaf resource, and mark the
 2650  * cpu partition in question as no longer containing resources from the lgrp of
 2651  * the lpl that has been delted.  Cpu-partition changes are handled by this
 2652  * method, but the lpl_leaf_remove function deals with the details of pruning
 2653  * out the empty lpl and any of its orphaned direct ancestors.
 2654  */
 2655 void
 2656 lgrp_part_del_cpu(cpu_t *cp)
 2657 {
 2658         lpl_t           *lpl;
 2659         lpl_t           *leaf_lpl;
 2660         lgrp_t          *lgrp_leaf;
 2661 
 2662         /* called sometimes w/ cpus paused - grab no locks */
 2663 
 2664         ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
 2665 
 2666         lpl = leaf_lpl = cp->cpu_lpl;
 2667         lgrp_leaf = leaf_lpl->lpl_lgrp;
 2668 
 2669         /* don't delete a leaf that isn't there */
 2670         ASSERT(LGRP_EXISTS(lgrp_leaf));
 2671 
 2672         /* no double-deletes */
 2673         ASSERT(lpl->lpl_ncpu);
 2674         if (--lpl->lpl_ncpu == 0) {
 2675                 /*
 2676                  * This was the last cpu in this lgroup for this partition,
 2677                  * clear its bit in the partition's lgroup bitmask
 2678                  */
 2679                 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
 2680 
 2681                 /* eliminate remaning lpl link pointers in cpu, lpl */
 2682                 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
 2683 
 2684                 lpl_leaf_remove(leaf_lpl, cp->cpu_part);
 2685         } else {
 2686 
 2687                 /* unlink cpu from lists of cpus in lpl */
 2688                 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
 2689                 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
 2690                 if (lpl->lpl_cpus == cp) {
 2691                         lpl->lpl_cpus = cp->cpu_next_lpl;
 2692                 }
 2693 
 2694                 /*
 2695                  * Update the cpu count in the lpls associated with parent
 2696                  * lgroups.
 2697                  */
 2698                 lpl_cpu_adjcnt(LPL_DECREMENT, cp);
 2699 
 2700         }
 2701         /* clear cpu's lpl ptr when we're all done */
 2702         cp->cpu_lpl = NULL;
 2703 }
 2704 
 2705 /*
 2706  * Recompute load average for the specified partition/lgrp fragment.
 2707  *
 2708  * We rely on the fact that this routine is called from the clock thread
 2709  * at a point before the clock thread can block (i.e. before its first
 2710  * lock request).  Since the clock thread can not be preempted (since it
 2711  * runs at highest priority), we know that cpu partitions can not change
 2712  * (since doing so would require either the repartition requester or the
 2713  * cpu_pause thread to run on this cpu), so we can update the cpu's load
 2714  * without grabbing cpu_lock.
 2715  */
 2716 void
 2717 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
 2718 {
 2719         uint_t          ncpu;
 2720         int64_t         old, new, f;
 2721 
 2722         /*
 2723          * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
 2724          */
 2725         static short expval[] = {
 2726             0, 3196, 1618, 1083,
 2727             814, 652, 543, 466,
 2728             408, 363, 326, 297,
 2729             272, 251, 233, 218,
 2730             204, 192, 181, 172,
 2731             163, 155, 148, 142,
 2732             136, 130, 125, 121,
 2733             116, 112, 109, 105
 2734         };
 2735 
 2736         /* ASSERT (called from clock level) */
 2737 
 2738         if ((lpl == NULL) ||    /* we're booting - this is easiest for now */
 2739             ((ncpu = lpl->lpl_ncpu) == 0)) {
 2740                 return;
 2741         }
 2742 
 2743         for (;;) {
 2744 
 2745                 if (ncpu >= sizeof (expval) / sizeof (expval[0]))
 2746                         f = expval[1]/ncpu; /* good approx. for large ncpu */
 2747                 else
 2748                         f = expval[ncpu];
 2749 
 2750                 /*
 2751                  * Modify the load average atomically to avoid losing
 2752                  * anticipatory load updates (see lgrp_move_thread()).
 2753                  */
 2754                 if (ageflag) {
 2755                         /*
 2756                          * We're supposed to both update and age the load.
 2757                          * This happens 10 times/sec. per cpu.  We do a
 2758                          * little hoop-jumping to avoid integer overflow.
 2759                          */
 2760                         int64_t         q, r;
 2761 
 2762                         do {
 2763                                 old = new = lpl->lpl_loadavg;
 2764                                 q = (old  >> 16) << 7;
 2765                                 r = (old  & 0xffff) << 7;
 2766                                 new += ((long long)(nrcpus - q) * f -
 2767                                     ((r * f) >> 16)) >> 7;
 2768 
 2769                                 /*
 2770                                  * Check for overflow
 2771                                  */
 2772                                 if (new > LGRP_LOADAVG_MAX)
 2773                                         new = LGRP_LOADAVG_MAX;
 2774                                 else if (new < 0)
 2775                                         new = 0;
 2776                         } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
 2777                             new) != old);
 2778                 } else {
 2779                         /*
 2780                          * We're supposed to update the load, but not age it.
 2781                          * This option is used to update the load (which either
 2782                          * has already been aged in this 1/10 sec. interval or
 2783                          * soon will be) to account for a remotely executing
 2784                          * thread.
 2785                          */
 2786                         do {
 2787                                 old = new = lpl->lpl_loadavg;
 2788                                 new += f;
 2789                                 /*
 2790                                  * Check for overflow
 2791                                  * Underflow not possible here
 2792                                  */
 2793                                 if (new < old)
 2794                                         new = LGRP_LOADAVG_MAX;
 2795                         } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
 2796                             new) != old);
 2797                 }
 2798 
 2799                 /*
 2800                  * Do the same for this lpl's parent
 2801                  */
 2802                 if ((lpl = lpl->lpl_parent) == NULL)
 2803                         break;
 2804                 ncpu = lpl->lpl_ncpu;
 2805         }
 2806 }
 2807 
 2808 /*
 2809  * Initialize lpl topology in the target based on topology currently present in
 2810  * lpl_bootstrap.
 2811  *
 2812  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
 2813  * initialize cp_default list of lpls. Up to this point all topology operations
 2814  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
 2815  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
 2816  * `target' points to the list of lpls in cp_default and `size' is the size of
 2817  * this list.
 2818  *
 2819  * This function walks the lpl topology in lpl_bootstrap and does for things:
 2820  *
 2821  * 1) Copies all fields from lpl_bootstrap to the target.
 2822  *
 2823  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 2824  *
 2825  * 3) Updates lpl_parent pointers to point to the lpls in the target list
 2826  *    instead of lpl_bootstrap.
 2827  *
 2828  * 4) Updates pointers in the resource list of the target to point to the lpls
 2829  *    in the target list instead of lpl_bootstrap.
 2830  *
 2831  * After lpl_topo_bootstrap() completes, target contains the same information
 2832  * that would be present there if it were used during boot instead of
 2833  * lpl_bootstrap. There is no need in information in lpl_bootstrap after this
 2834  * and it is bzeroed.
 2835  */
 2836 void
 2837 lpl_topo_bootstrap(lpl_t *target, int size)
 2838 {
 2839         lpl_t   *lpl = lpl_bootstrap;
 2840         lpl_t   *target_lpl = target;
 2841         lpl_t   **rset;
 2842         int     *id2rset;
 2843         int     sz;
 2844         int     howmany;
 2845         int     id;
 2846         int     i;
 2847 
 2848         /*
 2849          * The only target that should be passed here is cp_default lpl list.
 2850          */
 2851         ASSERT(target == cp_default.cp_lgrploads);
 2852         ASSERT(size == cp_default.cp_nlgrploads);
 2853         ASSERT(!lgrp_topo_initialized);
 2854         ASSERT(ncpus == 1);
 2855 
 2856         howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
 2857         for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
 2858                 /*
 2859                  * Copy all fields from lpl, except for the rset,
 2860                  * lgrp id <=> rset mapping storage,
 2861                  * and amount of storage
 2862                  */
 2863                 rset = target_lpl->lpl_rset;
 2864                 id2rset = target_lpl->lpl_id2rset;
 2865                 sz = target_lpl->lpl_rset_sz;
 2866 
 2867                 *target_lpl = *lpl;
 2868 
 2869                 target_lpl->lpl_rset_sz = sz;
 2870                 target_lpl->lpl_rset = rset;
 2871                 target_lpl->lpl_id2rset = id2rset;
 2872 
 2873                 /*
 2874                  * Substitute CPU0 lpl pointer with one relative to target.
 2875                  */
 2876                 if (lpl->lpl_cpus == CPU) {
 2877                         ASSERT(CPU->cpu_lpl == lpl);
 2878                         CPU->cpu_lpl = target_lpl;
 2879                 }
 2880 
 2881                 /*
 2882                  * Substitute parent information with parent relative to target.
 2883                  */
 2884                 if (lpl->lpl_parent != NULL)
 2885                         target_lpl->lpl_parent = (lpl_t *)
 2886                             (((uintptr_t)lpl->lpl_parent -
 2887                             (uintptr_t)lpl_bootstrap) +
 2888                             (uintptr_t)target);
 2889 
 2890                 /*
 2891                  * Walk over resource set substituting pointers relative to
 2892                  * lpl_bootstrap's rset to pointers relative to target's
 2893                  */
 2894                 ASSERT(lpl->lpl_nrset <= 1);
 2895 
 2896                 for (id = 0; id < lpl->lpl_nrset; id++) {
 2897                         if (lpl->lpl_rset[id] != NULL) {
 2898                                 target_lpl->lpl_rset[id] = (lpl_t *)
 2899                                     (((uintptr_t)lpl->lpl_rset[id] -
 2900                                     (uintptr_t)lpl_bootstrap) +
 2901                                     (uintptr_t)target);
 2902                         }
 2903                         target_lpl->lpl_id2rset[id] =
 2904                             lpl->lpl_id2rset[id];
 2905                 }
 2906         }
 2907 
 2908         /*
 2909          * Clean up the bootstrap lpls since we have switched over to the
 2910          * actual lpl array in the default cpu partition.
 2911          *
 2912          * We still need to keep one empty lpl around for newly starting
 2913          * slave CPUs to reference should they need to make it through the
 2914          * dispatcher prior to their lgrp/lpl initialization.
 2915          *
 2916          * The lpl related dispatcher code has been designed to work properly
 2917          * (and without extra checks) for this special case of a zero'ed
 2918          * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
 2919          * with lgrpid 0 and an empty resource set. Iteration over the rset
 2920          * array by the dispatcher is also NULL terminated for this reason.
 2921          *
 2922          * This provides the desired behaviour for an uninitialized CPU.
 2923          * It shouldn't see any other CPU to either dispatch to or steal
 2924          * from until it is properly initialized.
 2925          */
 2926         bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
 2927         bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
 2928         bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));
 2929 
 2930         lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
 2931         lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
 2932 }
 2933 
 2934 /*
 2935  * If the lowest load among the lgroups a process' threads are currently
 2936  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 2937  * expanding the process to a new lgroup.
 2938  */
 2939 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
 2940 lgrp_load_t     lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
 2941 
 2942 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
 2943         ((lgrp_expand_proc_thresh) / (ncpu))
 2944 
 2945 /*
 2946  * A process will be expanded to a new lgroup only if the difference between
 2947  * the lowest load on the lgroups the process' thread's are currently spread
 2948  * across and the lowest load on the other lgroups in the process' partition
 2949  * is greater than lgrp_expand_proc_diff.
 2950  */
 2951 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
 2952 lgrp_load_t     lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
 2953 
 2954 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
 2955         ((lgrp_expand_proc_diff) / (ncpu))
 2956 
 2957 /*
 2958  * The loadavg tolerance accounts for "noise" inherent in the load, which may
 2959  * be present due to impreciseness of the load average decay algorithm.
 2960  *
 2961  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 2962  * tolerance is scaled by the number of cpus in the lgroup just like
 2963  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 2964  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 2965  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
 2966  */
 2967 uint32_t        lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
 2968 #define LGRP_LOADAVG_TOLERANCE(ncpu)    \
 2969         ((lgrp_loadavg_tolerance) / ncpu)
 2970 
 2971 /*
 2972  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 2973  * average is above this threshold
 2974  */
 2975 uint32_t        lgrp_load_thresh = UINT32_MAX;
 2976 
 2977 /*
 2978  * lgrp_choose() will try to skip any lgroups with less memory
 2979  * than this free when choosing a home lgroup
 2980  */
 2981 pgcnt_t lgrp_mem_free_thresh = 0;
 2982 
 2983 /*
 2984  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 2985  * one based on one of the following policies:
 2986  * - Random selection
 2987  * - Pseudo round robin placement
 2988  * - Longest time since a thread was last placed
 2989  */
 2990 #define LGRP_CHOOSE_RANDOM      1
 2991 #define LGRP_CHOOSE_RR          2
 2992 #define LGRP_CHOOSE_TIME        3
 2993 
 2994 int     lgrp_choose_policy = LGRP_CHOOSE_TIME;
 2995 
 2996 /*
 2997  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
 2998  * be bound to a CPU or processor set.
 2999  *
 3000  * Arguments:
 3001  *      t               The thread
 3002  *      cpupart         The partition the thread belongs to.
 3003  *
 3004  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
 3005  *       disabled, or thread_lock held (at splhigh) to protect against the CPU
 3006  *       partitions changing out from under us and assumes that given thread is
 3007  *       protected.  Also, called sometimes w/ cpus paused or kernel preemption
 3008  *       disabled, so don't grab any locks because we should never block under
 3009  *       those conditions.
 3010  */
 3011 lpl_t *
 3012 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
 3013 {
 3014         lgrp_load_t     bestload, bestrload;
 3015         int             lgrpid_offset, lgrp_count;
 3016         lgrp_id_t       lgrpid, lgrpid_start;
 3017         lpl_t           *lpl, *bestlpl, *bestrlpl;
 3018         klgrpset_t      lgrpset;
 3019         proc_t          *p;
 3020 
 3021         ASSERT(t != NULL);
 3022         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
 3023             THREAD_LOCK_HELD(t));
 3024         ASSERT(cpupart != NULL);
 3025 
 3026         p = t->t_procp;
 3027 
 3028         /* A process should always be in an active partition */
 3029         ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
 3030 
 3031         bestlpl = bestrlpl = NULL;
 3032         bestload = bestrload = LGRP_LOADAVG_MAX;
 3033         lgrpset = cpupart->cp_lgrpset;
 3034 
 3035         switch (lgrp_choose_policy) {
 3036         case LGRP_CHOOSE_RR:
 3037                 lgrpid = cpupart->cp_lgrp_hint;
 3038                 do {
 3039                         if (++lgrpid > lgrp_alloc_max)
 3040                                 lgrpid = 0;
 3041                 } while (!klgrpset_ismember(lgrpset, lgrpid));
 3042 
 3043                 break;
 3044         default:
 3045         case LGRP_CHOOSE_TIME:
 3046         case LGRP_CHOOSE_RANDOM:
 3047                 klgrpset_nlgrps(lgrpset, lgrp_count);
 3048                 lgrpid_offset =
 3049                     (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
 3050                 for (lgrpid = 0; ; lgrpid++) {
 3051                         if (klgrpset_ismember(lgrpset, lgrpid)) {
 3052                                 if (--lgrpid_offset == 0)
 3053                                         break;
 3054                         }
 3055                 }
 3056                 break;
 3057         }
 3058 
 3059         lgrpid_start = lgrpid;
 3060 
 3061         DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
 3062             lgrp_id_t, cpupart->cp_lgrp_hint);
 3063 
 3064         /*
 3065          * Use lgroup affinities (if any) to choose best lgroup
 3066          *
 3067          * NOTE: Assumes that thread is protected from going away and its
 3068          *       lgroup affinities won't change (ie. p_lock, or
 3069          *       thread_lock() being held and/or CPUs paused)
 3070          */
 3071         if (t->t_lgrp_affinity) {
 3072                 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
 3073                 if (lpl != NULL)
 3074                         return (lpl);
 3075         }
 3076 
 3077         ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
 3078 
 3079         do {
 3080                 pgcnt_t npgs;
 3081 
 3082                 /*
 3083                  * Skip any lgroups outside of thread's pset
 3084                  */
 3085                 if (!klgrpset_ismember(lgrpset, lgrpid)) {
 3086                         if (++lgrpid > lgrp_alloc_max)
 3087                                 lgrpid = 0;     /* wrap the search */
 3088                         continue;
 3089                 }
 3090 
 3091                 /*
 3092                  * Skip any non-leaf lgroups
 3093                  */
 3094                 if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
 3095                         continue;
 3096 
 3097                 /*
 3098                  * Skip any lgroups without enough free memory
 3099                  * (when threshold set to nonzero positive value)
 3100                  */
 3101                 if (lgrp_mem_free_thresh > 0) {
 3102                         npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 3103                         if (npgs < lgrp_mem_free_thresh) {
 3104                                 if (++lgrpid > lgrp_alloc_max)
 3105                                         lgrpid = 0;     /* wrap the search */
 3106                                 continue;
 3107                         }
 3108                 }
 3109 
 3110                 lpl = &cpupart->cp_lgrploads[lgrpid];
 3111                 if (klgrpset_isempty(p->p_lgrpset) ||
 3112                     klgrpset_ismember(p->p_lgrpset, lgrpid)) {
 3113                         /*
 3114                          * Either this is a new process or the process already
 3115                          * has threads on this lgrp, so this is a preferred
 3116                          * lgroup for the thread.
 3117                          */
 3118                         if (bestlpl == NULL ||
 3119                             lpl_pick(lpl, bestlpl)) {
 3120                                 bestload = lpl->lpl_loadavg;
 3121                                 bestlpl = lpl;
 3122                         }
 3123                 } else {
 3124                         /*
 3125                          * The process doesn't have any threads on this lgrp,
 3126                          * but we're willing to consider this lgrp if the load
 3127                          * difference is big enough to justify splitting up
 3128                          * the process' threads.
 3129                          */
 3130                         if (bestrlpl == NULL ||
 3131                             lpl_pick(lpl, bestrlpl)) {
 3132                                 bestrload = lpl->lpl_loadavg;
 3133                                 bestrlpl = lpl;
 3134                         }
 3135                 }
 3136                 if (++lgrpid > lgrp_alloc_max)
 3137                         lgrpid = 0;     /* wrap the search */
 3138         } while (lgrpid != lgrpid_start);
 3139 
 3140         /*
 3141          * Return root lgroup if threshold isn't set to maximum value and
 3142          * lowest lgroup load average more than a certain threshold
 3143          */
 3144         if (lgrp_load_thresh != UINT32_MAX &&
 3145             bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
 3146                 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
 3147 
 3148         /*
 3149          * If all the lgroups over which the thread's process is spread are
 3150          * heavily loaded, or otherwise undesirable, we'll consider placing
 3151          * the thread on one of the other leaf lgroups in the thread's
 3152          * partition.
 3153          */
 3154         if ((bestlpl == NULL) ||
 3155             ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
 3156             (bestrload < bestload) &&   /* paranoid about wraparound */
 3157             (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
 3158             bestload))) {
 3159                 bestlpl = bestrlpl;
 3160         }
 3161 
 3162         if (bestlpl == NULL) {
 3163                 /*
 3164                  * No lgroup looked particularly good, but we still
 3165                  * have to pick something. Go with the randomly selected
 3166                  * legal lgroup we started with above.
 3167                  */
 3168                 bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
 3169         }
 3170 
 3171         cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
 3172         bestlpl->lpl_homed_time = gethrtime_unscaled();
 3173 
 3174         ASSERT(bestlpl->lpl_ncpu > 0);
 3175         return (bestlpl);
 3176 }
 3177 
 3178 /*
 3179  * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
 3180  * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
 3181  */
 3182 static int
 3183 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
 3184 {
 3185         lgrp_load_t     l1, l2;
 3186         lgrp_load_t     tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
 3187 
 3188         l1 = lpl1->lpl_loadavg;
 3189         l2 = lpl2->lpl_loadavg;
 3190 
 3191         if ((l1 + tolerance < l2) && (l1 < l2)) {
 3192                 /* lpl1 is significantly less loaded than lpl2 */
 3193                 return (1);
 3194         }
 3195 
 3196         if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
 3197             l1 + tolerance >= l2 && l1 < l2 &&
 3198             lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
 3199                 /*
 3200                  * lpl1's load is within the tolerance of lpl2. We're
 3201                  * willing to consider it be to better however if
 3202                  * it has been longer since we last homed a thread there
 3203                  */
 3204                 return (1);
 3205         }
 3206 
 3207         return (0);
 3208 }
 3209 
 3210 /*
 3211  * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a
 3212  * process that uses text replication changed home lgrp. This info is used by
 3213  * segvn asyncronous thread to detect if it needs to recheck what lgrps
 3214  * should be used for text replication.
 3215  */
 3216 static uint64_t lgrp_trthr_moves = 0;
 3217 
 3218 uint64_t
 3219 lgrp_get_trthr_migrations(void)
 3220 {
 3221         return (lgrp_trthr_moves);
 3222 }
 3223 
 3224 void
 3225 lgrp_update_trthr_migrations(uint64_t incr)
 3226 {
 3227         atomic_add_64(&lgrp_trthr_moves, incr);
 3228 }
 3229 
 3230 /*
 3231  * An LWP is expected to be assigned to an lgroup for at least this long
 3232  * for its anticipatory load to be justified.  NOTE that this value should
 3233  * not be set to an extremely large value (say, more than 100 years), to avoid problems
 3234  * with overflow in the calculation that uses it.
 3235  */
 3236 #define LGRP_MIN_NSEC   (NANOSEC / 10)          /* 1/10 of a second */
 3237 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
 3238 
 3239 /*
 3240  * Routine to change a thread's lgroup affiliation.  This routine updates
 3241  * the thread's kthread_t struct and its process' proc_t struct to note the
 3242  * thread's new lgroup affiliation, and its lgroup affinities.
 3243  *
 3244  * Note that this is the only routine that modifies a thread's t_lpl field,
 3245  * and that adds in or removes anticipatory load.
 3246  *
 3247  * If the thread is exiting, newlpl is NULL.
 3248  *
 3249  * Locking:
 3250  * The following lock must be held on entry:
 3251  *      cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
 3252  *              doesn't get removed from t's partition
 3253  *
 3254  * This routine is not allowed to grab any locks, since it may be called
 3255  * with cpus paused (such as from cpu_offline).
 3256  */
 3257 void
 3258 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
 3259 {
 3260         proc_t          *p;
 3261         lpl_t           *lpl, *oldlpl;
 3262         lgrp_id_t       oldid;
 3263         kthread_t       *tp;
 3264         uint_t          ncpu;
 3265         lgrp_load_t     old, new;
 3266 
 3267         ASSERT(t);
 3268         ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
 3269             THREAD_LOCK_HELD(t));
 3270 
 3271         /*
 3272          * If not changing lpls, just return
 3273          */
 3274         if ((oldlpl = t->t_lpl) == newlpl)
 3275                 return;
 3276 
 3277         /*
 3278          * Make sure the thread's lwp hasn't exited (if so, this thread is now
 3279          * associated with process 0 rather than with its original process).
 3280          */
 3281         if (t->t_proc_flag & TP_LWPEXIT) {
 3282                 if (newlpl != NULL) {
 3283                         t->t_lpl = newlpl;
 3284                 }
 3285                 return;
 3286         }
 3287 
 3288         p = ttoproc(t);
 3289 
 3290         /*
 3291          * If the thread had a previous lgroup, update its process' p_lgrpset
 3292          * to account for it being moved from its old lgroup.
 3293          */
 3294         if ((oldlpl != NULL) && /* thread had a previous lgroup */
 3295             (p->p_tlist != NULL)) {
 3296                 oldid = oldlpl->lpl_lgrpid;
 3297 
 3298                 if (newlpl != NULL)
 3299                         lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
 3300 
 3301                 if ((do_lgrpset_delete) &&
 3302                     (klgrpset_ismember(p->p_lgrpset, oldid))) {
 3303                         for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
 3304                                 /*
 3305                                  * Check if a thread other than the thread
 3306                                  * that's moving is assigned to the same
 3307                                  * lgroup as the thread that's moving.  Note
 3308                                  * that we have to compare lgroup IDs, rather
 3309                                  * than simply comparing t_lpl's, since the
 3310                                  * threads may belong to different partitions
 3311                                  * but be assigned to the same lgroup.
 3312                                  */
 3313                                 ASSERT(tp->t_lpl != NULL);
 3314 
 3315                                 if ((tp != t) &&
 3316                                     (tp->t_lpl->lpl_lgrpid == oldid)) {
 3317                                         /*
 3318                                          * Another thread is assigned to the
 3319                                          * same lgroup as the thread that's
 3320                                          * moving, p_lgrpset doesn't change.
 3321                                          */
 3322                                         break;
 3323                                 } else if (tp == p->p_tlist) {
 3324                                         /*
 3325                                          * No other thread is assigned to the
 3326                                          * same lgroup as the exiting thread,
 3327                                          * clear the lgroup's bit in p_lgrpset.
 3328                                          */
 3329                                         klgrpset_del(p->p_lgrpset, oldid);
 3330                                         break;
 3331                                 }
 3332                         }
 3333                 }
 3334 
 3335                 /*
 3336                  * If this thread was assigned to its old lgroup for such a
 3337                  * short amount of time that the anticipatory load that was
 3338                  * added on its behalf has aged very little, remove that
 3339                  * anticipatory load.
 3340                  */
 3341                 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
 3342                     ((ncpu = oldlpl->lpl_ncpu) > 0)) {
 3343                         lpl = oldlpl;
 3344                         for (;;) {
 3345                                 do {
 3346                                         old = new = lpl->lpl_loadavg;
 3347                                         new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
 3348                                         if (new > old) {
 3349                                                 /*
 3350                                                  * this can happen if the load
 3351                                                  * average was aged since we
 3352                                                  * added in the anticipatory
 3353                                                  * load
 3354                                                  */
 3355                                                 new = 0;
 3356                                         }
 3357                                 } while (cas32(
 3358                                     (lgrp_load_t *)&lpl->lpl_loadavg, old,
 3359                                     new) != old);
 3360 
 3361                                 lpl = lpl->lpl_parent;
 3362                                 if (lpl == NULL)
 3363                                         break;
 3364 
 3365                                 ncpu = lpl->lpl_ncpu;
 3366                                 ASSERT(ncpu > 0);
 3367                         }
 3368                 }
 3369         }
 3370         /*
 3371          * If the thread has a new lgroup (i.e. it's not exiting), update its
 3372          * t_lpl and its process' p_lgrpset, and apply an anticipatory load
 3373          * to its new lgroup to account for its move to its new lgroup.
 3374          */
 3375         if (newlpl != NULL) {
 3376                 /*
 3377                  * This thread is moving to a new lgroup
 3378                  */
 3379                 t->t_lpl = newlpl;
 3380                 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
 3381                         p->p_t1_lgrpid = newlpl->lpl_lgrpid;
 3382                         membar_producer();
 3383                         if (p->p_tr_lgrpid != LGRP_NONE &&
 3384                             p->p_tr_lgrpid != p->p_t1_lgrpid) {
 3385                                 lgrp_update_trthr_migrations(1);
 3386                         }
 3387                 }
 3388 
 3389                 /*
 3390                  * Reflect move in load average of new lgroup
 3391                  * unless it is root lgroup
 3392                  */
 3393                 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
 3394                         return;
 3395 
 3396                 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
 3397                         klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
 3398                 }
 3399 
 3400                 /*
 3401                  * It'll take some time for the load on the new lgroup
 3402                  * to reflect this thread's placement on it.  We'd
 3403                  * like not, however, to have all threads between now
 3404                  * and then also piling on to this lgroup.  To avoid
 3405                  * this pileup, we anticipate the load this thread
 3406                  * will generate on its new lgroup.  The goal is to
 3407                  * make the lgroup's load appear as though the thread
 3408                  * had been there all along.  We're very conservative
 3409                  * in calculating this anticipatory load, we assume
 3410                  * the worst case case (100% CPU-bound thread).  This
 3411                  * may be modified in the future to be more accurate.
 3412                  */
 3413                 lpl = newlpl;
 3414                 for (;;) {
 3415                         ncpu = lpl->lpl_ncpu;
 3416                         ASSERT(ncpu > 0);
 3417                         do {
 3418                                 old = new = lpl->lpl_loadavg;
 3419                                 new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
 3420                                 /*
 3421                                  * Check for overflow
 3422                                  * Underflow not possible here
 3423                                  */
 3424                                 if (new < old)
 3425                                         new = UINT32_MAX;
 3426                         } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
 3427                             new) != old);
 3428 
 3429                         lpl = lpl->lpl_parent;
 3430                         if (lpl == NULL)
 3431                                 break;
 3432                 }
 3433                 t->t_anttime = gethrtime();
 3434         }
 3435 }
 3436 
 3437 /*
 3438  * Return lgroup memory allocation policy given advice from madvise(3C)
 3439  */
 3440 lgrp_mem_policy_t
 3441 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
 3442 {
 3443         switch (advice) {
 3444         case MADV_ACCESS_LWP:
 3445                 return (LGRP_MEM_POLICY_NEXT);
 3446         case MADV_ACCESS_MANY:
 3447                 return (LGRP_MEM_POLICY_RANDOM);
 3448         default:
 3449                 return (lgrp_mem_policy_default(size, type));
 3450         }
 3451 }
 3452 
 3453 /*
 3454  * Figure out default policy
 3455  */
 3456 lgrp_mem_policy_t
 3457 lgrp_mem_policy_default(size_t size, int type)
 3458 {
 3459         cpupart_t               *cp;
 3460         lgrp_mem_policy_t       policy;
 3461         size_t                  pset_mem_size;
 3462 
 3463         /*
 3464          * Randomly allocate memory across lgroups for shared memory
 3465          * beyond a certain threshold
 3466          */
 3467         if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
 3468             (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
 3469                 /*
 3470                  * Get total memory size of current thread's pset
 3471                  */
 3472                 kpreempt_disable();
 3473                 cp = curthread->t_cpupart;
 3474                 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
 3475                 kpreempt_enable();
 3476 
 3477                 /*
 3478                  * Choose policy to randomly allocate memory across
 3479                  * lgroups in pset if it will fit and is not default
 3480                  * partition.  Otherwise, allocate memory randomly
 3481                  * across machine.
 3482                  */
 3483                 if (lgrp_mem_pset_aware && size < pset_mem_size)
 3484                         policy = LGRP_MEM_POLICY_RANDOM_PSET;
 3485                 else
 3486                         policy = LGRP_MEM_POLICY_RANDOM;
 3487         } else
 3488                 /*
 3489                  * Apply default policy for private memory and
 3490                  * shared memory under the respective random
 3491                  * threshold.
 3492                  */
 3493                 policy = lgrp_mem_default_policy;
 3494 
 3495         return (policy);
 3496 }
 3497 
 3498 /*
 3499  * Get memory allocation policy for this segment
 3500  */
 3501 lgrp_mem_policy_info_t *
 3502 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
 3503 {
 3504         lgrp_mem_policy_info_t  *policy_info;
 3505         extern struct seg_ops   segspt_ops;
 3506         extern struct seg_ops   segspt_shmops;
 3507 
 3508         /*
 3509          * This is for binary compatibility to protect against third party
 3510          * segment drivers which haven't recompiled to allow for
 3511          * SEGOP_GETPOLICY()
 3512          */
 3513         if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
 3514             seg->s_ops != &segspt_shmops)
 3515                 return (NULL);
 3516 
 3517         policy_info = NULL;
 3518         if (seg->s_ops->getpolicy != NULL)
 3519                 policy_info = SEGOP_GETPOLICY(seg, vaddr);
 3520 
 3521         return (policy_info);
 3522 }
 3523 
 3524 /*
 3525  * Set policy for allocating private memory given desired policy, policy info,
 3526  * size in bytes of memory that policy is being applied.
 3527  * Return 0 if policy wasn't set already and 1 if policy was set already
 3528  */
 3529 int
 3530 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
 3531     lgrp_mem_policy_info_t *policy_info, size_t size)
 3532 {
 3533 
 3534         ASSERT(policy_info != NULL);
 3535 
 3536         if (policy == LGRP_MEM_POLICY_DEFAULT)
 3537                 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
 3538 
 3539         /*
 3540          * Policy set already?
 3541          */
 3542         if (policy == policy_info->mem_policy)
 3543                 return (1);
 3544 
 3545         /*
 3546          * Set policy
 3547          */
 3548         policy_info->mem_policy = policy;
 3549         policy_info->mem_lgrpid = LGRP_NONE;
 3550 
 3551         return (0);
 3552 }
 3553 
 3554 
 3555 /*
 3556  * Get shared memory allocation policy with given tree and offset
 3557  */
 3558 lgrp_mem_policy_info_t *
 3559 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
 3560     u_offset_t vn_off)
 3561 {
 3562         u_offset_t              off;
 3563         lgrp_mem_policy_info_t  *policy_info;
 3564         lgrp_shm_policy_seg_t   *policy_seg;
 3565         lgrp_shm_locality_t     *shm_locality;
 3566         avl_tree_t              *tree;
 3567         avl_index_t             where;
 3568 
 3569         /*
 3570          * Get policy segment tree from anon_map or vnode and use specified
 3571          * anon index or vnode offset as offset
 3572          *
 3573          * Assume that no lock needs to be held on anon_map or vnode, since
 3574          * they should be protected by their reference count which must be
 3575          * nonzero for an existing segment
 3576          */
 3577         if (amp) {
 3578                 ASSERT(amp->refcnt != 0);
 3579                 shm_locality = amp->locality;
 3580                 if (shm_locality == NULL)
 3581                         return (NULL);
 3582                 tree = shm_locality->loc_tree;
 3583                 off = ptob(anon_index);
 3584         } else if (vp) {
 3585                 shm_locality = vp->v_locality;
 3586                 if (shm_locality == NULL)
 3587                         return (NULL);
 3588                 ASSERT(shm_locality->loc_count != 0);
 3589                 tree = shm_locality->loc_tree;
 3590                 off = vn_off;
 3591         }
 3592 
 3593         if (tree == NULL)
 3594                 return (NULL);
 3595 
 3596         /*
 3597          * Lookup policy segment for offset into shared object and return
 3598          * policy info
 3599          */
 3600         rw_enter(&shm_locality->loc_lock, RW_READER);
 3601         policy_info = NULL;
 3602         policy_seg = avl_find(tree, &off, &where);
 3603         if (policy_seg)
 3604                 policy_info = &policy_seg->shm_policy;
 3605         rw_exit(&shm_locality->loc_lock);
 3606 
 3607         return (policy_info);
 3608 }
 3609 
 3610 /*
 3611  * Default memory allocation policy for kernel segmap pages
 3612  */
 3613 lgrp_mem_policy_t       lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
 3614 
 3615 /*
 3616  * Return lgroup to use for allocating memory
 3617  * given the segment and address
 3618  *
 3619  * There isn't any mutual exclusion that exists between calls
 3620  * to this routine and DR, so this routine and whomever calls it
 3621  * should be mindful of the possibility that the lgrp returned
 3622  * may be deleted. If this happens, dereferences of the lgrp
 3623  * pointer will still be safe, but the resources in the lgrp will
 3624  * be gone, and LGRP_EXISTS() will no longer be true.
 3625  */
 3626 lgrp_t *
 3627 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
 3628 {
 3629         int                     i;
 3630         lgrp_t                  *lgrp;
 3631         klgrpset_t              lgrpset;
 3632         int                     lgrps_spanned;
 3633         unsigned long           off;
 3634         lgrp_mem_policy_t       policy;
 3635         lgrp_mem_policy_info_t  *policy_info;
 3636         ushort_t                random;
 3637         int                     stat = 0;
 3638         extern struct seg       *segkmap;
 3639 
 3640         /*
 3641          * Just return the root lgroup if the lgrp framework hasn't finished
 3642          * initializing or if this is a UMA machine.
 3643          */
 3644         if (nlgrps == 1 || !lgrp_initialized)
 3645                 return (lgrp_root);
 3646 
 3647         /*
 3648          * Get memory allocation policy for this segment
 3649          */
 3650         policy = lgrp_mem_default_policy;
 3651         if (seg != NULL) {
 3652                 if (seg->s_as == &kas) {
 3653                         if (seg == segkmap)
 3654                                 policy = lgrp_segmap_default_policy;
 3655                         if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
 3656                             policy == LGRP_MEM_POLICY_RANDOM_PSET)
 3657                                 policy = LGRP_MEM_POLICY_RANDOM;
 3658                 } else {
 3659                         policy_info = lgrp_mem_policy_get(seg, vaddr);
 3660                         if (policy_info != NULL) {
 3661                                 policy = policy_info->mem_policy;
 3662                                 if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
 3663                                         lgrp_id_t id = policy_info->mem_lgrpid;
 3664                                         ASSERT(id != LGRP_NONE);
 3665                                         ASSERT(id < NLGRPS_MAX);
 3666                                         lgrp = lgrp_table[id];
 3667                                         if (!LGRP_EXISTS(lgrp)) {
 3668                                                 policy = LGRP_MEM_POLICY_NEXT;
 3669                                         } else {
 3670                                                 lgrp_stat_add(id,
 3671                                                     LGRP_NUM_NEXT_SEG, 1);
 3672                                                 return (lgrp);
 3673                                         }
 3674                                 }
 3675                         }
 3676                 }
 3677         }
 3678         lgrpset = 0;
 3679 
 3680         /*
 3681          * Initialize lgroup to home by default
 3682          */
 3683         lgrp = lgrp_home_lgrp();
 3684 
 3685         /*
 3686          * When homing threads on root lgrp, override default memory
 3687          * allocation policies with root lgroup memory allocation policy
 3688          */
 3689         if (lgrp == lgrp_root)
 3690                 policy = lgrp_mem_policy_root;
 3691 
 3692         /*
 3693          * Implement policy
 3694          */
 3695         switch (policy) {
 3696         case LGRP_MEM_POLICY_NEXT_CPU:
 3697 
 3698                 /*
 3699                  * Return lgroup of current CPU which faulted on memory
 3700                  * If the CPU isn't currently in an lgrp, then opt to
 3701                  * allocate from the root.
 3702                  *
 3703                  * Kernel preemption needs to be disabled here to prevent
 3704                  * the current CPU from going away before lgrp is found.
 3705                  */
 3706                 if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
 3707                         lgrp = lgrp_root;
 3708                 } else {
 3709                         kpreempt_disable();
 3710                         lgrp = lgrp_cpu_to_lgrp(CPU);
 3711                         kpreempt_enable();
 3712                 }
 3713                 break;
 3714 
 3715         case LGRP_MEM_POLICY_NEXT:
 3716         case LGRP_MEM_POLICY_DEFAULT:
 3717         default:
 3718 
 3719                 /*
 3720                  * Just return current thread's home lgroup
 3721                  * for default policy (next touch)
 3722                  * If the thread is homed to the root,
 3723                  * then the default policy is random across lgroups.
 3724                  * Fallthrough to the random case.
 3725                  */
 3726                 if (lgrp != lgrp_root) {
 3727                         if (policy == LGRP_MEM_POLICY_NEXT)
 3728                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
 3729                         else
 3730                                 lgrp_stat_add(lgrp->lgrp_id,
 3731                                     LGRP_NUM_DEFAULT, 1);
 3732                         break;
 3733                 }
 3734                 /* LINTED fallthrough on case statement */
 3735         case LGRP_MEM_POLICY_RANDOM:
 3736 
 3737                 /*
 3738                  * Return a random leaf lgroup with memory
 3739                  */
 3740                 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
 3741                 /*
 3742                  * Count how many lgroups are spanned
 3743                  */
 3744                 klgrpset_nlgrps(lgrpset, lgrps_spanned);
 3745 
 3746                 /*
 3747                  * There may be no memnodes in the root lgroup during DR copy
 3748                  * rename on a system with only two boards (memnodes)
 3749                  * configured. In this case just return the root lgrp.
 3750                  */
 3751                 if (lgrps_spanned == 0) {
 3752                         lgrp = lgrp_root;
 3753                         break;
 3754                 }
 3755 
 3756                 /*
 3757                  * Pick a random offset within lgroups spanned
 3758                  * and return lgroup at that offset
 3759                  */
 3760                 random = (ushort_t)gethrtime() >> 4;
 3761                 off = random % lgrps_spanned;
 3762                 ASSERT(off <= lgrp_alloc_max);
 3763 
 3764                 for (i = 0; i <= lgrp_alloc_max; i++) {
 3765                         if (!klgrpset_ismember(lgrpset, i))
 3766                                 continue;
 3767                         if (off)
 3768                                 off--;
 3769                         else {
 3770                                 lgrp = lgrp_table[i];
 3771                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
 3772                                     1);
 3773                                 break;
 3774                         }
 3775                 }
 3776                 break;
 3777 
 3778         case LGRP_MEM_POLICY_RANDOM_PROC:
 3779 
 3780                 /*
 3781                  * Grab copy of bitmask of lgroups spanned by
 3782                  * this process
 3783                  */
 3784                 klgrpset_copy(lgrpset, curproc->p_lgrpset);
 3785                 stat = LGRP_NUM_RANDOM_PROC;
 3786 
 3787                 /* LINTED fallthrough on case statement */
 3788         case LGRP_MEM_POLICY_RANDOM_PSET:
 3789 
 3790                 if (!stat)
 3791                         stat = LGRP_NUM_RANDOM_PSET;
 3792 
 3793                 if (klgrpset_isempty(lgrpset)) {
 3794                         /*
 3795                          * Grab copy of bitmask of lgroups spanned by
 3796                          * this processor set
 3797                          */
 3798                         kpreempt_disable();
 3799                         klgrpset_copy(lgrpset,
 3800                             curthread->t_cpupart->cp_lgrpset);
 3801                         kpreempt_enable();
 3802                 }
 3803 
 3804                 /*
 3805                  * Count how many lgroups are spanned
 3806                  */
 3807                 klgrpset_nlgrps(lgrpset, lgrps_spanned);
 3808                 ASSERT(lgrps_spanned <= nlgrps);
 3809 
 3810                 /*
 3811                  * Probably lgrps_spanned should be always non-zero, but to be
 3812                  * on the safe side we return lgrp_root if it is empty.
 3813                  */
 3814                 if (lgrps_spanned == 0) {
 3815                         lgrp = lgrp_root;
 3816                         break;
 3817                 }
 3818 
 3819                 /*
 3820                  * Pick a random offset within lgroups spanned
 3821                  * and return lgroup at that offset
 3822                  */
 3823                 random = (ushort_t)gethrtime() >> 4;
 3824                 off = random % lgrps_spanned;
 3825                 ASSERT(off <= lgrp_alloc_max);
 3826 
 3827                 for (i = 0; i <= lgrp_alloc_max; i++) {
 3828                         if (!klgrpset_ismember(lgrpset, i))
 3829                                 continue;
 3830                         if (off)
 3831                                 off--;
 3832                         else {
 3833                                 lgrp = lgrp_table[i];
 3834                                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
 3835                                     1);
 3836                                 break;
 3837                         }
 3838                 }
 3839                 break;
 3840 
 3841         case LGRP_MEM_POLICY_ROUNDROBIN:
 3842 
 3843                 /*
 3844                  * Use offset within segment to determine
 3845                  * offset from home lgroup to choose for
 3846                  * next lgroup to allocate memory from
 3847                  */
 3848                 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
 3849                     (lgrp_alloc_max + 1);
 3850 
 3851                 kpreempt_disable();
 3852                 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
 3853                 i = lgrp->lgrp_id;
 3854                 kpreempt_enable();
 3855 
 3856                 while (off > 0) {
 3857                         i = (i + 1) % (lgrp_alloc_max + 1);
 3858                         lgrp = lgrp_table[i];
 3859                         if (klgrpset_ismember(lgrpset, i))
 3860                                 off--;
 3861                 }
 3862                 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
 3863 
 3864                 break;
 3865         }
 3866 
 3867         ASSERT(lgrp != NULL);
 3868         return (lgrp);
 3869 }
 3870 
 3871 /*
 3872  * Return the number of pages in an lgroup
 3873  *
 3874  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
 3875  *       could cause tests that rely on the numat driver to fail....
 3876  */
 3877 pgcnt_t
 3878 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
 3879 {
 3880         lgrp_t *lgrp;
 3881 
 3882         lgrp = lgrp_table[lgrpid];
 3883         if (!LGRP_EXISTS(lgrp) ||
 3884             klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
 3885             !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
 3886                 return (0);
 3887 
 3888         return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
 3889 }
 3890 
 3891 /*
 3892  * Initialize lgroup shared memory allocation policy support
 3893  */
 3894 void
 3895 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
 3896 {
 3897         lgrp_shm_locality_t     *shm_locality;
 3898 
 3899         /*
 3900          * Initialize locality field in anon_map
 3901          * Don't need any locks because this is called when anon_map is
 3902          * allocated, but not used anywhere yet.
 3903          */
 3904         if (amp) {
 3905                 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 3906                 if (amp->locality == NULL) {
 3907                         /*
 3908                          * Allocate and initialize shared memory locality info
 3909                          * and set anon_map locality pointer to it
 3910                          * Drop lock across kmem_alloc(KM_SLEEP)
 3911                          */
 3912                         ANON_LOCK_EXIT(&amp->a_rwlock);
 3913                         shm_locality = kmem_alloc(sizeof (*shm_locality),
 3914                             KM_SLEEP);
 3915                         rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
 3916                             NULL);
 3917                         shm_locality->loc_count = 1;    /* not used for amp */
 3918                         shm_locality->loc_tree = NULL;
 3919 
 3920                         /*
 3921                          * Reacquire lock and check to see whether anyone beat
 3922                          * us to initializing the locality info
 3923                          */
 3924                         ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 3925                         if (amp->locality != NULL) {
 3926                                 rw_destroy(&shm_locality->loc_lock);
 3927                                 kmem_free(shm_locality,
 3928                                     sizeof (*shm_locality));
 3929                         } else
 3930                                 amp->locality = shm_locality;
 3931                 }
 3932                 ANON_LOCK_EXIT(&amp->a_rwlock);
 3933                 return;
 3934         }
 3935 
 3936         /*
 3937          * Allocate shared vnode policy info if vnode is not locality aware yet
 3938          */
 3939         mutex_enter(&vp->v_lock);
 3940         if ((vp->v_flag & V_LOCALITY) == 0) {
 3941                 /*
 3942                  * Allocate and initialize shared memory locality info
 3943                  */
 3944                 mutex_exit(&vp->v_lock);
 3945                 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
 3946                 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
 3947                 shm_locality->loc_count = 1;
 3948                 shm_locality->loc_tree = NULL;
 3949 
 3950                 /*
 3951                  * Point vnode locality field at shared vnode policy info
 3952                  * and set locality aware flag in vnode
 3953                  */
 3954                 mutex_enter(&vp->v_lock);
 3955                 if ((vp->v_flag & V_LOCALITY) == 0) {
 3956                         vp->v_locality = shm_locality;
 3957                         vp->v_flag |= V_LOCALITY;
 3958                 } else {
 3959                         /*
 3960                          * Lost race so free locality info and increment count.
 3961                          */
 3962                         rw_destroy(&shm_locality->loc_lock);
 3963                         kmem_free(shm_locality, sizeof (*shm_locality));
 3964                         shm_locality = vp->v_locality;
 3965                         shm_locality->loc_count++;
 3966                 }
 3967                 mutex_exit(&vp->v_lock);
 3968 
 3969                 return;
 3970         }
 3971 
 3972         /*
 3973          * Increment reference count of number of segments mapping this vnode
 3974          * shared
 3975          */
 3976         shm_locality = vp->v_locality;
 3977         shm_locality->loc_count++;
 3978         mutex_exit(&vp->v_lock);
 3979 }
 3980 
 3981 /*
 3982  * Destroy the given shared memory policy segment tree
 3983  */
 3984 void
 3985 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
 3986 {
 3987         lgrp_shm_policy_seg_t   *cur;
 3988         lgrp_shm_policy_seg_t   *next;
 3989 
 3990         if (tree == NULL)
 3991                 return;
 3992 
 3993         cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
 3994         while (cur != NULL) {
 3995                 next = AVL_NEXT(tree, cur);
 3996                 avl_remove(tree, cur);
 3997                 kmem_free(cur, sizeof (*cur));
 3998                 cur = next;
 3999         }
 4000         kmem_free(tree, sizeof (avl_tree_t));
 4001 }
 4002 
 4003 /*
 4004  * Uninitialize lgroup shared memory allocation policy support
 4005  */
 4006 void
 4007 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
 4008 {
 4009         lgrp_shm_locality_t     *shm_locality;
 4010 
 4011         /*
 4012          * For anon_map, deallocate shared memory policy tree and
 4013          * zero locality field
 4014          * Don't need any locks because anon_map is being freed
 4015          */
 4016         if (amp) {
 4017                 if (amp->locality == NULL)
 4018                         return;
 4019                 shm_locality = amp->locality;
 4020                 shm_locality->loc_count = 0;    /* not really used for amp */
 4021                 rw_destroy(&shm_locality->loc_lock);
 4022                 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
 4023                 kmem_free(shm_locality, sizeof (*shm_locality));
 4024                 amp->locality = 0;
 4025                 return;
 4026         }
 4027 
 4028         /*
 4029          * For vnode, decrement reference count of segments mapping this vnode
 4030          * shared and delete locality info if reference count drops to 0
 4031          */
 4032         mutex_enter(&vp->v_lock);
 4033         shm_locality = vp->v_locality;
 4034         shm_locality->loc_count--;
 4035 
 4036         if (shm_locality->loc_count == 0) {
 4037                 rw_destroy(&shm_locality->loc_lock);
 4038                 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
 4039                 kmem_free(shm_locality, sizeof (*shm_locality));
 4040                 vp->v_locality = 0;
 4041                 vp->v_flag &= ~V_LOCALITY;
 4042         }
 4043         mutex_exit(&vp->v_lock);
 4044 }
 4045 
 4046 /*
 4047  * Compare two shared memory policy segments
 4048  * Used by AVL tree code for searching
 4049  */
 4050 int
 4051 lgrp_shm_policy_compar(const void *x, const void *y)
 4052 {
 4053         lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
 4054         lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
 4055 
 4056         if (a->shm_off < b->shm_off)
 4057                 return (-1);
 4058         if (a->shm_off >= b->shm_off + b->shm_size)
 4059                 return (1);
 4060         return (0);
 4061 }
 4062 
 4063 /*
 4064  * Concatenate seg1 with seg2 and remove seg2
 4065  */
 4066 static int
 4067 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
 4068     lgrp_shm_policy_seg_t *seg2)
 4069 {
 4070         if (!seg1 || !seg2 ||
 4071             seg1->shm_off + seg1->shm_size != seg2->shm_off ||
 4072             seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
 4073                 return (-1);
 4074 
 4075         seg1->shm_size += seg2->shm_size;
 4076         avl_remove(tree, seg2);
 4077         kmem_free(seg2, sizeof (*seg2));
 4078         return (0);
 4079 }
 4080 
 4081 /*
 4082  * Split segment at given offset and return rightmost (uppermost) segment
 4083  * Assumes that there are no overlapping segments
 4084  */
 4085 static lgrp_shm_policy_seg_t *
 4086 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
 4087     u_offset_t off)
 4088 {
 4089         lgrp_shm_policy_seg_t   *newseg;
 4090         avl_index_t             where;
 4091 
 4092         ASSERT(seg != NULL);
 4093         ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
 4094 
 4095         if (!seg || off < seg->shm_off || off > seg->shm_off +
 4096             seg->shm_size)
 4097                 return (NULL);
 4098 
 4099         if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
 4100                 return (seg);
 4101 
 4102         /*
 4103          * Adjust size of left segment and allocate new (right) segment
 4104          */
 4105         newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
 4106         newseg->shm_policy = seg->shm_policy;
 4107         newseg->shm_off = off;
 4108         newseg->shm_size = seg->shm_size - (off - seg->shm_off);
 4109         seg->shm_size = off - seg->shm_off;
 4110 
 4111         /*
 4112          * Find where to insert new segment in AVL tree and insert it
 4113          */
 4114         (void) avl_find(tree, &off, &where);
 4115         avl_insert(tree, newseg, where);
 4116 
 4117         return (newseg);
 4118 }
 4119 
 4120 /*
 4121  * Set shared memory allocation policy on specified shared object at given
 4122  * offset and length
 4123  *
 4124  * Return 0 if policy wasn't set already, 1 if policy was set already, and
 4125  * -1 if can't set policy.
 4126  */
 4127 int
 4128 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
 4129     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
 4130 {
 4131         u_offset_t              eoff;
 4132         lgrp_shm_policy_seg_t   *next;
 4133         lgrp_shm_policy_seg_t   *newseg;
 4134         u_offset_t              off;
 4135         u_offset_t              oldeoff;
 4136         lgrp_shm_policy_seg_t   *prev;
 4137         int                     retval;
 4138         lgrp_shm_policy_seg_t   *seg;
 4139         lgrp_shm_locality_t     *shm_locality;
 4140         avl_tree_t              *tree;
 4141         avl_index_t             where;
 4142 
 4143         ASSERT(amp || vp);
 4144         ASSERT((len & PAGEOFFSET) == 0);
 4145 
 4146         if (len == 0)
 4147                 return (-1);
 4148 
 4149         retval = 0;
 4150 
 4151         /*
 4152          * Get locality info and starting offset into shared object
 4153          * Try anon map first and then vnode
 4154          * Assume that no locks need to be held on anon_map or vnode, since
 4155          * it should be protected by its reference count which must be nonzero
 4156          * for an existing segment.
 4157          */
 4158         if (amp) {
 4159                 /*
 4160                  * Get policy info from anon_map
 4161                  *
 4162                  */
 4163                 ASSERT(amp->refcnt != 0);
 4164                 if (amp->locality == NULL)
 4165                         lgrp_shm_policy_init(amp, NULL);
 4166                 shm_locality = amp->locality;
 4167                 off = ptob(anon_index);
 4168         } else if (vp) {
 4169                 /*
 4170                  * Get policy info from vnode
 4171                  */
 4172                 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
 4173                         lgrp_shm_policy_init(NULL, vp);
 4174                 shm_locality = vp->v_locality;
 4175                 ASSERT(shm_locality->loc_count != 0);
 4176                 off = vn_off;
 4177         } else
 4178                 return (-1);
 4179 
 4180         ASSERT((off & PAGEOFFSET) == 0);
 4181 
 4182         /*
 4183          * Figure out default policy
 4184          */
 4185         if (policy == LGRP_MEM_POLICY_DEFAULT)
 4186                 policy = lgrp_mem_policy_default(len, MAP_SHARED);
 4187 
 4188         /*
 4189          * Create AVL tree if there isn't one yet
 4190          * and set locality field to point at it
 4191          */
 4192         rw_enter(&shm_locality->loc_lock, RW_WRITER);
 4193         tree = shm_locality->loc_tree;
 4194         if (!tree) {
 4195                 rw_exit(&shm_locality->loc_lock);
 4196 
 4197                 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
 4198 
 4199                 rw_enter(&shm_locality->loc_lock, RW_WRITER);
 4200                 if (shm_locality->loc_tree == NULL) {
 4201                         avl_create(tree, lgrp_shm_policy_compar,
 4202                             sizeof (lgrp_shm_policy_seg_t),
 4203                             offsetof(lgrp_shm_policy_seg_t, shm_tree));
 4204                         shm_locality->loc_tree = tree;
 4205                 } else {
 4206                         /*
 4207                          * Another thread managed to set up the tree
 4208                          * before we could. Free the tree we allocated
 4209                          * and use the one that's already there.
 4210                          */
 4211                         kmem_free(tree, sizeof (*tree));
 4212                         tree = shm_locality->loc_tree;
 4213                 }
 4214         }
 4215 
 4216         /*
 4217          * Set policy
 4218          *
 4219          * Need to maintain hold on writer's lock to keep tree from
 4220          * changing out from under us
 4221          */
 4222         while (len != 0) {
 4223                 /*
 4224                  * Find policy segment for specified offset into shared object
 4225                  */
 4226                 seg = avl_find(tree, &off, &where);
 4227 
 4228                 /*
 4229                  * Didn't find any existing segment that contains specified
 4230                  * offset, so allocate new segment, insert it, and concatenate
 4231                  * with adjacent segments if possible
 4232                  */
 4233                 if (seg == NULL) {
 4234                         newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
 4235                             KM_SLEEP);
 4236                         newseg->shm_policy.mem_policy = policy;
 4237                         newseg->shm_policy.mem_lgrpid = LGRP_NONE;
 4238                         newseg->shm_off = off;
 4239                         avl_insert(tree, newseg, where);
 4240 
 4241                         /*
 4242                          * Check to see whether new segment overlaps with next
 4243                          * one, set length of new segment accordingly, and
 4244                          * calculate remaining length and next offset
 4245                          */
 4246                         seg = AVL_NEXT(tree, newseg);
 4247                         if (seg == NULL || off + len <= seg->shm_off) {
 4248                                 newseg->shm_size = len;
 4249                                 len = 0;
 4250                         } else {
 4251                                 newseg->shm_size = seg->shm_off - off;
 4252                                 off = seg->shm_off;
 4253                                 len -= newseg->shm_size;
 4254                         }
 4255 
 4256                         /*
 4257                          * Try to concatenate new segment with next and
 4258                          * previous ones, since they might have the same policy
 4259                          * now.  Grab previous and next segments first because
 4260                          * they will change on concatenation.
 4261                          */
 4262                         prev =  AVL_PREV(tree, newseg);
 4263                         next = AVL_NEXT(tree, newseg);
 4264                         (void) lgrp_shm_policy_concat(tree, newseg, next);
 4265                         (void) lgrp_shm_policy_concat(tree, prev, newseg);
 4266 
 4267                         continue;
 4268                 }
 4269 
 4270                 eoff = off + len;
 4271                 oldeoff = seg->shm_off + seg->shm_size;
 4272 
 4273                 /*
 4274                  * Policy set already?
 4275                  */
 4276                 if (policy == seg->shm_policy.mem_policy) {
 4277                         /*
 4278                          * Nothing left to do if offset and length
 4279                          * fall within this segment
 4280                          */
 4281                         if (eoff <= oldeoff) {
 4282                                 retval = 1;
 4283                                 break;
 4284                         } else {
 4285                                 len = eoff - oldeoff;
 4286                                 off = oldeoff;
 4287                                 continue;
 4288                         }
 4289                 }
 4290 
 4291                 /*
 4292                  * Specified offset and length match existing segment exactly
 4293                  */
 4294                 if (off == seg->shm_off && len == seg->shm_size) {
 4295                         /*
 4296                          * Set policy and update current length
 4297                          */
 4298                         seg->shm_policy.mem_policy = policy;
 4299                         seg->shm_policy.mem_lgrpid = LGRP_NONE;
 4300                         len = 0;
 4301 
 4302                         /*
 4303                          * Try concatenating new segment with previous and next
 4304                          * segments, since they might have the same policy now.
 4305                          * Grab previous and next segments first because they
 4306                          * will change on concatenation.
 4307                          */
 4308                         prev =  AVL_PREV(tree, seg);
 4309                         next = AVL_NEXT(tree, seg);
 4310                         (void) lgrp_shm_policy_concat(tree, seg, next);
 4311                         (void) lgrp_shm_policy_concat(tree, prev, seg);
 4312                 } else {
 4313                         /*
 4314                          * Specified offset and length only apply to part of
 4315                          * existing segment
 4316                          */
 4317 
 4318                         /*
 4319                          * New segment starts in middle of old one, so split
 4320                          * new one off near beginning of old one
 4321                          */
 4322                         newseg = NULL;
 4323                         if (off > seg->shm_off) {
 4324                                 newseg = lgrp_shm_policy_split(tree, seg, off);
 4325 
 4326                                 /*
 4327                                  * New segment ends where old one did, so try
 4328                                  * to concatenate with next segment
 4329                                  */
 4330                                 if (eoff == oldeoff) {
 4331                                         newseg->shm_policy.mem_policy = policy;
 4332                                         newseg->shm_policy.mem_lgrpid =
 4333                                             LGRP_NONE;
 4334                                         (void) lgrp_shm_policy_concat(tree,
 4335                                             newseg, AVL_NEXT(tree, newseg));
 4336                                         break;
 4337                                 }
 4338                         }
 4339 
 4340                         /*
 4341                          * New segment ends before old one, so split off end of
 4342                          * old one
 4343                          */
 4344                         if (eoff < oldeoff) {
 4345                                 if (newseg) {
 4346                                         (void) lgrp_shm_policy_split(tree,
 4347                                             newseg, eoff);
 4348                                         newseg->shm_policy.mem_policy = policy;
 4349                                         newseg->shm_policy.mem_lgrpid =
 4350                                             LGRP_NONE;
 4351                                 } else {
 4352                                         (void) lgrp_shm_policy_split(tree, seg,
 4353                                             eoff);
 4354                                         seg->shm_policy.mem_policy = policy;
 4355                                         seg->shm_policy.mem_lgrpid = LGRP_NONE;
 4356                                 }
 4357 
 4358                                 if (off == seg->shm_off)
 4359                                         (void) lgrp_shm_policy_concat(tree,
 4360                                             AVL_PREV(tree, seg), seg);
 4361                                 break;
 4362                         }
 4363 
 4364                         /*
 4365                          * Calculate remaining length and next offset
 4366                          */
 4367                         len = eoff - oldeoff;
 4368                         off = oldeoff;
 4369                 }
 4370         }
 4371 
 4372         rw_exit(&shm_locality->loc_lock);
 4373         return (retval);
 4374 }
 4375 
 4376 /*
 4377  * Return the best memnode from which to allocate memory given
 4378  * an lgroup.
 4379  *
 4380  * "c" is for cookie, which is good enough for me.
 4381  * It references a cookie struct that should be zero'ed to initialize.
 4382  * The cookie should live on the caller's stack.
 4383  *
 4384  * The routine returns -1 when:
 4385  *      - traverse is 0, and all the memnodes in "lgrp" have been returned.
 4386  *      - traverse is 1, and all the memnodes in the system have been
 4387  *        returned.
 4388  */
 4389 int
 4390 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
 4391 {
 4392         lgrp_t          *lp = c->lmc_lgrp;
 4393         mnodeset_t      nodes = c->lmc_nodes;
 4394         int             cnt = c->lmc_cnt;
 4395         int             offset, mnode;
 4396 
 4397         extern int      max_mem_nodes;
 4398 
 4399         /*
 4400          * If the set is empty, and the caller is willing, traverse
 4401          * up the hierarchy until we find a non-empty set.
 4402          */
 4403         while (nodes == (mnodeset_t)0 || cnt <= 0) {
 4404                 if (c->lmc_scope == LGRP_SRCH_LOCAL ||
 4405                     ((lp = lp->lgrp_parent) == NULL))
 4406                         return (-1);
 4407 
 4408                 nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
 4409                 cnt = lp->lgrp_nmnodes - c->lmc_ntried;
 4410         }
 4411 
 4412         /*
 4413          * Select a memnode by picking one at a "random" offset.
 4414          * Because of DR, memnodes can come and go at any time.
 4415          * This code must be able to cope with the possibility
 4416          * that the nodes count "cnt" is inconsistent with respect
 4417          * to the number of elements actually in "nodes", and
 4418          * therefore that the offset chosen could be greater than
 4419          * the number of elements in the set (some memnodes may
 4420          * have dissapeared just before cnt was read).
 4421          * If this happens, the search simply wraps back to the
 4422          * beginning of the set.
 4423          */
 4424         ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
 4425         offset = c->lmc_rand % cnt;
 4426         do {
 4427                 for (mnode = 0; mnode < max_mem_nodes; mnode++)
 4428                         if (nodes & ((mnodeset_t)1 << mnode))
 4429                                 if (!offset--)
 4430                                         break;
 4431         } while (mnode >= max_mem_nodes);
 4432 
 4433         /* Found a node. Store state before returning. */
 4434         c->lmc_lgrp = lp;
 4435         c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
 4436         c->lmc_cnt = cnt - 1;
 4437         c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
 4438         c->lmc_ntried++;
 4439 
 4440         return (mnode);
 4441 }

Cache object: 4b6ca1dd3e4edd34846246b06220c599


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.