FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_cpuset.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
    5  * All rights reserved.
    6  * 
    7  * Copyright (c) 2008 Nokia Corporation
    8  * All rights reserved.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice unmodified, this list of conditions, and the following
   15  *    disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   30  *
   31  */
   32 
   33 #include <sys/cdefs.h>
   34 __FBSDID("$FreeBSD: releng/12.0/sys/kern/kern_cpuset.c 340401 2018-11-13 18:21:47Z markj $");
   35 
   36 #include "opt_ddb.h"
   37 
   38 #include <sys/param.h>
   39 #include <sys/systm.h>
   40 #include <sys/sysctl.h>
   41 #include <sys/ctype.h>
   42 #include <sys/sysproto.h>
   43 #include <sys/jail.h>
   44 #include <sys/kernel.h>
   45 #include <sys/lock.h>
   46 #include <sys/malloc.h>
   47 #include <sys/mutex.h>
   48 #include <sys/priv.h>
   49 #include <sys/proc.h>
   50 #include <sys/refcount.h>
   51 #include <sys/sched.h>
   52 #include <sys/smp.h>
   53 #include <sys/syscallsubr.h>
   54 #include <sys/capsicum.h>
   55 #include <sys/cpuset.h>
   56 #include <sys/domainset.h>
   57 #include <sys/sx.h>
   58 #include <sys/queue.h>
   59 #include <sys/libkern.h>
   60 #include <sys/limits.h>
   61 #include <sys/bus.h>
   62 #include <sys/interrupt.h>
   63 #include <sys/vmmeter.h>
   64 
   65 #include <vm/uma.h>
   66 #include <vm/vm.h>
   67 #include <vm/vm_object.h>
   68 #include <vm/vm_page.h>
   69 #include <vm/vm_pageout.h>
   70 #include <vm/vm_extern.h>
   71 #include <vm/vm_param.h>
   72 #include <vm/vm_phys.h>
   73 #include <vm/vm_pagequeue.h>
   74 
   75 #ifdef DDB
   76 #include <ddb/ddb.h>
   77 #endif /* DDB */
   78 
   79 /*
   80  * cpusets provide a mechanism for creating and manipulating sets of
   81  * processors for the purpose of constraining the scheduling of threads to
   82  * specific processors.
   83  *
    84  * Each process belongs to an identified set; by default this is set 1.  Each
    85  * thread may further restrict the cpus it may run on to a subset of this
    86  * named set.  This creates an anonymous set which other threads and processes
    87  * may not join by number.
    88  *
    89  * The named set is referred to herein as the 'base' set to avoid ambiguity.
    90  * This set is usually a child of a 'root' set, while the anonymous set may
    91  * simply be referred to as a mask.  The syscall API calls these the ROOT,
    92  * CPUSET, and MASK levels, where CPUSET corresponds to the 'base' set here.
   93  *
   94  * Threads inherit their set from their creator whether it be anonymous or
   95  * not.  This means that anonymous sets are immutable because they may be
   96  * shared.  To modify an anonymous set a new set is created with the desired
   97  * mask and the same parent as the existing anonymous set.  This gives the
   98  * illusion of each thread having a private mask.
   99  *
   100  * Via the syscall APIs a user may ask to retrieve or modify the root, base,
   101  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
   102  * modifies all numbered and anonymous child sets to comply with the new mask.
   103  * Modifying a pid or tid's mask applies only to that tid, and the new mask
   104  * must still fall within the assigned parent set.
  105  *
  106  * A thread may not be assigned to a group separate from other threads in
  107  * the process.  This is to remove ambiguity when the setid is queried with
  108  * a pid argument.  There is no other technical limitation.
  109  *
  110  * This somewhat complex arrangement is intended to make it easy for
  111  * applications to query available processors and bind their threads to
  112  * specific processors while also allowing administrators to dynamically
  113  * reprovision by changing sets which apply to groups of processes.
  114  *
   115  * A simple application should not concern itself with sets at all and
   116  * should instead apply masks to its own threads via CPU_WHICH_TID and a -1
   117  * id meaning 'curthread'.  It may query the available cpus for that tid
   118  * with a getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  119  */
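
/*
 * As a concrete illustration of the simple-application pattern above, a
 * userland thread could pin itself via the public cpuset_getaffinity(2)/
 * cpuset_setaffinity(2) interface roughly as follows (a sketch, not part
 * of this file):
 *
 *	#include <sys/param.h>
 *	#include <sys/cpuset.h>
 *	#include <err.h>
 *
 *	cpuset_t mask;
 *	int cpu;
 *
 *	// Discover the cpus in this thread's base (CPUSET-level) set.
 *	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_getaffinity");
 *	// Restrict the current thread to the first cpu in that set; this
 *	// installs an anonymous (MASK-level) set as described above.
 *	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
 *		if (CPU_ISSET(cpu, &mask))
 *			break;
 *	CPU_ZERO(&mask);
 *	CPU_SET(cpu, &mask);
 *	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *	    sizeof(mask), &mask) != 0)
 *		err(1, "cpuset_setaffinity");
 */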
  120 
  121 LIST_HEAD(domainlist, domainset);
  122 struct domainset __read_mostly domainset_fixed[MAXMEMDOM];
  123 struct domainset __read_mostly domainset_prefer[MAXMEMDOM];
  124 struct domainset __read_mostly domainset_roundrobin;
  125 
  126 static uma_zone_t cpuset_zone;
  127 static uma_zone_t domainset_zone;
  128 static struct mtx cpuset_lock;
  129 static struct setlist cpuset_ids;
  130 static struct domainlist cpuset_domains;
  131 static struct unrhdr *cpuset_unr;
  132 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
  133 static struct domainset domainset0, domainset2;
  134 
  135 /* Return the size of cpuset_t at the kernel level */
  136 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
  137     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
  138 
  139 cpuset_t *cpuset_root;
  140 cpuset_t cpuset_domain[MAXMEMDOM];
  141 
  142 static int domainset_valid(const struct domainset *, const struct domainset *);
  143 
  144 /*
  145  * Find the first non-anonymous set starting from 'set'.
  146  */
  147 static struct cpuset *
  148 cpuset_getbase(struct cpuset *set)
  149 {
  150 
  151         if (set->cs_id == CPUSET_INVALID)
  152                 set = set->cs_parent;
  153         return (set);
  154 }
  155 
  156 /*
  157  * Walks up the tree from 'set' to find the root.
  158  */
  159 static struct cpuset *
  160 cpuset_getroot(struct cpuset *set)
  161 {
  162 
  163         while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
  164                 set = set->cs_parent;
  165         return (set);
  166 }
  167 
  168 /*
   169  * Acquire a reference to a cpuset; all pointers must be tracked with refs.
  170  */
  171 struct cpuset *
  172 cpuset_ref(struct cpuset *set)
  173 {
  174 
  175         refcount_acquire(&set->cs_ref);
  176         return (set);
  177 }
  178 
  179 /*
  180  * Walks up the tree from 'set' to find the root.  Returns the root
  181  * referenced.
  182  */
  183 static struct cpuset *
  184 cpuset_refroot(struct cpuset *set)
  185 {
  186 
  187         return (cpuset_ref(cpuset_getroot(set)));
  188 }
  189 
  190 /*
  191  * Find the first non-anonymous set starting from 'set'.  Returns this set
  192  * referenced.  May return the passed in set with an extra ref if it is
  193  * not anonymous. 
  194  */
  195 static struct cpuset *
  196 cpuset_refbase(struct cpuset *set)
  197 {
  198 
  199         return (cpuset_ref(cpuset_getbase(set)));
  200 }
  201 
  202 /*
  203  * Release a reference in a context where it is safe to allocate.
  204  */
  205 void
  206 cpuset_rel(struct cpuset *set)
  207 {
  208         cpusetid_t id;
  209 
  210         if (refcount_release(&set->cs_ref) == 0)
  211                 return;
  212         mtx_lock_spin(&cpuset_lock);
  213         LIST_REMOVE(set, cs_siblings);
  214         id = set->cs_id;
  215         if (id != CPUSET_INVALID)
  216                 LIST_REMOVE(set, cs_link);
  217         mtx_unlock_spin(&cpuset_lock);
  218         cpuset_rel(set->cs_parent);
  219         uma_zfree(cpuset_zone, set);
  220         if (id != CPUSET_INVALID)
  221                 free_unr(cpuset_unr, id);
  222 }
  223 
  224 /*
  225  * Deferred release must be used when in a context that is not safe to
  226  * allocate/free.  This places any unreferenced sets on the list 'head'.
  227  */
  228 static void
  229 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
  230 {
  231 
  232         if (refcount_release(&set->cs_ref) == 0)
  233                 return;
  234         mtx_lock_spin(&cpuset_lock);
  235         LIST_REMOVE(set, cs_siblings);
  236         if (set->cs_id != CPUSET_INVALID)
  237                 LIST_REMOVE(set, cs_link);
  238         LIST_INSERT_HEAD(head, set, cs_link);
  239         mtx_unlock_spin(&cpuset_lock);
  240 }
  241 
  242 /*
  243  * Complete a deferred release.  Removes the set from the list provided to
  244  * cpuset_rel_defer.
  245  */
  246 static void
  247 cpuset_rel_complete(struct cpuset *set)
  248 {
  249         LIST_REMOVE(set, cs_link);
  250         cpuset_rel(set->cs_parent);
  251         uma_zfree(cpuset_zone, set);
  252 }
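
/*
 * The two halves above pair as in this sketch (compare cpuset_setproc()
 * below; 'head', 'td', and 'set' are hypothetical locals):
 *
 *	struct setlist head;
 *
 *	LIST_INIT(&head);
 *	thread_lock(td);			// no allocation/free allowed here
 *	cpuset_rel_defer(&head, set);
 *	thread_unlock(td);
 *	while ((set = LIST_FIRST(&head)) != NULL)
 *		cpuset_rel_complete(set);	// safe to free now
 */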
  253 
  254 /*
  255  * Find a set based on an id.  Returns it with a ref.
  256  */
  257 static struct cpuset *
  258 cpuset_lookup(cpusetid_t setid, struct thread *td)
  259 {
  260         struct cpuset *set;
  261 
  262         if (setid == CPUSET_INVALID)
  263                 return (NULL);
  264         mtx_lock_spin(&cpuset_lock);
  265         LIST_FOREACH(set, &cpuset_ids, cs_link)
  266                 if (set->cs_id == setid)
  267                         break;
  268         if (set)
  269                 cpuset_ref(set);
  270         mtx_unlock_spin(&cpuset_lock);
  271 
  272         KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
  273         if (set != NULL && jailed(td->td_ucred)) {
  274                 struct cpuset *jset, *tset;
  275 
  276                 jset = td->td_ucred->cr_prison->pr_cpuset;
  277                 for (tset = set; tset != NULL; tset = tset->cs_parent)
  278                         if (tset == jset)
  279                                 break;
  280                 if (tset == NULL) {
  281                         cpuset_rel(set);
  282                         set = NULL;
  283                 }
  284         }
  285 
  286         return (set);
  287 }
  288 
  289 /*
  290  * Create a set in the space provided in 'set' with the provided parameters.
  291  * The set is returned with a single ref.  May return EDEADLK if the set
  292  * will have no valid cpu based on restrictions from the parent.
  293  */
  294 static int
  295 _cpuset_create(struct cpuset *set, struct cpuset *parent,
  296     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
  297 {
  298 
  299         if (domain == NULL)
  300                 domain = parent->cs_domain;
  301         if (mask == NULL)
  302                 mask = &parent->cs_mask;
  303         if (!CPU_OVERLAP(&parent->cs_mask, mask))
  304                 return (EDEADLK);
  305         /* The domain must be prepared ahead of time. */
  306         if (!domainset_valid(parent->cs_domain, domain))
  307                 return (EDEADLK);
  308         CPU_COPY(mask, &set->cs_mask);
  309         LIST_INIT(&set->cs_children);
  310         refcount_init(&set->cs_ref, 1);
  311         set->cs_flags = 0;
  312         mtx_lock_spin(&cpuset_lock);
  313         set->cs_domain = domain;
  314         CPU_AND(&set->cs_mask, &parent->cs_mask);
  315         set->cs_id = id;
  316         set->cs_parent = cpuset_ref(parent);
  317         LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
  318         if (set->cs_id != CPUSET_INVALID)
  319                 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
  320         mtx_unlock_spin(&cpuset_lock);
  321 
  322         return (0);
  323 }
  324 
  325 /*
   326  * Create a new non-anonymous set with the requested parent and mask.  May
   327  * fail if the mask is invalid or if a new set number cannot be
   328  * allocated.
  329  */
  330 static int
  331 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
  332 {
  333         struct cpuset *set;
  334         cpusetid_t id;
  335         int error;
  336 
  337         id = alloc_unr(cpuset_unr);
  338         if (id == -1)
  339                 return (ENFILE);
  340         *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
  341         error = _cpuset_create(set, parent, mask, NULL, id);
  342         if (error == 0)
  343                 return (0);
  344         free_unr(cpuset_unr, id);
  345         uma_zfree(cpuset_zone, set);
  346 
  347         return (error);
  348 }
  349 
  350 static void
  351 cpuset_freelist_add(struct setlist *list, int count)
  352 {
  353         struct cpuset *set;
  354         int i;
  355 
  356         for (i = 0; i < count; i++) {
  357                 set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
  358                 LIST_INSERT_HEAD(list, set, cs_link);
  359         }
  360 }
  361 
  362 static void
  363 cpuset_freelist_init(struct setlist *list, int count)
  364 {
  365 
  366         LIST_INIT(list);
  367         cpuset_freelist_add(list, count);
  368 }
  369 
  370 static void
  371 cpuset_freelist_free(struct setlist *list)
  372 {
  373         struct cpuset *set;
  374 
  375         while ((set = LIST_FIRST(list)) != NULL) {
  376                 LIST_REMOVE(set, cs_link);
  377                 uma_zfree(cpuset_zone, set);
  378         }
  379 }
  380 
  381 static void
  382 domainset_freelist_add(struct domainlist *list, int count)
  383 {
  384         struct domainset *set;
  385         int i;
  386 
  387         for (i = 0; i < count; i++) {
  388                 set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
  389                 LIST_INSERT_HEAD(list, set, ds_link);
  390         }
  391 }
  392 
  393 static void
  394 domainset_freelist_init(struct domainlist *list, int count)
  395 {
  396 
  397         LIST_INIT(list);
  398         domainset_freelist_add(list, count);
  399 }
  400 
  401 static void
  402 domainset_freelist_free(struct domainlist *list)
  403 {
  404         struct domainset *set;
  405 
  406         while ((set = LIST_FIRST(list)) != NULL) {
  407                 LIST_REMOVE(set, ds_link);
  408                 uma_zfree(domainset_zone, set);
  409         }
  410 }
  411 
  412 /* Copy a domainset preserving mask and policy. */
  413 static void
  414 domainset_copy(const struct domainset *from, struct domainset *to)
  415 {
  416 
  417         DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
  418         to->ds_policy = from->ds_policy;
  419         to->ds_prefer = from->ds_prefer;
  420 }
  421 
  422 /* Return 1 if mask and policy are equal, otherwise 0. */
  423 static int
  424 domainset_equal(const struct domainset *one, const struct domainset *two)
  425 {
  426 
  427         return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
  428             one->ds_policy == two->ds_policy &&
  429             one->ds_prefer == two->ds_prefer);
  430 }
  431 
  432 /* Return 1 if child is a valid subset of parent. */
  433 static int
  434 domainset_valid(const struct domainset *parent, const struct domainset *child)
  435 {
  436         if (child->ds_policy != DOMAINSET_POLICY_PREFER)
  437                 return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
  438         return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
  439 }
  440 
  441 static int
  442 domainset_restrict(const struct domainset *parent,
  443     const struct domainset *child)
  444 {
  445         if (child->ds_policy != DOMAINSET_POLICY_PREFER)
  446                 return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
  447         return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
  448 }
  449 
  450 /*
  451  * Lookup or create a domainset.  The key is provided in ds_mask and
  452  * ds_policy.  If the domainset does not yet exist the storage in
  453  * 'domain' is used to insert.  Otherwise this storage is freed to the
  454  * domainset_zone and the existing domainset is returned.
  455  */
  456 static struct domainset *
  457 _domainset_create(struct domainset *domain, struct domainlist *freelist)
  458 {
  459         struct domainset *ndomain;
  460         int i, j, max;
  461 
  462         KASSERT(domain->ds_cnt <= vm_ndomains,
  463             ("invalid domain count in domainset %p", domain));
  464         KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER ||
  465             domain->ds_prefer < vm_ndomains,
  466             ("invalid preferred domain in domains %p", domain));
  467 
  468         mtx_lock_spin(&cpuset_lock);
  469         LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
  470                 if (domainset_equal(ndomain, domain))
  471                         break;
  472         /*
  473          * If the domain does not yet exist we insert it and initialize
  474          * various iteration helpers which are not part of the key.
  475          */
  476         if (ndomain == NULL) {
  477                 LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
  478                 domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
  479                 max = DOMAINSET_FLS(&domain->ds_mask) + 1;
  480                 for (i = 0, j = 0; i < max; i++)
  481                         if (DOMAINSET_ISSET(i, &domain->ds_mask))
  482                                 domain->ds_order[j++] = i;
  483         }
  484         mtx_unlock_spin(&cpuset_lock);
  485         if (ndomain == NULL)
  486                 return (domain);
  487         if (freelist != NULL)
  488                 LIST_INSERT_HEAD(freelist, domain, ds_link);
  489         else
  490                 uma_zfree(domainset_zone, domain);
  491         return (ndomain);
  492         
  493 }
  494 
  495 /*
  496  * Are any of the domains in the mask empty?  If so, silently
  497  * remove them and update the domainset accordingly.  If only empty
  498  * domains are present, we must return failure.
  499  */
  500 static bool
  501 domainset_empty_vm(struct domainset *domain)
  502 {
  503         int i, j, max;
  504 
  505         max = DOMAINSET_FLS(&domain->ds_mask) + 1;
  506         for (i = 0; i < max; i++)
  507                 if (DOMAINSET_ISSET(i, &domain->ds_mask) && VM_DOMAIN_EMPTY(i))
  508                         DOMAINSET_CLR(i, &domain->ds_mask);
  509         domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
  510         max = DOMAINSET_FLS(&domain->ds_mask) + 1;
  511         for (i = j = 0; i < max; i++) {
  512                 if (DOMAINSET_ISSET(i, &domain->ds_mask))
  513                         domain->ds_order[j++] = i;
  514                 else if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
  515                     domain->ds_prefer == i && domain->ds_cnt > 1) {
  516                         domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
  517                         domain->ds_prefer = -1;
  518                 }
  519         }
  520 
  521         return (DOMAINSET_EMPTY(&domain->ds_mask));
  522 }
  523 
  524 /*
  525  * Create or lookup a domainset based on the key held in 'domain'.
  526  */
  527 struct domainset *
  528 domainset_create(const struct domainset *domain)
  529 {
  530         struct domainset *ndomain;
  531 
  532         /*
   533          * Validate the policy.  It must specify a usable policy number with
  534          * only valid domains.  Preferred must include the preferred domain
  535          * in the mask.
  536          */
  537         if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
  538             domain->ds_policy > DOMAINSET_POLICY_MAX)
  539                 return (NULL);
  540         if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
  541             !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
  542                 return (NULL);
  543         if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
  544                 return (NULL);
  545         ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
  546         domainset_copy(domain, ndomain);
  547         return _domainset_create(ndomain, NULL);
  548 }
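
/*
 * For instance, a caller could obtain (or create) a PREFER policy for
 * domain 0 with a key built as in this sketch ('dset' being a
 * struct domainset pointer):
 *
 *	struct domainset key;
 *
 *	memset(&key, 0, sizeof(key));
 *	DOMAINSET_SET(0, &key.ds_mask);
 *	key.ds_policy = DOMAINSET_POLICY_PREFER;
 *	key.ds_prefer = 0;
 *	dset = domainset_create(&key);
 */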
  549 
  550 /*
  551  * Update thread domainset pointers.
  552  */
  553 static void
  554 domainset_notify(void)
  555 {
  556         struct thread *td;
  557         struct proc *p;
  558 
  559         sx_slock(&allproc_lock);
  560         FOREACH_PROC_IN_SYSTEM(p) {
  561                 PROC_LOCK(p);
  562                 if (p->p_state == PRS_NEW) {
  563                         PROC_UNLOCK(p);
  564                         continue;
  565                 }
  566                 FOREACH_THREAD_IN_PROC(p, td) {
  567                         thread_lock(td);
  568                         td->td_domain.dr_policy = td->td_cpuset->cs_domain;
  569                         thread_unlock(td);
  570                 }
  571                 PROC_UNLOCK(p);
  572         }
  573         sx_sunlock(&allproc_lock);
  574         kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
  575 }
  576 
  577 /*
   578  * Create a new domainset that is a subset of a parent.
  579  */
  580 static struct domainset *
  581 domainset_shadow(const struct domainset *pdomain,
  582     const struct domainset *domain, struct domainlist *freelist)
  583 {
  584         struct domainset *ndomain;
  585 
  586         ndomain = LIST_FIRST(freelist);
  587         LIST_REMOVE(ndomain, ds_link);
  588 
  589         /*
  590          * Initialize the key from the request.
  591          */
  592         domainset_copy(domain, ndomain);
  593 
  594         /*
  595          * Restrict the key by the parent.
  596          */
  597         DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
  598 
  599         return _domainset_create(ndomain, freelist);
  600 }
  601 
  602 /*
  603  * Recursively check for errors that would occur from applying mask to
  604  * the tree of sets starting at 'set'.  Checks for sets that would become
  605  * empty as well as RDONLY flags.
  606  */
  607 static int
  608 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
  609 {
  610         struct cpuset *nset;
  611         cpuset_t newmask;
  612         int error;
  613 
  614         mtx_assert(&cpuset_lock, MA_OWNED);
  615         if (set->cs_flags & CPU_SET_RDONLY)
  616                 return (EPERM);
  617         if (check_mask) {
  618                 if (!CPU_OVERLAP(&set->cs_mask, mask))
  619                         return (EDEADLK);
  620                 CPU_COPY(&set->cs_mask, &newmask);
  621                 CPU_AND(&newmask, mask);
  622         } else
  623                 CPU_COPY(mask, &newmask);
  624         error = 0;
  625         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  626                 if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
  627                         break;
  628         return (error);
  629 }
  630 
  631 /*
  632  * Applies the mask 'mask' without checking for empty sets or permissions.
  633  */
  634 static void
  635 cpuset_update(struct cpuset *set, cpuset_t *mask)
  636 {
  637         struct cpuset *nset;
  638 
  639         mtx_assert(&cpuset_lock, MA_OWNED);
  640         CPU_AND(&set->cs_mask, mask);
  641         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  642                 cpuset_update(nset, &set->cs_mask);
  643 
  644         return;
  645 }
  646 
  647 /*
  648  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
  649  * mask to restrict all children in the tree.  Checks for validity before
  650  * applying the changes.
  651  */
  652 static int
  653 cpuset_modify(struct cpuset *set, cpuset_t *mask)
  654 {
  655         struct cpuset *root;
  656         int error;
  657 
  658         error = priv_check(curthread, PRIV_SCHED_CPUSET);
  659         if (error)
  660                 return (error);
  661         /*
   662          * In case we are called from within the jail,
   663          * we do not allow modifying the dedicated root
   664          * cpuset of the jail, but child sets may still
   665          * be changed.
  666          */
  667         if (jailed(curthread->td_ucred) &&
  668             set->cs_flags & CPU_SET_ROOT)
  669                 return (EPERM);
  670         /*
  671          * Verify that we have access to this set of
  672          * cpus.
  673          */
  674         root = cpuset_getroot(set);
  675         mtx_lock_spin(&cpuset_lock);
  676         if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
  677                 error = EINVAL;
  678                 goto out;
  679         }
  680         error = cpuset_testupdate(set, mask, 0);
  681         if (error)
  682                 goto out;
  683         CPU_COPY(mask, &set->cs_mask);
  684         cpuset_update(set, mask);
  685 out:
  686         mtx_unlock_spin(&cpuset_lock);
  687 
  688         return (error);
  689 }
  690 
  691 /*
   692  * Recursively check for errors that would occur from applying the
   693  * domainset to the tree of sets starting at 'set'.  Checks for sets that
   694  * would become empty as well as RDONLY flags.
  695  */
  696 static int
  697 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
  698     struct domainset *orig, int *count, int check_mask)
  699 {
  700         struct cpuset *nset;
  701         struct domainset *domain;
  702         struct domainset newset;
  703         int error;
  704 
  705         mtx_assert(&cpuset_lock, MA_OWNED);
  706         if (set->cs_flags & CPU_SET_RDONLY)
  707                 return (EPERM);
  708         domain = set->cs_domain;
  709         domainset_copy(domain, &newset);
  710         if (!domainset_equal(domain, orig)) {
  711                 if (!domainset_restrict(domain, dset))
  712                         return (EDEADLK);
  713                 DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
  714                 /* Count the number of domains that are changing. */
  715                 (*count)++;
  716         }
  717         error = 0;
  718         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  719                 if ((error = cpuset_testupdate_domain(nset, &newset, domain,
  720                     count, 1)) != 0)
  721                         break;
  722         return (error);
  723 }
  724 
  725 /*
   726  * Applies the domainset 'domain' without checking for empty sets or permissions.
  727  */
  728 static void
  729 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
  730     struct domainset *orig, struct domainlist *domains)
  731 {
  732         struct cpuset *nset;
  733 
  734         mtx_assert(&cpuset_lock, MA_OWNED);
  735         /*
  736          * If this domainset has changed from the parent we must calculate
  737          * a new set.  Otherwise it simply inherits from the parent.  When
  738          * we inherit from the parent we get a new mask and policy.  If the
  739          * set is modified from the parent we keep the policy and only
  740          * update the mask.
  741          */
  742         if (set->cs_domain != orig) {
  743                 orig = set->cs_domain;
  744                 set->cs_domain = domainset_shadow(domain, orig, domains);
  745         } else
  746                 set->cs_domain = domain;
  747         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  748                 cpuset_update_domain(nset, set->cs_domain, orig, domains);
  749 
  750         return;
  751 }
  752 
  753 /*
   754  * Modify the set 'set' to use a copy of the domainset provided.  Apply
   755  * this new domainset to restrict all children in the tree.  Checks for
   756  * validity before applying the changes.
  757  */
  758 static int
  759 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
  760 {
  761         struct domainlist domains;
  762         struct domainset temp;
  763         struct domainset *dset;
  764         struct cpuset *root;
  765         int ndomains, needed;
  766         int error;
  767 
  768         error = priv_check(curthread, PRIV_SCHED_CPUSET);
  769         if (error)
  770                 return (error);
  771         /*
   772          * In case we are called from within the jail,
   773          * we do not allow modifying the dedicated root
   774          * cpuset of the jail, but child sets may still
   775          * be changed.
  776          */
  777         if (jailed(curthread->td_ucred) &&
  778             set->cs_flags & CPU_SET_ROOT)
  779                 return (EPERM);
  780         domainset_freelist_init(&domains, 0);
  781         domain = domainset_create(domain);
  782         ndomains = needed = 0;
  783         do {
  784                 if (ndomains < needed) {
  785                         domainset_freelist_add(&domains, needed - ndomains);
  786                         ndomains = needed;
  787                 }
  788                 root = cpuset_getroot(set);
  789                 mtx_lock_spin(&cpuset_lock);
  790                 dset = root->cs_domain;
  791                 /*
  792                  * Verify that we have access to this set of domains.
  793                  */
  794                 if (root && !domainset_valid(dset, domain)) {
  795                         error = EINVAL;
  796                         goto out;
  797                 }
  798                 /*
  799                  * If applying prefer we keep the current set as the fallback.
  800                  */
  801                 if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
  802                         DOMAINSET_COPY(&set->cs_domain->ds_mask,
  803                             &domain->ds_mask);
  804                 /*
  805                  * Determine whether we can apply this set of domains and
  806                  * how many new domain structures it will require.
  807                  */
  808                 domainset_copy(domain, &temp);
  809                 needed = 0;
  810                 error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
  811                     &needed, 0);
  812                 if (error)
  813                         goto out;
  814         } while (ndomains < needed);
  815         dset = set->cs_domain;
  816         cpuset_update_domain(set, domain, dset, &domains);
  817 out:
  818         mtx_unlock_spin(&cpuset_lock);
  819         domainset_freelist_free(&domains);
  820         if (error == 0)
  821                 domainset_notify();
  822 
  823         return (error);
  824 }
  825 
  826 /*
   827  * Resolve the 'which' parameter of several cpuset APIs.
  828  *
  829  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
  830  * checks for permission via p_cansched().
  831  *
  832  * For WHICH_SET returns a valid set with a new reference.
  833  *
  834  * -1 may be supplied for any argument to mean the current proc/thread or
  835  * the base set of the current thread.  May fail with ESRCH/EPERM.
  836  */
  837 int
  838 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
  839     struct cpuset **setp)
  840 {
  841         struct cpuset *set;
  842         struct thread *td;
  843         struct proc *p;
  844         int error;
  845 
  846         *pp = p = NULL;
  847         *tdp = td = NULL;
  848         *setp = set = NULL;
  849         switch (which) {
  850         case CPU_WHICH_PID:
  851                 if (id == -1) {
  852                         PROC_LOCK(curproc);
  853                         p = curproc;
  854                         break;
  855                 }
  856                 if ((p = pfind(id)) == NULL)
  857                         return (ESRCH);
  858                 break;
  859         case CPU_WHICH_TID:
  860                 if (id == -1) {
  861                         PROC_LOCK(curproc);
  862                         p = curproc;
  863                         td = curthread;
  864                         break;
  865                 }
  866                 td = tdfind(id, -1);
  867                 if (td == NULL)
  868                         return (ESRCH);
  869                 p = td->td_proc;
  870                 break;
  871         case CPU_WHICH_CPUSET:
  872                 if (id == -1) {
  873                         thread_lock(curthread);
  874                         set = cpuset_refbase(curthread->td_cpuset);
  875                         thread_unlock(curthread);
  876                 } else
  877                         set = cpuset_lookup(id, curthread);
  878                 if (set) {
  879                         *setp = set;
  880                         return (0);
  881                 }
  882                 return (ESRCH);
  883         case CPU_WHICH_JAIL:
  884         {
  885                 /* Find `set' for prison with given id. */
  886                 struct prison *pr;
  887 
  888                 sx_slock(&allprison_lock);
  889                 pr = prison_find_child(curthread->td_ucred->cr_prison, id);
  890                 sx_sunlock(&allprison_lock);
  891                 if (pr == NULL)
  892                         return (ESRCH);
  893                 cpuset_ref(pr->pr_cpuset);
  894                 *setp = pr->pr_cpuset;
  895                 mtx_unlock(&pr->pr_mtx);
  896                 return (0);
  897         }
  898         case CPU_WHICH_IRQ:
  899         case CPU_WHICH_DOMAIN:
  900                 return (0);
  901         default:
  902                 return (EINVAL);
  903         }
  904         error = p_cansched(curthread, p);
  905         if (error) {
  906                 PROC_UNLOCK(p);
  907                 return (error);
  908         }
  909         if (td == NULL)
  910                 td = FIRST_THREAD_IN_PROC(p);
  911         *pp = p;
  912         *tdp = td;
  913         return (0);
  914 }
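
/*
 * A typical call resolving a pid (as cpuset_setproc() does below); for
 * WHICH_PID/WHICH_TID the proc is returned locked and must be unlocked
 * by the caller:
 *
 *	error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 *	if (error == 0) {
 *		... use p and td ...
 *		PROC_UNLOCK(p);
 *	}
 */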
  915 
  916 static int
  917 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
  918     const struct domainset *domain)
  919 {
  920         struct cpuset *parent;
  921         struct domainset *dset;
  922 
  923         parent = cpuset_getbase(set);
  924         /*
  925          * If we are restricting a cpu mask it must be a subset of the
  926          * parent or invalid CPUs have been specified.
  927          */
  928         if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
  929                 return (EINVAL);
  930 
  931         /*
  932          * If we are restricting a domain mask it must be a subset of the
  933          * parent or invalid domains have been specified.
  934          */
  935         dset = parent->cs_domain;
  936         if (domain != NULL && !domainset_valid(dset, domain))
  937                 return (EINVAL);
  938 
  939         return (0);
  940 }
  941 
  942 /*
   943  * Create an anonymous set with the provided mask, using storage taken
   944  * from the 'cpusets' freelist.  If the passed in set is anonymous we use
   945  * its parent, otherwise the new set is a child of 'set'.
  946  */
  947 static int
  948 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
  949    const cpuset_t *mask, const struct domainset *domain,
  950    struct setlist *cpusets, struct domainlist *domains)
  951 {
  952         struct cpuset *parent;
  953         struct cpuset *nset;
  954         struct domainset *dset;
  955         struct domainset *d;
  956         int error;
  957 
  958         error = cpuset_testshadow(set, mask, domain);
  959         if (error)
  960                 return (error);
  961 
  962         parent = cpuset_getbase(set);
  963         dset = parent->cs_domain;
  964         if (mask == NULL)
  965                 mask = &set->cs_mask;
  966         if (domain != NULL)
  967                 d = domainset_shadow(dset, domain, domains);
  968         else
  969                 d = set->cs_domain;
  970         nset = LIST_FIRST(cpusets);
  971         error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
  972         if (error == 0) {
  973                 LIST_REMOVE(nset, cs_link);
  974                 *nsetp = nset;
  975         }
  976         return (error);
  977 }
  978 
  979 static struct cpuset *
  980 cpuset_update_thread(struct thread *td, struct cpuset *nset)
  981 {
  982         struct cpuset *tdset;
  983 
  984         tdset = td->td_cpuset;
  985         td->td_cpuset = nset;
  986         td->td_domain.dr_policy = nset->cs_domain;
  987         sched_affinity(td);
  988 
  989         return (tdset);
  990 }
  991 
  992 static int
  993 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
  994     struct domainset *domain)
  995 {
  996         struct cpuset *parent;
  997 
  998         parent = cpuset_getbase(tdset);
  999         if (mask == NULL)
 1000                 mask = &tdset->cs_mask;
 1001         if (domain == NULL)
 1002                 domain = tdset->cs_domain;
 1003         return cpuset_testshadow(parent, mask, domain);
 1004 }
 1005 
 1006 static int
 1007 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
 1008     struct domainset *domain, struct cpuset **nsetp,
 1009     struct setlist *freelist, struct domainlist *domainlist)
 1010 {
 1011         struct cpuset *parent;
 1012 
 1013         parent = cpuset_getbase(tdset);
 1014         if (mask == NULL)
 1015                 mask = &tdset->cs_mask;
 1016         if (domain == NULL)
 1017                 domain = tdset->cs_domain;
 1018         return cpuset_shadow(parent, nsetp, mask, domain, freelist,
 1019             domainlist);
 1020 }
 1021 
 1022 static int
 1023 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
 1024     cpuset_t *mask, struct domainset *domain)
 1025 {
 1026         struct cpuset *parent;
 1027 
 1028         parent = cpuset_getbase(tdset);
 1029 
 1030         /*
 1031          * If the thread restricted its mask then apply that same
 1032          * restriction to the new set, otherwise take it wholesale.
 1033          */
 1034         if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
 1035                 CPU_COPY(&tdset->cs_mask, mask);
 1036                 CPU_AND(mask, &set->cs_mask);
 1037         } else
 1038                 CPU_COPY(&set->cs_mask, mask);
 1039 
 1040         /*
 1041          * If the thread restricted the domain then we apply the
 1042          * restriction to the new set but retain the policy.
 1043          */
 1044         if (tdset->cs_domain != parent->cs_domain) {
 1045                 domainset_copy(tdset->cs_domain, domain);
 1046                 DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
 1047         } else
 1048                 domainset_copy(set->cs_domain, domain);
 1049 
 1050         if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
 1051                 return (EDEADLK);
 1052 
 1053         return (0);
 1054 }
 1055 
 1056 static int
 1057 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
 1058 {
 1059         struct domainset domain;
 1060         cpuset_t mask;
 1061 
 1062         if (tdset->cs_id != CPUSET_INVALID)
 1063                 return (0);
 1064         return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 1065 }
 1066 
 1067 static int
 1068 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
 1069     struct cpuset **nsetp, struct setlist *freelist,
 1070     struct domainlist *domainlist)
 1071 {
 1072         struct domainset domain;
 1073         cpuset_t mask;
 1074         int error;
 1075 
 1076         /*
 1077          * If we're replacing on a thread that has not constrained the
 1078          * original set we can simply accept the new set.
 1079          */
 1080         if (tdset->cs_id != CPUSET_INVALID) {
 1081                 *nsetp = cpuset_ref(set);
 1082                 return (0);
 1083         }
 1084         error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 1085         if (error)
 1086                 return (error);
 1087 
 1088         return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
 1089             domainlist);
 1090 }
 1091 
 1092 /*
 1093  * Handle three cases for updating an entire process.
 1094  *
 1095  * 1) Set is non-null.  This reparents all anonymous sets to the provided
 1096  *    set and replaces all non-anonymous td_cpusets with the provided set.
 1097  * 2) Mask is non-null.  This replaces or creates anonymous sets for every
 1098  *    thread with the existing base as a parent.
 1099  * 3) domain is non-null.  This creates anonymous sets for every thread
 1100  *    and replaces the domain set.
 1101  *
 1102  * This is overly complicated because we can't allocate while holding a 
 1103  * spinlock and spinlocks must be held while changing and examining thread
 1104  * state.
 1105  */
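/*
 * The three cases above correspond to calls of the following shapes
 * (a sketch; 'pid', 'set', 'mask', and 'domain' are hypothetical):
 *
 *	cpuset_setproc(pid, set, NULL, NULL);		// 1) named set
 *	cpuset_setproc(pid, NULL, &mask, NULL);		// 2) anonymous mask
 *	cpuset_setproc(pid, NULL, NULL, &domain);	// 3) domain policy
 */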
 1106 static int
 1107 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 1108     struct domainset *domain)
 1109 {
 1110         struct setlist freelist;
 1111         struct setlist droplist;
 1112         struct domainlist domainlist;
 1113         struct cpuset *nset;
 1114         struct thread *td;
 1115         struct proc *p;
 1116         int threads;
 1117         int nfree;
 1118         int error;
 1119 
 1120         /*
 1121          * The algorithm requires two passes due to locking considerations.
 1122          * 
 1123          * 1) Lookup the process and acquire the locks in the required order.
 1124          * 2) If enough cpusets have not been allocated release the locks and
 1125          *    allocate them.  Loop.
 1126          */
 1127         cpuset_freelist_init(&freelist, 1);
 1128         domainset_freelist_init(&domainlist, 1);
  1129         nfree = 1;
  1130         LIST_INIT(&droplist);
 1132         for (;;) {
 1133                 error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 1134                 if (error)
 1135                         goto out;
 1136                 if (nfree >= p->p_numthreads)
 1137                         break;
 1138                 threads = p->p_numthreads;
 1139                 PROC_UNLOCK(p);
 1140                 if (nfree < threads) {
 1141                         cpuset_freelist_add(&freelist, threads - nfree);
 1142                         domainset_freelist_add(&domainlist, threads - nfree);
 1143                         nfree = threads;
 1144                 }
 1145         }
 1146         PROC_LOCK_ASSERT(p, MA_OWNED);
 1147         /*
 1148          * Now that the appropriate locks are held and we have enough cpusets,
 1149          * make sure the operation will succeed before applying changes. The
 1150          * proc lock prevents td_cpuset from changing between calls.
 1151          */
 1152         error = 0;
 1153         FOREACH_THREAD_IN_PROC(p, td) {
 1154                 thread_lock(td);
 1155                 if (set != NULL)
 1156                         error = cpuset_setproc_test_setthread(td->td_cpuset,
 1157                             set);
 1158                 else
 1159                         error = cpuset_setproc_test_maskthread(td->td_cpuset,
 1160                             mask, domain);
 1161                 thread_unlock(td);
 1162                 if (error)
 1163                         goto unlock_out;
 1164         }
 1165         /*
 1166          * Replace each thread's cpuset while using deferred release.  We
 1167          * must do this because the thread lock must be held while operating
 1168          * on the thread and this limits the type of operations allowed.
 1169          */
 1170         FOREACH_THREAD_IN_PROC(p, td) {
 1171                 thread_lock(td);
 1172                 if (set != NULL)
 1173                         error = cpuset_setproc_setthread(td->td_cpuset, set,
 1174                             &nset, &freelist, &domainlist);
 1175                 else
 1176                         error = cpuset_setproc_maskthread(td->td_cpuset, mask,
 1177                             domain, &nset, &freelist, &domainlist);
 1178                 if (error) {
 1179                         thread_unlock(td);
 1180                         break;
 1181                 }
 1182                 cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
 1183                 thread_unlock(td);
 1184         }
 1185 unlock_out:
 1186         PROC_UNLOCK(p);
 1187 out:
 1188         while ((nset = LIST_FIRST(&droplist)) != NULL)
 1189                 cpuset_rel_complete(nset);
 1190         cpuset_freelist_free(&freelist);
 1191         domainset_freelist_free(&domainlist);
 1192         return (error);
 1193 }
 1194 
 1195 static int
 1196 bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
 1197 {
 1198         size_t bytes;
 1199         int i, once;
 1200         char *p;
 1201 
 1202         once = 0;
 1203         p = buf;
 1204         for (i = 0; i < __bitset_words(setlen); i++) {
 1205                 if (once != 0) {
 1206                         if (bufsiz < 1)
 1207                                 return (0);
 1208                         *p = ',';
 1209                         p++;
 1210                         bufsiz--;
 1211                 } else
 1212                         once = 1;
 1213                 if (bufsiz < sizeof(__STRING(ULONG_MAX)))
 1214                         return (0);
 1215                 bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
 1216                 p += bytes;
 1217                 bufsiz -= bytes;
 1218         }
 1219         return (p - buf);
 1220 }
 1221 
 1222 static int
 1223 bitset_strscan(struct bitset *set, int setlen, const char *buf)
 1224 {
 1225         int i, ret;
 1226         const char *p;
 1227 
 1228         BIT_ZERO(setlen, set);
 1229         p = buf;
 1230         for (i = 0; i < __bitset_words(setlen); i++) {
 1231                 if (*p == ',') {
 1232                         p++;
 1233                         continue;
 1234                 }
 1235                 ret = sscanf(p, "%lx", &set->__bits[i]);
 1236                 if (ret == 0 || ret == -1)
 1237                         break;
 1238                 while (isxdigit(*p))
 1239                         p++;
 1240         }
 1241         return (p - buf);
 1242 }
 1243 
 1244 /*
 1245  * Return a string representing a valid layout for a cpuset_t object.
  1246  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 1247  */
 1248 char *
 1249 cpusetobj_strprint(char *buf, const cpuset_t *set)
 1250 {
 1251 
 1252         bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
 1253             CPU_SETSIZE);
 1254         return (buf);
 1255 }
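
/*
 * For example, assuming CPU_SETSIZE is 256 (four 64-bit mask words), a
 * set containing cpus 0 and 65 prints as "1,2,0,0"; word 0 is emitted
 * first.
 */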
 1256 
 1257 /*
 1258  * Build a valid cpuset_t object from a string representation.
  1259  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 1260  */
 1261 int
 1262 cpusetobj_strscan(cpuset_t *set, const char *buf)
 1263 {
 1264         char p;
 1265 
 1266         if (strlen(buf) > CPUSETBUFSIZ - 1)
 1267                 return (-1);
 1268 
 1269         p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
 1270         if (p != '\0')
 1271                 return (-1);
 1272 
 1273         return (0);
 1274 }
 1275 
 1276 /*
  1277  * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
 1278  * a domainset is in arg1.  If the user specifies a valid domainset the
 1279  * pointer is updated.
 1280  *
 1281  * Format is:
 1282  * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
 1283  */
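/*
 * For example, assuming DOMAINSET_POLICY_PREFER is policy 3, the string
 * "3:3:1" names a domainset whose mask covers domains 0 and 1 (hex 3)
 * with a PREFER policy whose preferred domain is 1.
 */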
 1284 int
 1285 sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
 1286 {
 1287         char buf[DOMAINSETBUFSIZ];
 1288         struct domainset *dset;
 1289         struct domainset key;
 1290         int policy, prefer, error;
 1291         char *p;
 1292 
 1293         dset = *(struct domainset **)arg1;
 1294         error = 0;
 1295 
 1296         if (dset != NULL) {
 1297                 p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
 1298                     (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
 1299                 sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
 1300         } else
 1301                 sprintf(buf, "<NULL>");
 1302         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 1303         if (error != 0 || req->newptr == NULL)
 1304                 return (error);
 1305 
 1306         /*
 1307          * Read in and validate the string.
 1308          */
 1309         memset(&key, 0, sizeof(key));
 1310         p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
 1311             DOMAINSET_SETSIZE, buf)];
 1312         if (p == buf)
 1313                 return (EINVAL);
 1314         if (sscanf(p, ":%d:%d", &policy, &prefer) != 2)
 1315                 return (EINVAL);
 1316         key.ds_policy = policy;
 1317         key.ds_prefer = prefer;
 1318 
  1319         /* domainset_create() validates the policy. */
 1320         dset = domainset_create(&key);
 1321         if (dset == NULL)
 1322                 return (EINVAL);
 1323         *(struct domainset **)arg1 = dset;
 1324 
 1325         return (error);
 1326 }
 1327 
 1328 /*
 1329  * Apply an anonymous mask or a domain to a single thread.
 1330  */
 1331 static int
 1332 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
 1333 {
 1334         struct setlist cpusets;
 1335         struct domainlist domainlist;
 1336         struct cpuset *nset;
 1337         struct cpuset *set;
 1338         struct thread *td;
 1339         struct proc *p;
 1340         int error;
 1341 
 1342         cpuset_freelist_init(&cpusets, 1);
 1343         domainset_freelist_init(&domainlist, domain != NULL);
 1344         error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
 1345         if (error)
 1346                 goto out;
 1347         set = NULL;
 1348         thread_lock(td);
 1349         error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
 1350             &cpusets, &domainlist);
 1351         if (error == 0)
 1352                 set = cpuset_update_thread(td, nset);
 1353         thread_unlock(td);
 1354         PROC_UNLOCK(p);
 1355         if (set)
 1356                 cpuset_rel(set);
 1357 out:
 1358         cpuset_freelist_free(&cpusets);
 1359         domainset_freelist_free(&domainlist);
 1360         return (error);
 1361 }
 1362 
 1363 /*
 1364  * Apply an anonymous mask to a single thread.
 1365  */
 1366 int
 1367 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 1368 {
 1369 
 1370         return _cpuset_setthread(id, mask, NULL);
 1371 }
 1372 
 1373 /*
 1374  * Apply new cpumask to the ithread.
 1375  */
 1376 int
 1377 cpuset_setithread(lwpid_t id, int cpu)
 1378 {
 1379         cpuset_t mask;
 1380 
 1381         CPU_ZERO(&mask);
 1382         if (cpu == NOCPU)
 1383                 CPU_COPY(cpuset_root, &mask);
 1384         else
 1385                 CPU_SET(cpu, &mask);
 1386         return _cpuset_setthread(id, &mask, NULL);
 1387 }
 1388 
 1389 /*
 1390  * Initialize static domainsets after NUMA information is available.  This is
 1391  * called before memory allocators are initialized.
 1392  */
 1393 void
 1394 domainset_init(void)
 1395 {
 1396         struct domainset *dset;
 1397         int i;
 1398 
 1399         dset = &domainset_roundrobin;
 1400         DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1401         dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 1402         dset->ds_prefer = -1;
 1403         _domainset_create(dset, NULL);
 1404 
 1405         for (i = 0; i < vm_ndomains; i++) {
 1406                 dset = &domainset_fixed[i];
 1407                 DOMAINSET_ZERO(&dset->ds_mask);
 1408                 DOMAINSET_SET(i, &dset->ds_mask);
 1409                 dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 1410                 _domainset_create(dset, NULL);
 1411 
 1412                 dset = &domainset_prefer[i];
 1413                 DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1414                 dset->ds_policy = DOMAINSET_POLICY_PREFER;
 1415                 dset->ds_prefer = i;
 1416                 _domainset_create(dset, NULL);
 1417         }
 1418 }
 1419 
 1420 /*
  1421  * Create the domainsets for cpusets 0 and 1, and for cpuset 2.
 1422  */
 1423 void
 1424 domainset_zero(void)
 1425 {
 1426         struct domainset *dset, *tmp;
 1427 
 1428         mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 1429 
 1430         dset = &domainset0;
 1431         DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1432         dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
 1433         dset->ds_prefer = -1;
 1434         curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 1435 
 1436         domainset_copy(dset, &domainset2);
 1437         domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
 1438         kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
 1439 
 1440         /* Remove empty domains from the global policies. */
 1441         LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp)
 1442                 if (domainset_empty_vm(dset))
 1443                         LIST_REMOVE(dset, ds_link);
 1444 }
 1445 
 1446 /*
 1447  * Creates system-wide cpusets and the cpuset for thread0 including three
 1448  * sets:
 1449  * 
 1450  * 0 - The root set which should represent all valid processors in the
 1451  *     system.  It is initially created with a mask of all processors
 1452  *     because we don't know what processors are valid until cpuset_init()
 1453  *     runs.  This set is immutable.
 1454  * 1 - The default set which all processes are a member of until changed.
 1455  *     This allows an administrator to move all threads off of given cpus to
 1456  *     dedicate them to high priority tasks or save power etc.
 1457  * 2 - The kernel set which allows restriction and policy to be applied only
 1458  *     to kernel threads and the kernel_object.
 1459  */
 1460 struct cpuset *
 1461 cpuset_thread0(void)
 1462 {
 1463         struct cpuset *set;
 1464         int i;
 1465         int error __unused;
 1466 
 1467         cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 1468             NULL, NULL, UMA_ALIGN_CACHE, 0);
 1469         domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
 1470             NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 1471 
 1472         /*
 1473          * Create the root system set (0) for the whole machine.  Doesn't use
 1474          * cpuset_create() due to NULL parent.
 1475          */
 1476         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1477         CPU_COPY(&all_cpus, &set->cs_mask);
 1478         LIST_INIT(&set->cs_children);
 1479         LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 1480         set->cs_ref = 1;
 1481         set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY;
 1482         set->cs_domain = &domainset0;
 1483         cpuset_zero = set;
 1484         cpuset_root = &set->cs_mask;
 1485 
 1486         /*
 1487          * Now derive a default (1), modifiable set from that to give out.
 1488          */
 1489         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1490         error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
 1491         KASSERT(error == 0, ("Error creating default set: %d\n", error));
 1492         cpuset_default = set;
 1493         /*
 1494          * Create the kernel set (2).
 1495          */
 1496         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1497         error = _cpuset_create(set, cpuset_zero, NULL, NULL, 2);
 1498         KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
 1499         set->cs_domain = &domainset2;
 1500         cpuset_kernel = set;
 1501 
 1502         /*
 1503          * Initialize the unit allocator; IDs 0-2 are allocated above.
 1504          */
 1505         cpuset_unr = new_unrhdr(3, INT_MAX, NULL);
 1506 
 1507         /*
 1508          * If MD code has not initialized per-domain cpusets, place all
 1509          * CPUs in domain 0.
 1510          */
 1511         for (i = 0; i < MAXMEMDOM; i++)
 1512                 if (!CPU_EMPTY(&cpuset_domain[i]))
 1513                         goto domains_set;
 1514         CPU_COPY(&all_cpus, &cpuset_domain[0]);
 1515 domains_set:
 1516 
 1517         return (cpuset_default);
 1518 }
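
The numbering established here is visible from userland through the
documented cpuset_getid(2) interface: set 0 is the root, and a process
that has never been moved reports set 1, the default.  A minimal sketch,
illustrative and not part of this file:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            cpusetid_t root, cur;

            /* id == -1 selects the calling process. */
            if (cpuset_getid(CPU_LEVEL_ROOT, CPU_WHICH_PID, -1, &root) != 0)
                    err(1, "cpuset_getid(ROOT)");
            if (cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, &cur) != 0)
                    err(1, "cpuset_getid(CPUSET)");
            printf("root set %d, current set %d\n", (int)root, (int)cur);
            return (0);
    }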
 1519 
 1520 void
 1521 cpuset_kernthread(struct thread *td)
 1522 {
 1523         struct cpuset *set;
 1524 
 1525         thread_lock(td);
 1526         set = td->td_cpuset;
 1527         td->td_cpuset = cpuset_ref(cpuset_kernel);
 1528         thread_unlock(td);
 1529         cpuset_rel(set);
 1530 }
 1531 
 1532 /*
 1533  * Create a cpuset as cpuset_create() would, but mark the new 'set'
 1534  * as a root set.
 1535  *
 1536  * The thread is not reparented to it; use cpuset_setproc_update_set()
 1537  * for that.
 1538  *
 1539  * On success, returns the set in *setp with a reference held.
 1540  */
 1541 int
 1542 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 1543 {
 1544         struct cpuset *set;
 1545         int error;
 1546 
 1547         KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 1548         KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 1549 
 1550         error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 1551         if (error)
 1552                 return (error);
 1553 
 1554         KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
 1555             __func__, __LINE__));
 1556 
 1557         /* Mark the set as root. */
 1558         set = *setp;
 1559         set->cs_flags |= CPU_SET_ROOT;
 1560 
 1561         return (0);
 1562 }
 1563 
 1564 int
 1565 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 1566 {
 1567         int error;
 1568 
 1569         KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 1570         KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 1571 
 1572         cpuset_ref(set);
 1573         error = cpuset_setproc(p->p_pid, set, NULL, NULL);
 1574         if (error)
 1575                 return (error);
 1576         cpuset_rel(set);
 1577         return (0);
 1578 }
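
cpuset_create_root() and cpuset_setproc_update_set() are companions used
by the jail code: the first builds a new root set under a prison's set,
the second moves a process into it.  A condensed sketch of how a caller
combines them; attach_to_new_root() is hypothetical, and the real
sequence in kern_jail.c handles more failure cases:

    static int
    attach_to_new_root(struct prison *pr, struct proc *p)
    {
            struct cpuset *set;
            int error;

            error = cpuset_create_root(pr, &set);
            if (error)
                    return (error);
            error = cpuset_setproc_update_set(p, set);
            cpuset_rel(set);        /* drop the creation reference */
            return (error);
    }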
 1579 
 1580 #ifndef _SYS_SYSPROTO_H_
 1581 struct cpuset_args {
 1582         cpusetid_t      *setid;
 1583 };
 1584 #endif
 1585 int
 1586 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 1587 {
 1588         struct cpuset *root;
 1589         struct cpuset *set;
 1590         int error;
 1591 
 1592         thread_lock(td);
 1593         root = cpuset_refroot(td->td_cpuset);
 1594         thread_unlock(td);
 1595         error = cpuset_create(&set, root, &root->cs_mask);
 1596         cpuset_rel(root);
 1597         if (error)
 1598                 return (error);
 1599         error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 1600         if (error == 0)
 1601                 error = cpuset_setproc(-1, set, NULL, NULL);
 1602         cpuset_rel(set);
 1603         return (error);
 1604 }
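
sys_cpuset() is the kernel half of cpuset(2): it creates a fresh set
that mirrors the caller's root mask, copies the new ID out, and
reparents the calling process onto the set.  A minimal userland sketch:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            cpusetid_t id;

            /* Create a new set under our root and join it. */
            if (cpuset(&id) != 0)
                    err(1, "cpuset");
            printf("now in cpuset %d\n", (int)id);
            return (0);
    }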
 1605 
 1606 #ifndef _SYS_SYSPROTO_H_
 1607 struct cpuset_setid_args {
 1608         cpuwhich_t      which;
 1609         id_t            id;
 1610         cpusetid_t      setid;
 1611 };
 1612 #endif
 1613 int
 1614 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 1615 {
 1616 
 1617         return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 1618 }
 1619 
 1620 int
 1621 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
 1622     id_t id, cpusetid_t setid)
 1623 {
 1624         struct cpuset *set;
 1625         int error;
 1626 
 1627         /*
 1628          * Presently we only support per-process sets.
 1629          */
 1630         if (which != CPU_WHICH_PID)
 1631                 return (EINVAL);
 1632         set = cpuset_lookup(setid, td);
 1633         if (set == NULL)
 1634                 return (ESRCH);
 1635         error = cpuset_setproc(id, set, NULL, NULL);
 1636         cpuset_rel(set);
 1637         return (error);
 1638 }
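
As the comment notes, only CPU_WHICH_PID is accepted here, so
cpuset_setid(2) moves whole processes.  A sketch that attaches an
existing process to an existing numbered set; the pid and setid are
taken from the command line purely for illustration:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <err.h>
    #include <stdlib.h>

    int
    main(int argc, char *argv[])
    {
            pid_t pid;
            cpusetid_t setid;

            if (argc != 3)
                    errx(1, "usage: %s pid setid", argv[0]);
            pid = (pid_t)strtol(argv[1], NULL, 10);
            setid = (cpusetid_t)strtol(argv[2], NULL, 10);
            if (cpuset_setid(CPU_WHICH_PID, pid, setid) != 0)
                    err(1, "cpuset_setid");
            return (0);
    }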
 1639 
 1640 #ifndef _SYS_SYSPROTO_H_
 1641 struct cpuset_getid_args {
 1642         cpulevel_t      level;
 1643         cpuwhich_t      which;
 1644         id_t            id;
 1645         cpusetid_t      *setid;
 1646 };
 1647 #endif
 1648 int
 1649 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 1650 {
 1651 
 1652         return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 1653             uap->setid));
 1654 }
 1655 
 1656 int
 1657 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1658     id_t id, cpusetid_t *setid)
 1659 {
 1660         struct cpuset *nset;
 1661         struct cpuset *set;
 1662         struct thread *ttd;
 1663         struct proc *p;
 1664         cpusetid_t tmpid;
 1665         int error;
 1666 
 1667         if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 1668                 return (EINVAL);
 1669         error = cpuset_which(which, id, &p, &ttd, &set);
 1670         if (error)
 1671                 return (error);
 1672         switch (which) {
 1673         case CPU_WHICH_TID:
 1674         case CPU_WHICH_PID:
 1675                 thread_lock(ttd);
 1676                 set = cpuset_refbase(ttd->td_cpuset);
 1677                 thread_unlock(ttd);
 1678                 PROC_UNLOCK(p);
 1679                 break;
 1680         case CPU_WHICH_CPUSET:
 1681         case CPU_WHICH_JAIL:
 1682                 break;
 1683         case CPU_WHICH_IRQ:
 1684         case CPU_WHICH_DOMAIN:
 1685                 return (EINVAL);
 1686         }
 1687         switch (level) {
 1688         case CPU_LEVEL_ROOT:
 1689                 nset = cpuset_refroot(set);
 1690                 cpuset_rel(set);
 1691                 set = nset;
 1692                 break;
 1693         case CPU_LEVEL_CPUSET:
 1694                 break;
 1695         case CPU_LEVEL_WHICH:
 1696                 break;
 1697         }
 1698         tmpid = set->cs_id;
 1699         cpuset_rel(set);
 1700         if (error == 0)
 1701                 error = copyout(&tmpid, setid, sizeof(tmpid));
 1702 
 1703         return (error);
 1704 }
 1705 
 1706 #ifndef _SYS_SYSPROTO_H_
 1707 struct cpuset_getaffinity_args {
 1708         cpulevel_t      level;
 1709         cpuwhich_t      which;
 1710         id_t            id;
 1711         size_t          cpusetsize;
 1712         cpuset_t        *mask;
 1713 };
 1714 #endif
 1715 int
 1716 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 1717 {
 1718 
 1719         return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 1720             uap->id, uap->cpusetsize, uap->mask));
 1721 }
 1722 
 1723 int
 1724 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1725     id_t id, size_t cpusetsize, cpuset_t *maskp)
 1726 {
 1727         struct thread *ttd;
 1728         struct cpuset *nset;
 1729         struct cpuset *set;
 1730         struct proc *p;
 1731         cpuset_t *mask;
 1732         int error;
 1733         size_t size;
 1734 
 1735         if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 1736                 return (ERANGE);
 1737         /* In Capability mode, you can only get your own CPU set. */
 1738         if (IN_CAPABILITY_MODE(td)) {
 1739                 if (level != CPU_LEVEL_WHICH)
 1740                         return (ECAPMODE);
 1741                 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 1742                         return (ECAPMODE);
 1743                 if (id != -1)
 1744                         return (ECAPMODE);
 1745         }
 1746         size = cpusetsize;
 1747         mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 1748         error = cpuset_which(which, id, &p, &ttd, &set);
 1749         if (error)
 1750                 goto out;
 1751         switch (level) {
 1752         case CPU_LEVEL_ROOT:
 1753         case CPU_LEVEL_CPUSET:
 1754                 switch (which) {
 1755                 case CPU_WHICH_TID:
 1756                 case CPU_WHICH_PID:
 1757                         thread_lock(ttd);
 1758                         set = cpuset_ref(ttd->td_cpuset);
 1759                         thread_unlock(ttd);
 1760                         break;
 1761                 case CPU_WHICH_CPUSET:
 1762                 case CPU_WHICH_JAIL:
 1763                         break;
 1764                 case CPU_WHICH_IRQ:
 1765                 case CPU_WHICH_INTRHANDLER:
 1766                 case CPU_WHICH_ITHREAD:
 1767                 case CPU_WHICH_DOMAIN:
 1768                         error = EINVAL;
 1769                         goto out;
 1770                 }
 1771                 if (level == CPU_LEVEL_ROOT)
 1772                         nset = cpuset_refroot(set);
 1773                 else
 1774                         nset = cpuset_refbase(set);
 1775                 CPU_COPY(&nset->cs_mask, mask);
 1776                 cpuset_rel(nset);
 1777                 break;
 1778         case CPU_LEVEL_WHICH:
 1779                 switch (which) {
 1780                 case CPU_WHICH_TID:
 1781                         thread_lock(ttd);
 1782                         CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 1783                         thread_unlock(ttd);
 1784                         break;
 1785                 case CPU_WHICH_PID:
 1786                         FOREACH_THREAD_IN_PROC(p, ttd) {
 1787                                 thread_lock(ttd);
 1788                                 CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 1789                                 thread_unlock(ttd);
 1790                         }
 1791                         break;
 1792                 case CPU_WHICH_CPUSET:
 1793                 case CPU_WHICH_JAIL:
 1794                         CPU_COPY(&set->cs_mask, mask);
 1795                         break;
 1796                 case CPU_WHICH_IRQ:
 1797                 case CPU_WHICH_INTRHANDLER:
 1798                 case CPU_WHICH_ITHREAD:
 1799                         error = intr_getaffinity(id, which, mask);
 1800                         break;
 1801                 case CPU_WHICH_DOMAIN:
 1802                         if (id < 0 || id >= MAXMEMDOM)
 1803                                 error = ESRCH;
 1804                         else
 1805                                 CPU_COPY(&cpuset_domain[id], mask);
 1806                         break;
 1807                 }
 1808                 break;
 1809         default:
 1810                 error = EINVAL;
 1811                 break;
 1812         }
 1813         if (set)
 1814                 cpuset_rel(set);
 1815         if (p)
 1816                 PROC_UNLOCK(p);
 1817         if (error == 0)
 1818                 error = copyout(mask, maskp, size);
 1819 out:
 1820         free(mask, M_TEMP);
 1821         return (error);
 1822 }
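
Userland reaches this through cpuset_getaffinity(2); note the size guard
above, which insists the buffer be at least sizeof(cpuset_t) and rejects
oversized requests.  A sketch that lists the CPUs the calling process
may run on:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            cpuset_t mask;
            int cpu;

            if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
                sizeof(mask), &mask) != 0)
                    err(1, "cpuset_getaffinity");
            for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                    if (CPU_ISSET(cpu, &mask))
                            printf("cpu %d\n", cpu);
            return (0);
    }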
 1823 
 1824 #ifndef _SYS_SYSPROTO_H_
 1825 struct cpuset_setaffinity_args {
 1826         cpulevel_t      level;
 1827         cpuwhich_t      which;
 1828         id_t            id;
 1829         size_t          cpusetsize;
 1830         const cpuset_t  *mask;
 1831 };
 1832 #endif
 1833 int
 1834 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 1835 {
 1836 
 1837         return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 1838             uap->id, uap->cpusetsize, uap->mask));
 1839 }
 1840 
 1841 int
 1842 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1843     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 1844 {
 1845         struct cpuset *nset;
 1846         struct cpuset *set;
 1847         struct thread *ttd;
 1848         struct proc *p;
 1849         cpuset_t *mask;
 1850         int error;
 1851 
 1852         if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 1853                 return (ERANGE);
 1854         /* In Capability mode, you can only set your own CPU set. */
 1855         if (IN_CAPABILITY_MODE(td)) {
 1856                 if (level != CPU_LEVEL_WHICH)
 1857                         return (ECAPMODE);
 1858                 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 1859                         return (ECAPMODE);
 1860                 if (id != -1)
 1861                         return (ECAPMODE);
 1862         }
 1863         mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 1864         error = copyin(maskp, mask, cpusetsize);
 1865         if (error)
 1866                 goto out;
 1867         /*
 1868          * Verify that no high bits are set.
 1869          */
 1870         if (cpusetsize > sizeof(cpuset_t)) {
 1871                 char *end;
 1872                 char *cp;
 1873 
 1874                 end = cp = (char *)&mask->__bits;
 1875                 end += cpusetsize;
 1876                 cp += sizeof(cpuset_t);
 1877                 while (cp != end)
 1878                         if (*cp++ != 0) {
 1879                                 error = EINVAL;
 1880                                 goto out;
 1881                         }
 1882 
 1883         }
 1884         switch (level) {
 1885         case CPU_LEVEL_ROOT:
 1886         case CPU_LEVEL_CPUSET:
 1887                 error = cpuset_which(which, id, &p, &ttd, &set);
 1888                 if (error)
 1889                         break;
 1890                 switch (which) {
 1891                 case CPU_WHICH_TID:
 1892                 case CPU_WHICH_PID:
 1893                         thread_lock(ttd);
 1894                         set = cpuset_ref(ttd->td_cpuset);
 1895                         thread_unlock(ttd);
 1896                         PROC_UNLOCK(p);
 1897                         break;
 1898                 case CPU_WHICH_CPUSET:
 1899                 case CPU_WHICH_JAIL:
 1900                         break;
 1901                 case CPU_WHICH_IRQ:
 1902                 case CPU_WHICH_INTRHANDLER:
 1903                 case CPU_WHICH_ITHREAD:
 1904                 case CPU_WHICH_DOMAIN:
 1905                         error = EINVAL;
 1906                         goto out;
 1907                 }
 1908                 if (level == CPU_LEVEL_ROOT)
 1909                         nset = cpuset_refroot(set);
 1910                 else
 1911                         nset = cpuset_refbase(set);
 1912                 error = cpuset_modify(nset, mask);
 1913                 cpuset_rel(nset);
 1914                 cpuset_rel(set);
 1915                 break;
 1916         case CPU_LEVEL_WHICH:
 1917                 switch (which) {
 1918                 case CPU_WHICH_TID:
 1919                         error = cpuset_setthread(id, mask);
 1920                         break;
 1921                 case CPU_WHICH_PID:
 1922                         error = cpuset_setproc(id, NULL, mask, NULL);
 1923                         break;
 1924                 case CPU_WHICH_CPUSET:
 1925                 case CPU_WHICH_JAIL:
 1926                         error = cpuset_which(which, id, &p, &ttd, &set);
 1927                         if (error == 0) {
 1928                                 error = cpuset_modify(set, mask);
 1929                                 cpuset_rel(set);
 1930                         }
 1931                         break;
 1932                 case CPU_WHICH_IRQ:
 1933                 case CPU_WHICH_INTRHANDLER:
 1934                 case CPU_WHICH_ITHREAD:
 1935                         error = intr_setaffinity(id, which, mask);
 1936                         break;
 1937                 default:
 1938                         error = EINVAL;
 1939                         break;
 1940                 }
 1941                 break;
 1942         default:
 1943                 error = EINVAL;
 1944                 break;
 1945         }
 1946 out:
 1947         free(mask, M_TEMP);
 1948         return (error);
 1949 }
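
The setter mirrors the getter, including the capability-mode policy: a
sandboxed thread may still narrow itself using CPU_LEVEL_WHICH,
CPU_WHICH_TID, and id == -1.  A sketch that pins the calling thread to
CPU 0:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <err.h>

    int
    main(void)
    {
            cpuset_t mask;

            CPU_ZERO(&mask);
            CPU_SET(0, &mask);      /* run on CPU 0 only */
            if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
                sizeof(mask), &mask) != 0)
                    err(1, "cpuset_setaffinity");
            return (0);
    }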
 1950 
 1951 #ifndef _SYS_SYSPROTO_H_
 1952 struct cpuset_getdomain_args {
 1953         cpulevel_t      level;
 1954         cpuwhich_t      which;
 1955         id_t            id;
 1956         size_t          domainsetsize;
 1957         domainset_t     *mask;
 1958         int             *policy;
 1959 };
 1960 #endif
 1961 int
 1962 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
 1963 {
 1964 
 1965         return (kern_cpuset_getdomain(td, uap->level, uap->which,
 1966             uap->id, uap->domainsetsize, uap->mask, uap->policy));
 1967 }
 1968 
 1969 int
 1970 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1971     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
 1972 {
 1973         struct domainset outset;
 1974         struct thread *ttd;
 1975         struct cpuset *nset;
 1976         struct cpuset *set;
 1977         struct domainset *dset;
 1978         struct proc *p;
 1979         domainset_t *mask;
 1980         int error;
 1981 
 1982         if (domainsetsize < sizeof(domainset_t) ||
 1983             domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 1984                 return (ERANGE);
 1985         /* In Capability mode, you can only get your own domain set. */
 1986         if (IN_CAPABILITY_MODE(td)) {
 1987                 if (level != CPU_LEVEL_WHICH)
 1988                         return (ECAPMODE);
 1989                 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 1990                         return (ECAPMODE);
 1991                 if (id != -1)
 1992                         return (ECAPMODE);
 1993         }
 1994         mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 1995         bzero(&outset, sizeof(outset));
 1996         error = cpuset_which(which, id, &p, &ttd, &set);
 1997         if (error)
 1998                 goto out;
 1999         switch (level) {
 2000         case CPU_LEVEL_ROOT:
 2001         case CPU_LEVEL_CPUSET:
 2002                 switch (which) {
 2003                 case CPU_WHICH_TID:
 2004                 case CPU_WHICH_PID:
 2005                         thread_lock(ttd);
 2006                         set = cpuset_ref(ttd->td_cpuset);
 2007                         thread_unlock(ttd);
 2008                         break;
 2009                 case CPU_WHICH_CPUSET:
 2010                 case CPU_WHICH_JAIL:
 2011                         break;
 2012                 case CPU_WHICH_IRQ:
 2013                 case CPU_WHICH_INTRHANDLER:
 2014                 case CPU_WHICH_ITHREAD:
 2015                 case CPU_WHICH_DOMAIN:
 2016                         error = EINVAL;
 2017                         goto out;
 2018                 }
 2019                 if (level == CPU_LEVEL_ROOT)
 2020                         nset = cpuset_refroot(set);
 2021                 else
 2022                         nset = cpuset_refbase(set);
 2023                 domainset_copy(nset->cs_domain, &outset);
 2024                 cpuset_rel(nset);
 2025                 break;
 2026         case CPU_LEVEL_WHICH:
 2027                 switch (which) {
 2028                 case CPU_WHICH_TID:
 2029                         thread_lock(ttd);
 2030                         domainset_copy(ttd->td_cpuset->cs_domain, &outset);
 2031                         thread_unlock(ttd);
 2032                         break;
 2033                 case CPU_WHICH_PID:
 2034                         FOREACH_THREAD_IN_PROC(p, ttd) {
 2035                                 thread_lock(ttd);
 2036                                 dset = ttd->td_cpuset->cs_domain;
 2037                                 /* Show all domains in the proc. */
 2038                                 DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
 2039                                 /* Last policy wins. */
 2040                                 outset.ds_policy = dset->ds_policy;
 2041                                 outset.ds_prefer = dset->ds_prefer;
 2042                                 thread_unlock(ttd);
 2043                         }
 2044                         break;
 2045                 case CPU_WHICH_CPUSET:
 2046                 case CPU_WHICH_JAIL:
 2047                         domainset_copy(set->cs_domain, &outset);
 2048                         break;
 2049                 case CPU_WHICH_IRQ:
 2050                 case CPU_WHICH_INTRHANDLER:
 2051                 case CPU_WHICH_ITHREAD:
 2052                 case CPU_WHICH_DOMAIN:
 2053                         error = EINVAL;
 2054                         break;
 2055                 }
 2056                 break;
 2057         default:
 2058                 error = EINVAL;
 2059                 break;
 2060         }
 2061         if (set)
 2062                 cpuset_rel(set);
 2063         if (p)
 2064                 PROC_UNLOCK(p);
 2065         /*
 2066          * Translate prefer into a set containing only the preferred domain,
 2067          * not the entire fallback set.
 2068          */
 2069         if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
 2070                 DOMAINSET_ZERO(&outset.ds_mask);
 2071                 DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
 2072         }
 2073         DOMAINSET_COPY(&outset.ds_mask, mask);
 2074         if (error == 0)
 2075                 error = copyout(mask, maskp, domainsetsize);
 2076         if (error == 0)
 2077                 if (suword32(policyp, outset.ds_policy) != 0)
 2078                         error = EFAULT;
 2079 out:
 2080         free(mask, M_TEMP);
 2081         return (error);
 2082 }
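
This is cpuset_getdomain(2) from userland.  The PREFER translation just
above is worth noting: for a preferred policy the returned mask contains
only the preferred domain, never the full fallback mask.  A sketch that
reports the calling process's policy and domains:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <sys/domainset.h>
    #include <err.h>
    #include <stdio.h>

    int
    main(void)
    {
            domainset_t mask;
            int dom, policy;

            if (cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
                sizeof(mask), &mask, &policy) != 0)
                    err(1, "cpuset_getdomain");
            printf("policy %d, domains:", policy);
            for (dom = 0; dom < DOMAINSET_SETSIZE; dom++)
                    if (DOMAINSET_ISSET(dom, &mask))
                            printf(" %d", dom);
            printf("\n");
            return (0);
    }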
 2083 
 2084 #ifndef _SYS_SYSPROTO_H_
 2085 struct cpuset_setdomain_args {
 2086         cpulevel_t      level;
 2087         cpuwhich_t      which;
 2088         id_t            id;
 2089         size_t          domainsetsize;
 2090         domainset_t     *mask;
 2091         int             policy;
 2092 };
 2093 #endif
 2094 int
 2095 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 2096 {
 2097 
 2098         return (kern_cpuset_setdomain(td, uap->level, uap->which,
 2099             uap->id, uap->domainsetsize, uap->mask, uap->policy));
 2100 }
 2101 
 2102 int
 2103 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 2104     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
 2105 {
 2106         struct cpuset *nset;
 2107         struct cpuset *set;
 2108         struct thread *ttd;
 2109         struct proc *p;
 2110         struct domainset domain;
 2111         domainset_t *mask;
 2112         int error;
 2113 
 2114         if (domainsetsize < sizeof(domainset_t) ||
 2115             domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 2116                 return (ERANGE);
 2117         if (policy <= DOMAINSET_POLICY_INVALID ||
 2118             policy > DOMAINSET_POLICY_MAX)
 2119                 return (EINVAL);
 2120         /* In Capability mode, you can only set your own domain set. */
 2121         if (IN_CAPABILITY_MODE(td)) {
 2122                 if (level != CPU_LEVEL_WHICH)
 2123                         return (ECAPMODE);
 2124                 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 2125                         return (ECAPMODE);
 2126                 if (id != -1)
 2127                         return (ECAPMODE);
 2128         }
 2129         memset(&domain, 0, sizeof(domain));
 2130         mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 2131         error = copyin(maskp, mask, domainsetsize);
 2132         if (error)
 2133                 goto out;
 2134         /*
 2135          * Verify that no high bits are set.
 2136          */
 2137         if (domainsetsize > sizeof(domainset_t)) {
 2138                 char *end;
 2139                 char *cp;
 2140 
 2141                 end = cp = (char *)&mask->__bits;
 2142                 end += domainsetsize;
 2143                 cp += sizeof(domainset_t);
 2144                 while (cp != end)
 2145                         if (*cp++ != 0) {
 2146                                 error = EINVAL;
 2147                                 goto out;
 2148                         }
 2149 
 2150         }
 2151         DOMAINSET_COPY(mask, &domain.ds_mask);
 2152         domain.ds_policy = policy;
 2153 
 2154         /* Translate preferred policy into a mask and fallback. */
 2155         if (policy == DOMAINSET_POLICY_PREFER) {
 2156                 /* Only support a single preferred domain. */
 2157                 if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
 2158                         error = EINVAL;
 2159                         goto out;
 2160                 }
 2161                 domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
 2162                 /* This will be constrained by domainset_shadow(). */
 2163                 DOMAINSET_FILL(&domain.ds_mask);
 2164         }
 2165 
 2166         /*
 2167          * When given an impossible policy, fall back to interleaving
 2168          * across all domains.
 2169          */
 2170         if (domainset_empty_vm(&domain))
 2171                 domainset_copy(&domainset2, &domain);
 2172 
 2173         switch (level) {
 2174         case CPU_LEVEL_ROOT:
 2175         case CPU_LEVEL_CPUSET:
 2176                 error = cpuset_which(which, id, &p, &ttd, &set);
 2177                 if (error)
 2178                         break;
 2179                 switch (which) {
 2180                 case CPU_WHICH_TID:
 2181                 case CPU_WHICH_PID:
 2182                         thread_lock(ttd);
 2183                         set = cpuset_ref(ttd->td_cpuset);
 2184                         thread_unlock(ttd);
 2185                         PROC_UNLOCK(p);
 2186                         break;
 2187                 case CPU_WHICH_CPUSET:
 2188                 case CPU_WHICH_JAIL:
 2189                         break;
 2190                 case CPU_WHICH_IRQ:
 2191                 case CPU_WHICH_INTRHANDLER:
 2192                 case CPU_WHICH_ITHREAD:
 2193                 case CPU_WHICH_DOMAIN:
 2194                         error = EINVAL;
 2195                         goto out;
 2196                 }
 2197                 if (level == CPU_LEVEL_ROOT)
 2198                         nset = cpuset_refroot(set);
 2199                 else
 2200                         nset = cpuset_refbase(set);
 2201                 error = cpuset_modify_domain(nset, &domain);
 2202                 cpuset_rel(nset);
 2203                 cpuset_rel(set);
 2204                 break;
 2205         case CPU_LEVEL_WHICH:
 2206                 switch (which) {
 2207                 case CPU_WHICH_TID:
 2208                         error = _cpuset_setthread(id, NULL, &domain);
 2209                         break;
 2210                 case CPU_WHICH_PID:
 2211                         error = cpuset_setproc(id, NULL, NULL, &domain);
 2212                         break;
 2213                 case CPU_WHICH_CPUSET:
 2214                 case CPU_WHICH_JAIL:
 2215                         error = cpuset_which(which, id, &p, &ttd, &set);
 2216                         if (error == 0) {
 2217                                 error = cpuset_modify_domain(set, &domain);
 2218                                 cpuset_rel(set);
 2219                         }
 2220                         break;
 2221                 case CPU_WHICH_IRQ:
 2222                 case CPU_WHICH_INTRHANDLER:
 2223                 case CPU_WHICH_ITHREAD:
 2224                 default:
 2225                         error = EINVAL;
 2226                         break;
 2227                 }
 2228                 break;
 2229         default:
 2230                 error = EINVAL;
 2231                 break;
 2232         }
 2233 out:
 2234         free(mask, M_TEMP);
 2235         return (error);
 2236 }
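
The corresponding setter.  As the PREFER branch above enforces,
DOMAINSET_POLICY_PREFER takes exactly one domain in the mask; the kernel
then widens the fallback mask itself.  A sketch asking for allocations
to prefer domain 0:

    #include <sys/param.h>
    #include <sys/cpuset.h>
    #include <sys/domainset.h>
    #include <err.h>

    int
    main(void)
    {
            domainset_t mask;

            DOMAINSET_ZERO(&mask);
            DOMAINSET_SET(0, &mask);        /* exactly one preferred domain */
            if (cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
                sizeof(mask), &mask, DOMAINSET_POLICY_PREFER) != 0)
                    err(1, "cpuset_setdomain");
            return (0);
    }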
 2237 
 2238 #ifdef DDB
 2239 
 2240 static void
 2241 ddb_display_bitset(const struct bitset *set, int size)
 2242 {
 2243         int bit, once;
 2244 
 2245         for (once = 0, bit = 0; bit < size; bit++) {
 2246                 if (CPU_ISSET(bit, set)) {
 2247                         if (once == 0) {
 2248                                 db_printf("%d", bit);
 2249                                 once = 1;
 2250                         } else
 2251                                 db_printf(",%d", bit);
 2252                 }
 2253         }
 2254         if (once == 0)
 2255                 db_printf("<none>");
 2256 }
 2257 
 2258 void
 2259 ddb_display_cpuset(const cpuset_t *set)
 2260 {
 2261         ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE);
 2262 }
 2263 
 2264 static void
 2265 ddb_display_domainset(const domainset_t *set)
 2266 {
 2267         ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE);
 2268 }
 2269 
 2270 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 2271 {
 2272         struct cpuset *set;
 2273 
 2274         LIST_FOREACH(set, &cpuset_ids, cs_link) {
 2275                 db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 2276                     set, set->cs_id, set->cs_ref, set->cs_flags,
 2277                     (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
 2278                 db_printf("  cpu mask=");
 2279                 ddb_display_cpuset(&set->cs_mask);
 2280                 db_printf("\n");
 2281                 db_printf("  domain policy %d prefer %d mask=",
 2282                     set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
 2283                 ddb_display_domainset(&set->cs_domain->ds_mask);
 2284                 db_printf("\n");
 2285                 if (db_pager_quit)
 2286                         break;
 2287         }
 2288 }
 2289 
 2290 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
 2291 {
 2292         struct domainset *set;
 2293 
 2294         LIST_FOREACH(set, &cpuset_domains, ds_link) {
 2295                 db_printf("set=%p policy %d prefer %d cnt %d\n",
 2296                     set, set->ds_policy, set->ds_prefer, set->ds_cnt);
 2297                 db_printf("  mask =");
 2298                 ddb_display_domainset(&set->ds_mask);
 2299                 db_printf("\n");
 2300         }
 2301 }
 2302 #endif /* DDB */
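
On a kernel built with DDB, the two commands defined above are available
at the debugger prompt for inspecting every cpuset and every cached
domainset policy:

    db> show cpusets
    db> show domainsets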

Cache object: 2cbcb5f8ea12fa7c3838c7e92fc2fcbb


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.