FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_cpuset.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
    5  * All rights reserved.
    6  * 
    7  * Copyright (c) 2008 Nokia Corporation
    8  * All rights reserved.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice unmodified, this list of conditions, and the following
   15  *    disclaimer.
   16  * 2. Redistributions in binary form must reproduce the above copyright
   17  *    notice, this list of conditions and the following disclaimer in the
   18  *    documentation and/or other materials provided with the distribution.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   30  *
   31  */
   32 
   33 #include <sys/cdefs.h>
   34 __FBSDID("$FreeBSD$");
   35 
   36 #include "opt_ddb.h"
   37 
   38 #include <sys/param.h>
   39 #include <sys/systm.h>
   40 #include <sys/sysctl.h>
   41 #include <sys/ctype.h>
   42 #include <sys/sysproto.h>
   43 #include <sys/jail.h>
   44 #include <sys/kernel.h>
   45 #include <sys/lock.h>
   46 #include <sys/malloc.h>
   47 #include <sys/mutex.h>
   48 #include <sys/priv.h>
   49 #include <sys/proc.h>
   50 #include <sys/refcount.h>
   51 #include <sys/sched.h>
   52 #include <sys/smp.h>
   53 #include <sys/syscallsubr.h>
   54 #include <sys/capsicum.h>
   55 #include <sys/cpuset.h>
   56 #include <sys/domainset.h>
   57 #include <sys/sx.h>
   58 #include <sys/queue.h>
   59 #include <sys/libkern.h>
   60 #include <sys/limits.h>
   61 #include <sys/bus.h>
   62 #include <sys/interrupt.h>
   63 #include <sys/vmmeter.h>
   64 
   65 #include <vm/uma.h>
   66 #include <vm/vm.h>
   67 #include <vm/vm_object.h>
   68 #include <vm/vm_page.h>
   69 #include <vm/vm_pageout.h>
   70 #include <vm/vm_extern.h>
   71 #include <vm/vm_param.h>
   72 #include <vm/vm_phys.h>
   73 #include <vm/vm_pagequeue.h>
   74 
   75 #ifdef DDB
   76 #include <ddb/ddb.h>
   77 #endif /* DDB */
   78 
   79 /*
   80  * cpusets provide a mechanism for creating and manipulating sets of
   81  * processors for the purpose of constraining the scheduling of threads to
   82  * specific processors.
   83  *
    84  * Each process belongs to an identified set; by default this is set 1.  Each
   85  * thread may further restrict the cpus it may run on to a subset of this
   86  * named set.  This creates an anonymous set which other threads and processes
   87  * may not join by number.
   88  *
   89  * The named set is referred to herein as the 'base' set to avoid ambiguity.
   90  * This set is usually a child of a 'root' set while the anonymous set may
    91  * simply be referred to as a mask.  In the syscall API these are referred to
   92  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
   93  *
   94  * Threads inherit their set from their creator whether it be anonymous or
   95  * not.  This means that anonymous sets are immutable because they may be
   96  * shared.  To modify an anonymous set a new set is created with the desired
   97  * mask and the same parent as the existing anonymous set.  This gives the
   98  * illusion of each thread having a private mask.
   99  *
   100  * Via the syscall APIs a user may ask to retrieve or modify the root, base,
   101  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
   102  * modifies all numbered and anonymous child sets to comply with the new mask.
   103  * Modifying a pid or tid's mask applies only to that tid, but the mask must
   104  * still fall within the assigned parent set.
  105  *
  106  * A thread may not be assigned to a group separate from other threads in
  107  * the process.  This is to remove ambiguity when the setid is queried with
  108  * a pid argument.  There is no other technical limitation.
  109  *
  110  * This somewhat complex arrangement is intended to make it easy for
  111  * applications to query available processors and bind their threads to
  112  * specific processors while also allowing administrators to dynamically
  113  * reprovision by changing sets which apply to groups of processes.
  114  *
   115  * A simple application should not concern itself with sets at all; it
   116  * should instead apply masks to its own threads via CPU_WHICH_TID and a -1
   117  * id meaning 'curthread'.  It may query available cpus for that tid with a
  118  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  119  */
  120 
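/*
 * Illustrative userland sketch (not part of this file): pinning the calling
 * thread as the comment above suggests, via the cpuset_getaffinity() and
 * cpuset_setaffinity() syscalls from cpuset(2).  The choice of CPU 0 is an
 * arbitrary example.
 */
#if 0
#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>

static void
pin_self_to_cpu0(void)
{
	cpuset_t avail, mask;

	/* Query the cpus available to the current thread's base set. */
	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
	    sizeof(avail), &avail) != 0)
		err(1, "cpuset_getaffinity");
	if (!CPU_ISSET(0, &avail))
		errx(1, "CPU 0 is not in this thread's base set");
	/* Apply an anonymous mask to the current thread only. */
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
}
#endif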
  121 LIST_HEAD(domainlist, domainset);
  122 struct domainset __read_mostly domainset_fixed[MAXMEMDOM];
  123 struct domainset __read_mostly domainset_prefer[MAXMEMDOM];
  124 struct domainset __read_mostly domainset_roundrobin;
  125 
  126 static uma_zone_t cpuset_zone;
  127 static uma_zone_t domainset_zone;
  128 static struct mtx cpuset_lock;
  129 static struct setlist cpuset_ids;
  130 static struct domainlist cpuset_domains;
  131 static struct unrhdr *cpuset_unr;
  132 static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
  133 static struct domainset domainset0, domainset2;
  134 
  135 /* Return the size of cpuset_t at the kernel level */
  136 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
  137     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
  138 
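/*
 * Illustrative userland sketch (not part of this file): portable code can
 * read this sysctl to learn the kernel's cpuset_t size before calling the
 * cpuset syscalls with a matching setsize.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>

static size_t
kernel_cpusetsize(void)
{
	int sz;
	size_t len = sizeof(sz);

	if (sysctlbyname("kern.sched.cpusetsize", &sz, &len, NULL, 0) != 0)
		err(1, "sysctlbyname");
	return ((size_t)sz);
}
#endif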
  139 cpuset_t *cpuset_root;
  140 cpuset_t cpuset_domain[MAXMEMDOM];
  141 
  142 static int domainset_valid(const struct domainset *, const struct domainset *);
  143 
  144 /*
  145  * Find the first non-anonymous set starting from 'set'.
  146  */
  147 static struct cpuset *
  148 cpuset_getbase(struct cpuset *set)
  149 {
  150 
  151         if (set->cs_id == CPUSET_INVALID)
  152                 set = set->cs_parent;
  153         return (set);
  154 }
  155 
  156 /*
  157  * Walks up the tree from 'set' to find the root.
  158  */
  159 static struct cpuset *
  160 cpuset_getroot(struct cpuset *set)
  161 {
  162 
  163         while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL)
  164                 set = set->cs_parent;
  165         return (set);
  166 }
  167 
  168 /*
  169  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  170  */
  171 struct cpuset *
  172 cpuset_ref(struct cpuset *set)
  173 {
  174 
  175         refcount_acquire(&set->cs_ref);
  176         return (set);
  177 }
  178 
  179 /*
  180  * Walks up the tree from 'set' to find the root.  Returns the root
  181  * referenced.
  182  */
  183 static struct cpuset *
  184 cpuset_refroot(struct cpuset *set)
  185 {
  186 
  187         return (cpuset_ref(cpuset_getroot(set)));
  188 }
  189 
  190 /*
  191  * Find the first non-anonymous set starting from 'set'.  Returns this set
  192  * referenced.  May return the passed in set with an extra ref if it is
  193  * not anonymous. 
  194  */
  195 static struct cpuset *
  196 cpuset_refbase(struct cpuset *set)
  197 {
  198 
  199         return (cpuset_ref(cpuset_getbase(set)));
  200 }
  201 
  202 /*
  203  * Release a reference in a context where it is safe to allocate.
  204  */
  205 void
  206 cpuset_rel(struct cpuset *set)
  207 {
  208         cpusetid_t id;
  209 
  210         if (refcount_release_if_not_last(&set->cs_ref))
  211                 return;
  212         mtx_lock_spin(&cpuset_lock);
  213         if (!refcount_release(&set->cs_ref)) {
  214                 mtx_unlock_spin(&cpuset_lock);
  215                 return;
  216         }
  217         LIST_REMOVE(set, cs_siblings);
  218         id = set->cs_id;
  219         if (id != CPUSET_INVALID)
  220                 LIST_REMOVE(set, cs_link);
  221         mtx_unlock_spin(&cpuset_lock);
  222         cpuset_rel(set->cs_parent);
  223         uma_zfree(cpuset_zone, set);
  224         if (id != CPUSET_INVALID)
  225                 free_unr(cpuset_unr, id);
  226 }
  227 
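/*
 * The release path above follows a common two-stage pattern: non-final
 * references are dropped without the spin lock, and the lock is taken only
 * for the final release so it can race safely with cpuset_lookup(), which
 * takes new references under the same lock.  A generic sketch of the
 * pattern (illustrative only; obj, obj_lock, and M_OBJ are hypothetical):
 */
#if 0
struct obj {
	LIST_ENTRY(obj)	link;
	u_int		ref;
};

static void
obj_rel(struct obj *o)
{
	/* Fast path: this was not the last reference. */
	if (refcount_release_if_not_last(&o->ref))
		return;
	/* Slow path: serialize against lookups that may re-reference. */
	mtx_lock(&obj_lock);
	if (!refcount_release(&o->ref)) {
		mtx_unlock(&obj_lock);
		return;
	}
	LIST_REMOVE(o, link);
	mtx_unlock(&obj_lock);
	free(o, M_OBJ);
}
#endif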
  228 /*
  229  * Deferred release must be used when in a context that is not safe to
  230  * allocate/free.  This places any unreferenced sets on the list 'head'.
  231  */
  232 static void
  233 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
  234 {
  235 
  236         if (refcount_release_if_not_last(&set->cs_ref))
  237                 return;
  238         mtx_lock_spin(&cpuset_lock);
  239         if (!refcount_release(&set->cs_ref)) {
  240                 mtx_unlock_spin(&cpuset_lock);
  241                 return;
  242         }
  243         LIST_REMOVE(set, cs_siblings);
  244         if (set->cs_id != CPUSET_INVALID)
  245                 LIST_REMOVE(set, cs_link);
  246         LIST_INSERT_HEAD(head, set, cs_link);
  247         mtx_unlock_spin(&cpuset_lock);
  248 }
  249 
  250 /*
  251  * Complete a deferred release.  Removes the set from the list provided to
  252  * cpuset_rel_defer.
  253  */
  254 static void
  255 cpuset_rel_complete(struct cpuset *set)
  256 {
  257         cpusetid_t id;
  258 
  259         id = set->cs_id;
  260         LIST_REMOVE(set, cs_link);
  261         cpuset_rel(set->cs_parent);
  262         uma_zfree(cpuset_zone, set);
  263         if (id != CPUSET_INVALID)
  264                 free_unr(cpuset_unr, id);
  265 }
  266 
  267 /*
  268  * Find a set based on an id.  Returns it with a ref.
  269  */
  270 static struct cpuset *
  271 cpuset_lookup(cpusetid_t setid, struct thread *td)
  272 {
  273         struct cpuset *set;
  274 
  275         if (setid == CPUSET_INVALID)
  276                 return (NULL);
  277         mtx_lock_spin(&cpuset_lock);
  278         LIST_FOREACH(set, &cpuset_ids, cs_link)
  279                 if (set->cs_id == setid)
  280                         break;
  281         if (set)
  282                 cpuset_ref(set);
  283         mtx_unlock_spin(&cpuset_lock);
  284 
  285         KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
  286         if (set != NULL && jailed(td->td_ucred)) {
  287                 struct cpuset *jset, *tset;
  288 
  289                 jset = td->td_ucred->cr_prison->pr_cpuset;
  290                 for (tset = set; tset != NULL; tset = tset->cs_parent)
  291                         if (tset == jset)
  292                                 break;
  293                 if (tset == NULL) {
  294                         cpuset_rel(set);
  295                         set = NULL;
  296                 }
  297         }
  298 
  299         return (set);
  300 }
  301 
  302 /*
  303  * Initialize a set in the space provided in 'set' with the provided parameters.
  304  * The set is returned with a single ref.  May return EDEADLK if the set
  305  * will have no valid cpu based on restrictions from the parent.
  306  */
  307 static int
  308 cpuset_init(struct cpuset *set, struct cpuset *parent,
  309     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
  310 {
  311 
  312         if (domain == NULL)
  313                 domain = parent->cs_domain;
  314         if (mask == NULL)
  315                 mask = &parent->cs_mask;
  316         if (!CPU_OVERLAP(&parent->cs_mask, mask))
  317                 return (EDEADLK);
  318         /* The domain must be prepared ahead of time. */
  319         if (!domainset_valid(parent->cs_domain, domain))
  320                 return (EDEADLK);
  321         CPU_COPY(mask, &set->cs_mask);
  322         LIST_INIT(&set->cs_children);
  323         refcount_init(&set->cs_ref, 1);
  324         set->cs_flags = 0;
  325         mtx_lock_spin(&cpuset_lock);
  326         set->cs_domain = domain;
  327         CPU_AND(&set->cs_mask, &parent->cs_mask);
  328         set->cs_id = id;
  329         set->cs_parent = cpuset_ref(parent);
  330         LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
  331         if (set->cs_id != CPUSET_INVALID)
  332                 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
  333         mtx_unlock_spin(&cpuset_lock);
  334 
  335         return (0);
  336 }
  337 
  338 /*
  339  * Create a new non-anonymous set with the requested parent and mask.  May
   340  * return failures if the mask is invalid or a new number cannot be
  341  * allocated.
  342  *
  343  * If *setp is not NULL, then it will be used as-is.  The caller must take
  344  * into account that *setp will be inserted at the head of cpuset_ids and
  345  * plan any potentially conflicting cs_link usage accordingly.
  346  */
  347 static int
  348 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
  349 {
  350         struct cpuset *set;
  351         cpusetid_t id;
  352         int error;
  353         bool dofree;
  354 
  355         id = alloc_unr(cpuset_unr);
  356         if (id == -1)
  357                 return (ENFILE);
  358         dofree = (*setp == NULL);
  359         if (*setp != NULL)
  360                 set = *setp;
  361         else
  362                 *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
  363         error = cpuset_init(set, parent, mask, NULL, id);
  364         if (error == 0)
  365                 return (0);
  366         free_unr(cpuset_unr, id);
  367         if (dofree)
  368                 uma_zfree(cpuset_zone, set);
  369 
  370         return (error);
  371 }
  372 
  373 static void
  374 cpuset_freelist_add(struct setlist *list, int count)
  375 {
  376         struct cpuset *set;
  377         int i;
  378 
  379         for (i = 0; i < count; i++) {
  380                 set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
  381                 LIST_INSERT_HEAD(list, set, cs_link);
  382         }
  383 }
  384 
  385 static void
  386 cpuset_freelist_init(struct setlist *list, int count)
  387 {
  388 
  389         LIST_INIT(list);
  390         cpuset_freelist_add(list, count);
  391 }
  392 
  393 static void
  394 cpuset_freelist_free(struct setlist *list)
  395 {
  396         struct cpuset *set;
  397 
  398         while ((set = LIST_FIRST(list)) != NULL) {
  399                 LIST_REMOVE(set, cs_link);
  400                 uma_zfree(cpuset_zone, set);
  401         }
  402 }
  403 
  404 static void
  405 domainset_freelist_add(struct domainlist *list, int count)
  406 {
  407         struct domainset *set;
  408         int i;
  409 
  410         for (i = 0; i < count; i++) {
  411                 set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
  412                 LIST_INSERT_HEAD(list, set, ds_link);
  413         }
  414 }
  415 
  416 static void
  417 domainset_freelist_init(struct domainlist *list, int count)
  418 {
  419 
  420         LIST_INIT(list);
  421         domainset_freelist_add(list, count);
  422 }
  423 
  424 static void
  425 domainset_freelist_free(struct domainlist *list)
  426 {
  427         struct domainset *set;
  428 
  429         while ((set = LIST_FIRST(list)) != NULL) {
  430                 LIST_REMOVE(set, ds_link);
  431                 uma_zfree(domainset_zone, set);
  432         }
  433 }
  434 
  435 /* Copy a domainset preserving mask and policy. */
  436 static void
  437 domainset_copy(const struct domainset *from, struct domainset *to)
  438 {
  439 
  440         DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
  441         to->ds_policy = from->ds_policy;
  442         to->ds_prefer = from->ds_prefer;
  443 }
  444 
  445 /* Return 1 if mask and policy are equal, otherwise 0. */
  446 static int
  447 domainset_equal(const struct domainset *one, const struct domainset *two)
  448 {
  449 
  450         return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 &&
  451             one->ds_policy == two->ds_policy &&
  452             one->ds_prefer == two->ds_prefer);
  453 }
  454 
  455 /* Return 1 if child is a valid subset of parent. */
  456 static int
  457 domainset_valid(const struct domainset *parent, const struct domainset *child)
  458 {
  459         if (child->ds_policy != DOMAINSET_POLICY_PREFER)
  460                 return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
  461         return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
  462 }
  463 
  464 static int
  465 domainset_restrict(const struct domainset *parent,
  466     const struct domainset *child)
  467 {
  468         if (child->ds_policy != DOMAINSET_POLICY_PREFER)
  469                 return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
  470         return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
  471 }
  472 
  473 /*
  474  * Lookup or create a domainset.  The key is provided in ds_mask and
  475  * ds_policy.  If the domainset does not yet exist the storage in
  476  * 'domain' is used to insert.  Otherwise this storage is freed to the
  477  * domainset_zone and the existing domainset is returned.
  478  */
  479 static struct domainset *
  480 _domainset_create(struct domainset *domain, struct domainlist *freelist)
  481 {
  482         struct domainset *ndomain;
  483         int i, j;
  484 
  485         KASSERT(domain->ds_cnt <= vm_ndomains,
  486             ("invalid domain count in domainset %p", domain));
  487         KASSERT(domain->ds_policy != DOMAINSET_POLICY_PREFER ||
  488             domain->ds_prefer < vm_ndomains,
  489             ("invalid preferred domain in domains %p", domain));
  490 
  491         mtx_lock_spin(&cpuset_lock);
  492         LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
  493                 if (domainset_equal(ndomain, domain))
  494                         break;
  495         /*
  496          * If the domain does not yet exist we insert it and initialize
  497          * various iteration helpers which are not part of the key.
  498          */
  499         if (ndomain == NULL) {
  500                 LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
  501                 domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
  502                 for (i = 0, j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++)
  503                         if (DOMAINSET_ISSET(i, &domain->ds_mask))
  504                                 domain->ds_order[j++] = i;
  505         }
  506         mtx_unlock_spin(&cpuset_lock);
  507         if (ndomain == NULL)
  508                 return (domain);
  509         if (freelist != NULL)
  510                 LIST_INSERT_HEAD(freelist, domain, ds_link);
  511         else
  512                 uma_zfree(domainset_zone, domain);
  513         return (ndomain);
  514 
  515 }
  516 
  517 /*
  518  * Are any of the domains in the mask empty?  If so, silently
  519  * remove them and update the domainset accordingly.  If only empty
  520  * domains are present, we must return failure.
  521  */
  522 static bool
  523 domainset_empty_vm(struct domainset *domain)
  524 {
  525         domainset_t empty;
  526         int i, j;
  527 
  528         DOMAINSET_ZERO(&empty);
  529         for (i = 0; i < vm_ndomains; i++)
  530                 if (VM_DOMAIN_EMPTY(i))
  531                         DOMAINSET_SET(i, &empty);
  532         if (DOMAINSET_SUBSET(&empty, &domain->ds_mask))
  533                 return (true);
  534 
  535         /* Remove empty domains from the set and recompute. */
  536         DOMAINSET_ANDNOT(&domain->ds_mask, &empty);
  537         domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
  538         for (i = j = 0; i < DOMAINSET_FLS(&domain->ds_mask); i++)
  539                 if (DOMAINSET_ISSET(i, &domain->ds_mask))
  540                         domain->ds_order[j++] = i;
  541 
  542         /* Convert a PREFER policy referencing an empty domain to RR. */
  543         if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
  544             DOMAINSET_ISSET(domain->ds_prefer, &empty)) {
  545                 domain->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
  546                 domain->ds_prefer = -1;
  547         }
  548 
  549         return (false);
  550 }
  551 
  552 /*
  553  * Create or lookup a domainset based on the key held in 'domain'.
  554  */
  555 struct domainset *
  556 domainset_create(const struct domainset *domain)
  557 {
  558         struct domainset *ndomain;
  559 
  560         /*
   561          * Validate the policy.  It must specify a usable policy number with
  562          * only valid domains.  Preferred must include the preferred domain
  563          * in the mask.
  564          */
  565         if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
  566             domain->ds_policy > DOMAINSET_POLICY_MAX)
  567                 return (NULL);
  568         if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
  569             !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
  570                 return (NULL);
  571         if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
  572                 return (NULL);
  573         ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
  574         domainset_copy(domain, ndomain);
  575         return _domainset_create(ndomain, NULL);
  576 }
  577 
  578 /*
  579  * Update thread domainset pointers.
  580  */
  581 static void
  582 domainset_notify(void)
  583 {
  584         struct thread *td;
  585         struct proc *p;
  586 
  587         sx_slock(&allproc_lock);
  588         FOREACH_PROC_IN_SYSTEM(p) {
  589                 PROC_LOCK(p);
  590                 if (p->p_state == PRS_NEW) {
  591                         PROC_UNLOCK(p);
  592                         continue;
  593                 }
  594                 FOREACH_THREAD_IN_PROC(p, td) {
  595                         thread_lock(td);
  596                         td->td_domain.dr_policy = td->td_cpuset->cs_domain;
  597                         thread_unlock(td);
  598                 }
  599                 PROC_UNLOCK(p);
  600         }
  601         sx_sunlock(&allproc_lock);
  602         kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
  603 }
  604 
  605 /*
  606  * Create a new set that is a subset of a parent.
  607  */
  608 static struct domainset *
  609 domainset_shadow(const struct domainset *pdomain,
  610     const struct domainset *domain, struct domainlist *freelist)
  611 {
  612         struct domainset *ndomain;
  613 
  614         ndomain = LIST_FIRST(freelist);
  615         LIST_REMOVE(ndomain, ds_link);
  616 
  617         /*
  618          * Initialize the key from the request.
  619          */
  620         domainset_copy(domain, ndomain);
  621 
  622         /*
  623          * Restrict the key by the parent.
  624          */
  625         DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
  626 
  627         return _domainset_create(ndomain, freelist);
  628 }
  629 
  630 /*
  631  * Recursively check for errors that would occur from applying mask to
  632  * the tree of sets starting at 'set'.  Checks for sets that would become
  633  * empty as well as RDONLY flags.
  634  */
  635 static int
  636 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int augment_mask)
  637 {
  638         struct cpuset *nset;
  639         cpuset_t newmask;
  640         int error;
  641 
  642         mtx_assert(&cpuset_lock, MA_OWNED);
  643         if (set->cs_flags & CPU_SET_RDONLY)
  644                 return (EPERM);
  645         if (augment_mask) {
  646                 CPU_COPY(&set->cs_mask, &newmask);
  647                 CPU_AND(&newmask, mask);
  648         } else
  649                 CPU_COPY(mask, &newmask);
  650 
  651         if (CPU_EMPTY(&newmask))
  652                 return (EDEADLK);
  653         error = 0;
  654         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  655                 if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
  656                         break;
  657         return (error);
  658 }
  659 
  660 /*
  661  * Applies the mask 'mask' without checking for empty sets or permissions.
  662  */
  663 static void
  664 cpuset_update(struct cpuset *set, cpuset_t *mask)
  665 {
  666         struct cpuset *nset;
  667 
  668         mtx_assert(&cpuset_lock, MA_OWNED);
  669         CPU_AND(&set->cs_mask, mask);
  670         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  671                 cpuset_update(nset, &set->cs_mask);
  672 
  673         return;
  674 }
  675 
  676 /*
  677  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
  678  * mask to restrict all children in the tree.  Checks for validity before
  679  * applying the changes.
  680  */
  681 static int
  682 cpuset_modify(struct cpuset *set, cpuset_t *mask)
  683 {
  684         struct cpuset *root;
  685         int error;
  686 
  687         error = priv_check(curthread, PRIV_SCHED_CPUSET);
  688         if (error)
  689                 return (error);
  690         /*
  691          * In case we are called from within the jail,
  692          * we do not allow modifying the dedicated root
  693          * cpuset of the jail but may still allow to
  694          * change child sets, including subordinate jails'
  695          * roots.
  696          */
  697         if ((set->cs_flags & CPU_SET_ROOT) != 0 &&
  698             jailed(curthread->td_ucred) &&
  699             set == curthread->td_ucred->cr_prison->pr_cpuset)
  700                 return (EPERM);
  701         /*
  702          * Verify that we have access to this set of
  703          * cpus.
  704          */
  705         if ((set->cs_flags & (CPU_SET_ROOT | CPU_SET_RDONLY)) == CPU_SET_ROOT) {
  706                 KASSERT(set->cs_parent != NULL,
  707                     ("jail.cpuset=%d is not a proper child of parent jail's root.",
  708                     set->cs_id));
  709 
  710                 /*
  711                  * cpuset_getroot() cannot work here due to how top-level jail
  712                  * roots are constructed.  Top-level jails are parented to
  713                  * thread0's cpuset (i.e. cpuset 1) rather than the system root.
  714                  */
  715                 root = set->cs_parent;
  716         } else {
  717                 root = cpuset_getroot(set);
  718         }
  719         mtx_lock_spin(&cpuset_lock);
  720         if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
  721                 error = EINVAL;
  722                 goto out;
  723         }
  724         error = cpuset_testupdate(set, mask, 0);
  725         if (error)
  726                 goto out;
  727         CPU_COPY(mask, &set->cs_mask);
  728         cpuset_update(set, mask);
  729 out:
  730         mtx_unlock_spin(&cpuset_lock);
  731 
  732         return (error);
  733 }
  734 
  735 /*
  736  * Recursively check for errors that would occur from applying mask to
  737  * the tree of sets starting at 'set'.  Checks for sets that would become
  738  * empty as well as RDONLY flags.
  739  */
  740 static int
  741 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
  742     struct domainset *orig, int *count, int augment_mask __unused)
  743 {
  744         struct cpuset *nset;
  745         struct domainset *domain;
  746         struct domainset newset;
  747         int error;
  748 
  749         mtx_assert(&cpuset_lock, MA_OWNED);
  750         if (set->cs_flags & CPU_SET_RDONLY)
  751                 return (EPERM);
  752         domain = set->cs_domain;
  753         domainset_copy(domain, &newset);
  754         if (!domainset_equal(domain, orig)) {
  755                 if (!domainset_restrict(domain, dset))
  756                         return (EDEADLK);
  757                 DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
  758                 /* Count the number of domains that are changing. */
  759                 (*count)++;
  760         }
  761         error = 0;
  762         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  763                 if ((error = cpuset_testupdate_domain(nset, &newset, domain,
  764                     count, 1)) != 0)
  765                         break;
  766         return (error);
  767 }
  768 
  769 /*
  770  * Applies the mask 'mask' without checking for empty sets or permissions.
  771  */
  772 static void
  773 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
  774     struct domainset *orig, struct domainlist *domains)
  775 {
  776         struct cpuset *nset;
  777 
  778         mtx_assert(&cpuset_lock, MA_OWNED);
  779         /*
  780          * If this domainset has changed from the parent we must calculate
  781          * a new set.  Otherwise it simply inherits from the parent.  When
  782          * we inherit from the parent we get a new mask and policy.  If the
  783          * set is modified from the parent we keep the policy and only
  784          * update the mask.
  785          */
  786         if (set->cs_domain != orig) {
  787                 orig = set->cs_domain;
  788                 set->cs_domain = domainset_shadow(domain, orig, domains);
  789         } else
  790                 set->cs_domain = domain;
  791         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
  792                 cpuset_update_domain(nset, set->cs_domain, orig, domains);
  793 
  794         return;
  795 }
  796 
  797 /*
   798  * Modify the set 'set' to use a copy of the domainset provided.  Apply this new
  799  * mask to restrict all children in the tree.  Checks for validity before
  800  * applying the changes.
  801  */
  802 static int
  803 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
  804 {
  805         struct domainlist domains;
  806         struct domainset temp;
  807         struct domainset *dset;
  808         struct cpuset *root;
  809         int ndomains, needed;
  810         int error;
  811 
  812         error = priv_check(curthread, PRIV_SCHED_CPUSET);
  813         if (error)
  814                 return (error);
  815         /*
  816          * In case we are called from within the jail
  817          * we do not allow modifying the dedicated root
  818          * cpuset of the jail but may still allow to
  819          * change child sets.
  820          */
  821         if (jailed(curthread->td_ucred) &&
  822             set->cs_flags & CPU_SET_ROOT)
  823                 return (EPERM);
  824         domainset_freelist_init(&domains, 0);
  825         domain = domainset_create(domain);
  826         ndomains = 0;
  827 
  828         mtx_lock_spin(&cpuset_lock);
  829         for (;;) {
  830                 root = cpuset_getroot(set);
  831                 dset = root->cs_domain;
  832                 /*
  833                  * Verify that we have access to this set of domains.
  834                  */
  835                 if (!domainset_valid(dset, domain)) {
  836                         error = EINVAL;
  837                         goto out;
  838                 }
  839                 /*
  840                  * If applying prefer we keep the current set as the fallback.
  841                  */
  842                 if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
  843                         DOMAINSET_COPY(&set->cs_domain->ds_mask,
  844                             &domain->ds_mask);
  845                 /*
  846                  * Determine whether we can apply this set of domains and
  847                  * how many new domain structures it will require.
  848                  */
  849                 domainset_copy(domain, &temp);
  850                 needed = 0;
  851                 error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
  852                     &needed, 0);
  853                 if (error)
  854                         goto out;
  855                 if (ndomains >= needed)
  856                         break;
  857 
   858                 /* Dropping the lock; we'll need to re-evaluate. */
  859                 mtx_unlock_spin(&cpuset_lock);
  860                 domainset_freelist_add(&domains, needed - ndomains);
  861                 ndomains = needed;
  862                 mtx_lock_spin(&cpuset_lock);
  863         }
  864         dset = set->cs_domain;
  865         cpuset_update_domain(set, domain, dset, &domains);
  866 out:
  867         mtx_unlock_spin(&cpuset_lock);
  868         domainset_freelist_free(&domains);
  869         if (error == 0)
  870                 domainset_notify();
  871 
  872         return (error);
  873 }
  874 
  875 /*
   876  * Resolve the 'which' parameter of several cpuset APIs.
  877  *
  878  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
  879  * checks for permission via p_cansched().
  880  *
  881  * For WHICH_SET returns a valid set with a new reference.
  882  *
  883  * -1 may be supplied for any argument to mean the current proc/thread or
  884  * the base set of the current thread.  May fail with ESRCH/EPERM.
  885  */
  886 int
  887 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
  888     struct cpuset **setp)
  889 {
  890         struct cpuset *set;
  891         struct thread *td;
  892         struct proc *p;
  893         int error;
  894 
  895         *pp = p = NULL;
  896         *tdp = td = NULL;
  897         *setp = set = NULL;
  898         switch (which) {
  899         case CPU_WHICH_PID:
  900                 if (id == -1) {
  901                         PROC_LOCK(curproc);
  902                         p = curproc;
  903                         break;
  904                 }
  905                 if ((p = pfind(id)) == NULL)
  906                         return (ESRCH);
  907                 break;
  908         case CPU_WHICH_TID:
  909                 if (id == -1) {
  910                         PROC_LOCK(curproc);
  911                         p = curproc;
  912                         td = curthread;
  913                         break;
  914                 }
  915                 td = tdfind(id, -1);
  916                 if (td == NULL)
  917                         return (ESRCH);
  918                 p = td->td_proc;
  919                 break;
  920         case CPU_WHICH_CPUSET:
  921                 if (id == -1) {
  922                         thread_lock(curthread);
  923                         set = cpuset_refbase(curthread->td_cpuset);
  924                         thread_unlock(curthread);
  925                 } else
  926                         set = cpuset_lookup(id, curthread);
  927                 if (set) {
  928                         *setp = set;
  929                         return (0);
  930                 }
  931                 return (ESRCH);
  932         case CPU_WHICH_JAIL:
  933         {
  934                 /* Find `set' for prison with given id. */
  935                 struct prison *pr;
  936 
  937                 sx_slock(&allprison_lock);
  938                 pr = prison_find_child(curthread->td_ucred->cr_prison, id);
  939                 sx_sunlock(&allprison_lock);
  940                 if (pr == NULL)
  941                         return (ESRCH);
  942                 cpuset_ref(pr->pr_cpuset);
  943                 *setp = pr->pr_cpuset;
  944                 mtx_unlock(&pr->pr_mtx);
  945                 return (0);
  946         }
  947         case CPU_WHICH_IRQ:
  948         case CPU_WHICH_DOMAIN:
  949                 return (0);
  950         default:
  951                 return (EINVAL);
  952         }
  953         error = p_cansched(curthread, p);
  954         if (error) {
  955                 PROC_UNLOCK(p);
  956                 return (error);
  957         }
  958         if (td == NULL)
  959                 td = FIRST_THREAD_IN_PROC(p);
  960         *pp = p;
  961         *tdp = td;
  962         return (0);
  963 }
  964 
  965 static int
  966 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
  967     const struct domainset *domain)
  968 {
  969         struct cpuset *parent;
  970         struct domainset *dset;
  971 
  972         parent = cpuset_getbase(set);
  973         /*
  974          * If we are restricting a cpu mask it must be a subset of the
  975          * parent or invalid CPUs have been specified.
  976          */
  977         if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
  978                 return (EINVAL);
  979 
  980         /*
  981          * If we are restricting a domain mask it must be a subset of the
  982          * parent or invalid domains have been specified.
  983          */
  984         dset = parent->cs_domain;
  985         if (domain != NULL && !domainset_valid(dset, domain))
  986                 return (EINVAL);
  987 
  988         return (0);
  989 }
  990 
  991 /*
  992  * Create an anonymous set with the provided mask in the space provided by
  993  * 'nset'.  If the passed in set is anonymous we use its parent otherwise
  994  * the new set is a child of 'set'.
  995  */
  996 static int
  997 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
  998    const cpuset_t *mask, const struct domainset *domain,
  999    struct setlist *cpusets, struct domainlist *domains)
 1000 {
 1001         struct cpuset *parent;
 1002         struct cpuset *nset;
 1003         struct domainset *dset;
 1004         struct domainset *d;
 1005         int error;
 1006 
 1007         error = cpuset_testshadow(set, mask, domain);
 1008         if (error)
 1009                 return (error);
 1010 
 1011         parent = cpuset_getbase(set);
 1012         dset = parent->cs_domain;
 1013         if (mask == NULL)
 1014                 mask = &set->cs_mask;
 1015         if (domain != NULL)
 1016                 d = domainset_shadow(dset, domain, domains);
 1017         else
 1018                 d = set->cs_domain;
 1019         nset = LIST_FIRST(cpusets);
 1020         error = cpuset_init(nset, parent, mask, d, CPUSET_INVALID);
 1021         if (error == 0) {
 1022                 LIST_REMOVE(nset, cs_link);
 1023                 *nsetp = nset;
 1024         }
 1025         return (error);
 1026 }
 1027 
 1028 static struct cpuset *
 1029 cpuset_update_thread(struct thread *td, struct cpuset *nset)
 1030 {
 1031         struct cpuset *tdset;
 1032 
 1033         tdset = td->td_cpuset;
 1034         td->td_cpuset = nset;
 1035         td->td_domain.dr_policy = nset->cs_domain;
 1036         sched_affinity(td);
 1037 
 1038         return (tdset);
 1039 }
 1040 
 1041 static int
 1042 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
 1043     struct domainset *domain)
 1044 {
 1045         struct cpuset *parent;
 1046 
 1047         parent = cpuset_getbase(tdset);
 1048         if (mask == NULL)
 1049                 mask = &tdset->cs_mask;
 1050         if (domain == NULL)
 1051                 domain = tdset->cs_domain;
 1052         return cpuset_testshadow(parent, mask, domain);
 1053 }
 1054 
 1055 static int
 1056 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
 1057     struct domainset *domain, struct cpuset **nsetp,
 1058     struct setlist *freelist, struct domainlist *domainlist)
 1059 {
 1060         struct cpuset *parent;
 1061 
 1062         parent = cpuset_getbase(tdset);
 1063         if (mask == NULL)
 1064                 mask = &tdset->cs_mask;
 1065         if (domain == NULL)
 1066                 domain = tdset->cs_domain;
 1067         return cpuset_shadow(parent, nsetp, mask, domain, freelist,
 1068             domainlist);
 1069 }
 1070 
 1071 static int
 1072 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
 1073     cpuset_t *mask, struct domainset *domain)
 1074 {
 1075         struct cpuset *parent;
 1076 
 1077         parent = cpuset_getbase(tdset);
 1078 
 1079         /*
 1080          * If the thread restricted its mask then apply that same
 1081          * restriction to the new set, otherwise take it wholesale.
 1082          */
 1083         if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) {
 1084                 CPU_COPY(&tdset->cs_mask, mask);
 1085                 CPU_AND(mask, &set->cs_mask);
 1086         } else
 1087                 CPU_COPY(&set->cs_mask, mask);
 1088 
 1089         /*
 1090          * If the thread restricted the domain then we apply the
 1091          * restriction to the new set but retain the policy.
 1092          */
 1093         if (tdset->cs_domain != parent->cs_domain) {
 1094                 domainset_copy(tdset->cs_domain, domain);
 1095                 DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
 1096         } else
 1097                 domainset_copy(set->cs_domain, domain);
 1098 
 1099         if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask))
 1100                 return (EDEADLK);
 1101 
 1102         return (0);
 1103 }
 1104 
 1105 static int
 1106 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
 1107 {
 1108         struct domainset domain;
 1109         cpuset_t mask;
 1110 
 1111         if (tdset->cs_id != CPUSET_INVALID)
 1112                 return (0);
 1113         return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 1114 }
 1115 
 1116 static int
 1117 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
 1118     struct cpuset **nsetp, struct setlist *freelist,
 1119     struct domainlist *domainlist)
 1120 {
 1121         struct domainset domain;
 1122         cpuset_t mask;
 1123         int error;
 1124 
 1125         /*
 1126          * If we're replacing on a thread that has not constrained the
 1127          * original set we can simply accept the new set.
 1128          */
 1129         if (tdset->cs_id != CPUSET_INVALID) {
 1130                 *nsetp = cpuset_ref(set);
 1131                 return (0);
 1132         }
 1133         error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 1134         if (error)
 1135                 return (error);
 1136 
 1137         return cpuset_shadow(set, nsetp, &mask, &domain, freelist,
 1138             domainlist);
 1139 }
 1140 
 1141 static int
 1142 cpuset_setproc_newbase(struct thread *td, struct cpuset *set,
 1143     struct cpuset *nroot, struct cpuset **nsetp,
 1144     struct setlist *cpusets, struct domainlist *domainlist)
 1145 {
 1146         struct domainset ndomain;
 1147         cpuset_t nmask;
 1148         struct cpuset *pbase;
 1149         int error;
 1150 
 1151         pbase = cpuset_getbase(td->td_cpuset);
 1152 
 1153         /* Copy process mask, then further apply the new root mask. */
 1154         CPU_COPY(&pbase->cs_mask, &nmask);
 1155         CPU_AND(&nmask, &nroot->cs_mask);
 1156 
 1157         domainset_copy(pbase->cs_domain, &ndomain);
 1158         DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask);
 1159 
 1160         /* Policy is too restrictive, will not work. */
 1161         if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask))
 1162                 return (EDEADLK);
 1163 
 1164         /*
 1165          * Remove pbase from the freelist in advance, it'll be pushed to
 1166          * cpuset_ids on success.  We assume here that cpuset_create() will not
 1167          * touch pbase on failure, and we just enqueue it back to the freelist
 1168          * to remain in a consistent state.
 1169          */
 1170         pbase = LIST_FIRST(cpusets);
 1171         LIST_REMOVE(pbase, cs_link);
 1172         error = cpuset_create(&pbase, set, &nmask);
 1173         if (error != 0) {
 1174                 LIST_INSERT_HEAD(cpusets, pbase, cs_link);
 1175                 return (error);
 1176         }
 1177 
 1178         /* Duplicates some work from above... oh well. */
 1179         pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain,
 1180             domainlist);
 1181         *nsetp = pbase;
 1182         return (0);
 1183 }
 1184 
 1185 /*
 1186  * Handle four cases for updating an entire process.
 1187  *
 1188  * 1) Set is non-null and the process is not rebasing onto a new root.  This
 1189  *    reparents all anonymous sets to the provided set and replaces all
 1190  *    non-anonymous td_cpusets with the provided set.
 1191  * 2) Set is non-null and the process is rebasing onto a new root.  This
 1192  *    creates a new base set if the process previously had its own base set,
 1193  *    then reparents all anonymous sets either to that set or the provided set
 1194  *    if one was not created.  Non-anonymous sets are similarly replaced.
 1195  * 3) Mask is non-null.  This replaces or creates anonymous sets for every
 1196  *    thread with the existing base as a parent.
 1197  * 4) domain is non-null.  This creates anonymous sets for every thread
 1198  *    and replaces the domain set.
 1199  *
 1200  * This is overly complicated because we can't allocate while holding a 
 1201  * spinlock and spinlocks must be held while changing and examining thread
 1202  * state.
 1203  */
 1204 static int
 1205 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 1206     struct domainset *domain, bool rebase)
 1207 {
 1208         struct setlist freelist;
 1209         struct setlist droplist;
 1210         struct domainlist domainlist;
 1211         struct cpuset *base, *nset, *nroot, *tdroot;
 1212         struct thread *td;
 1213         struct proc *p;
 1214         int needed;
 1215         int nfree;
 1216         int error;
 1217 
 1218         /*
 1219          * The algorithm requires two passes due to locking considerations.
 1220          * 
 1221          * 1) Lookup the process and acquire the locks in the required order.
 1222          * 2) If enough cpusets have not been allocated release the locks and
 1223          *    allocate them.  Loop.
 1224          */
 1225         cpuset_freelist_init(&freelist, 1);
 1226         domainset_freelist_init(&domainlist, 1);
 1228         LIST_INIT(&droplist);
 1229         nfree = 0;
 1230         base = set;
 1231         nroot = NULL;
 1232         if (set != NULL)
 1233                 nroot = cpuset_getroot(set);
 1234         for (;;) {
 1235                 error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 1236                 if (error)
 1237                         goto out;
 1238                 tdroot = cpuset_getroot(td->td_cpuset);
 1239                 needed = p->p_numthreads;
 1240                 if (set != NULL && rebase && tdroot != nroot)
 1241                         needed++;
 1242                 if (nfree >= needed)
 1243                         break;
 1244                 PROC_UNLOCK(p);
 1245                 if (nfree < needed) {
 1246                         cpuset_freelist_add(&freelist, needed - nfree);
 1247                         domainset_freelist_add(&domainlist, needed - nfree);
 1248                         nfree = needed;
 1249                 }
 1250         }
 1251         PROC_LOCK_ASSERT(p, MA_OWNED);
 1252 
 1253         /*
 1254          * If we're changing roots and the root set is what has been specified
 1255          * as the parent, then we'll check if the process was previously using
 1256          * the root set and, if it wasn't, create a new base with the process's
 1257          * mask applied to it.
 1258          *
 1259          * If the new root is incompatible with the existing mask, then we allow
 1260          * the process to take on the new root if and only if they have
 1261          * privilege to widen their mask anyways.  Unprivileged processes get
 1262          * rejected with EDEADLK.
 1263          */
 1264         if (set != NULL && rebase && nroot != tdroot) {
 1265                 cpusetid_t base_id, root_id;
 1266 
 1267                 root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id;
 1268                 base_id = cpuset_getbase(td->td_cpuset)->cs_id;
 1269 
 1270                 if (base_id != root_id) {
 1271                         error = cpuset_setproc_newbase(td, set, nroot, &base,
 1272                             &freelist, &domainlist);
 1273                         if (error == EDEADLK &&
 1274                             priv_check(td, PRIV_SCHED_CPUSET) == 0)
 1275                                 error = 0;
 1276                         if (error != 0)
 1277                                 goto unlock_out;
 1278                 }
 1279         }
 1280 
 1281         /*
 1282          * Now that the appropriate locks are held and we have enough cpusets,
 1283          * make sure the operation will succeed before applying changes. The
 1284          * proc lock prevents td_cpuset from changing between calls.
 1285          */
 1286         error = 0;
 1287         FOREACH_THREAD_IN_PROC(p, td) {
 1288                 thread_lock(td);
 1289                 if (set != NULL)
 1290                         error = cpuset_setproc_test_setthread(td->td_cpuset,
 1291                             base);
 1292                 else
 1293                         error = cpuset_setproc_test_maskthread(td->td_cpuset,
 1294                             mask, domain);
 1295                 thread_unlock(td);
 1296                 if (error)
 1297                         goto unlock_out;
 1298         }
 1299         /*
 1300          * Replace each thread's cpuset while using deferred release.  We
 1301          * must do this because the thread lock must be held while operating
 1302          * on the thread and this limits the type of operations allowed.
 1303          */
 1304         FOREACH_THREAD_IN_PROC(p, td) {
 1305                 thread_lock(td);
 1306                 if (set != NULL)
 1307                         error = cpuset_setproc_setthread(td->td_cpuset, base,
 1308                             &nset, &freelist, &domainlist);
 1309                 else
 1310                         error = cpuset_setproc_maskthread(td->td_cpuset, mask,
 1311                             domain, &nset, &freelist, &domainlist);
 1312                 if (error) {
 1313                         thread_unlock(td);
 1314                         break;
 1315                 }
 1316                 cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
 1317                 thread_unlock(td);
 1318         }
 1319 unlock_out:
 1320         PROC_UNLOCK(p);
 1321 out:
 1322         if (base != NULL && base != set)
 1323                 cpuset_rel(base);
 1324         while ((nset = LIST_FIRST(&droplist)) != NULL)
 1325                 cpuset_rel_complete(nset);
 1326         cpuset_freelist_free(&freelist);
 1327         domainset_freelist_free(&domainlist);
 1328         return (error);
 1329 }
 1330 
 1331 static int
 1332 bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
 1333 {
 1334         size_t bytes;
 1335         int i, once;
 1336         char *p;
 1337 
 1338         once = 0;
 1339         p = buf;
 1340         for (i = 0; i < __bitset_words(setlen); i++) {
 1341                 if (once != 0) {
 1342                         if (bufsiz < 1)
 1343                                 return (0);
 1344                         *p = ',';
 1345                         p++;
 1346                         bufsiz--;
 1347                 } else
 1348                         once = 1;
 1349                 if (bufsiz < sizeof(__STRING(ULONG_MAX)))
 1350                         return (0);
 1351                 bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
 1352                 p += bytes;
 1353                 bufsiz -= bytes;
 1354         }
 1355         return (p - buf);
 1356 }
 1357 
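/*
 * For example (illustrative): with a 128-bit set whose CPUs 0-3 are set,
 * and 64-bit longs, bitset_strprint() emits "f,0" -- one hex word per
 * long in the set, least-significant word first, separated by commas,
 * including any all-zero trailing words.
 */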
 1358 static int
 1359 bitset_strscan(struct bitset *set, int setlen, const char *buf)
 1360 {
 1361         int i, ret;
 1362         const char *p;
 1363 
 1364         BIT_ZERO(setlen, set);
 1365         p = buf;
 1366         for (i = 0; i < __bitset_words(setlen); i++) {
 1367                 if (*p == ',') {
 1368                         p++;
 1369                         continue;
 1370                 }
 1371                 ret = sscanf(p, "%lx", &set->__bits[i]);
 1372                 if (ret == 0 || ret == -1)
 1373                         break;
 1374                 while (isxdigit(*p))
 1375                         p++;
 1376         }
 1377         return (p - buf);
 1378 }
 1379 
 1380 /*
 1381  * Return a string representing a valid layout for a cpuset_t object.
  1382  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 1383  */
 1384 char *
 1385 cpusetobj_strprint(char *buf, const cpuset_t *set)
 1386 {
 1387 
 1388         bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
 1389             CPU_SETSIZE);
 1390         return (buf);
 1391 }
 1392 
 1393 /*
 1394  * Build a valid cpuset_t object from a string representation.
  1395  * It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
 1396  */
 1397 int
 1398 cpusetobj_strscan(cpuset_t *set, const char *buf)
 1399 {
 1400         char p;
 1401 
 1402         if (strlen(buf) > CPUSETBUFSIZ - 1)
 1403                 return (-1);
 1404 
 1405         p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
 1406         if (p != '\0')
 1407                 return (-1);
 1408 
 1409         return (0);
 1410 }
 1411 
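/*
 * Illustrative round trip through the two helpers above (assuming a
 * 256-bit CPU_SETSIZE and 64-bit longs, hence four words):
 */
#if 0
static void
cpuset_str_roundtrip_example(void)
{
	char buf[CPUSETBUFSIZ];
	cpuset_t set, copy;

	CPU_ZERO(&set);
	CPU_SET(2, &set);
	cpusetobj_strprint(buf, &set);	/* buf is now "4,0,0,0" */
	if (cpusetobj_strscan(&copy, buf) == 0)
		KASSERT(CPU_CMP(&set, &copy) == 0,
		    ("round trip changed the set"));
}
#endif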
 1412 /*
  1413  * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
 1414  * a domainset is in arg1.  If the user specifies a valid domainset the
 1415  * pointer is updated.
 1416  *
 1417  * Format is:
 1418  * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
 1419  */
 1420 int
 1421 sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
 1422 {
 1423         char buf[DOMAINSETBUFSIZ];
 1424         struct domainset *dset;
 1425         struct domainset key;
 1426         int policy, prefer, error;
 1427         char *p;
 1428 
 1429         dset = *(struct domainset **)arg1;
 1430         error = 0;
 1431 
 1432         if (dset != NULL) {
 1433                 p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
 1434                     (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
 1435                 sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
 1436         } else
 1437                 sprintf(buf, "<NULL>");
 1438         error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 1439         if (error != 0 || req->newptr == NULL)
 1440                 return (error);
 1441 
 1442         /*
 1443          * Read in and validate the string.
 1444          */
 1445         memset(&key, 0, sizeof(key));
 1446         p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
 1447             DOMAINSET_SETSIZE, buf)];
 1448         if (p == buf)
 1449                 return (EINVAL);
 1450         if (sscanf(p, ":%d:%d", &policy, &prefer) != 2)
 1451                 return (EINVAL);
 1452         key.ds_policy = policy;
 1453         key.ds_prefer = prefer;
 1454 
  1455         /* domainset_create() validates the policy. */
 1456         dset = domainset_create(&key);
 1457         if (dset == NULL)
 1458                 return (EINVAL);
 1459         *(struct domainset **)arg1 = dset;
 1460 
 1461         return (error);
 1462 }
 1463 
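/*
 * For example (illustrative, using the policy constants from
 * sys/domainset.h): writing the string "3:1:-1" through the handler above
 * requests domains 0 and 1 (mask 0x3) with DOMAINSET_POLICY_ROUNDROBIN (1)
 * and no preferred domain (-1), while "1:3:0" requests a
 * DOMAINSET_POLICY_PREFER (3) policy preferring domain 0, which
 * domainset_create() accepts because the preferred domain is in the mask.
 */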
 1464 /*
 1465  * Apply an anonymous mask or a domain to a single thread.
 1466  */
 1467 static int
 1468 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
 1469 {
 1470         struct setlist cpusets;
 1471         struct domainlist domainlist;
 1472         struct cpuset *nset;
 1473         struct cpuset *set;
 1474         struct thread *td;
 1475         struct proc *p;
 1476         int error;
 1477 
 1478         cpuset_freelist_init(&cpusets, 1);
 1479         domainset_freelist_init(&domainlist, domain != NULL);
 1480         error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
 1481         if (error)
 1482                 goto out;
 1483         set = NULL;
 1484         thread_lock(td);
 1485         error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
 1486             &cpusets, &domainlist);
 1487         if (error == 0)
 1488                 set = cpuset_update_thread(td, nset);
 1489         thread_unlock(td);
 1490         PROC_UNLOCK(p);
 1491         if (set)
 1492                 cpuset_rel(set);
 1493 out:
 1494         cpuset_freelist_free(&cpusets);
 1495         domainset_freelist_free(&domainlist);
 1496         return (error);
 1497 }
 1498 
 1499 /*
 1500  * Apply an anonymous mask to a single thread.
 1501  */
 1502 int
 1503 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 1504 {
 1505 
 1506         return _cpuset_setthread(id, mask, NULL);
 1507 }
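
/*
 * Illustrative sketch (not in the original file): pin the calling thread
 * to CPU 0 with an anonymous mask, as a kernel thread might do before
 * touching per-CPU state.
 */
static __unused int
example_pin_curthread_cpu0(void)
{
        cpuset_t mask;

        CPU_ZERO(&mask);
        CPU_SET(0, &mask);
        return (cpuset_setthread(curthread->td_tid, &mask));
}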
 1508 
 1509 /*
 1510  * Apply new cpumask to the ithread.
 1511  */
 1512 int
 1513 cpuset_setithread(lwpid_t id, int cpu)
 1514 {
 1515         cpuset_t mask;
 1516 
 1517         CPU_ZERO(&mask);
 1518         if (cpu == NOCPU)
 1519                 CPU_COPY(cpuset_root, &mask);
 1520         else
 1521                 CPU_SET(cpu, &mask);
 1522         return _cpuset_setthread(id, &mask, NULL);
 1523 }
 1524 
 1525 /*
 1526  * Initialize static domainsets after NUMA information is available.  This is
 1527  * called before memory allocators are initialized.
 1528  */
 1529 void
 1530 domainset_init(void)
 1531 {
 1532         struct domainset *dset;
 1533         int i;
 1534 
 1535         dset = &domainset_roundrobin;
 1536         DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1537         dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 1538         dset->ds_prefer = -1;
 1539         _domainset_create(dset, NULL);
 1540 
 1541         for (i = 0; i < vm_ndomains; i++) {
 1542                 dset = &domainset_fixed[i];
 1543                 DOMAINSET_ZERO(&dset->ds_mask);
 1544                 DOMAINSET_SET(i, &dset->ds_mask);
 1545                 dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 1546                 _domainset_create(dset, NULL);
 1547 
 1548                 dset = &domainset_prefer[i];
 1549                 DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1550                 dset->ds_policy = DOMAINSET_POLICY_PREFER;
 1551                 dset->ds_prefer = i;
 1552                 _domainset_create(dset, NULL);
 1553         }
 1554 }
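
/*
 * Illustrative sketch (assumes the DOMAINSET_FIXED() accessor from
 * <sys/domainset.h> and malloc_domainset(9)): with the static policies
 * above in place, a caller can request memory from a single domain.
 */
static __unused void *
example_alloc_from_domain0(size_t size)
{

        return (malloc_domainset(size, M_TEMP, DOMAINSET_FIXED(0), M_WAITOK));
}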
 1555 
 1556 /*
 1557  * Create the domainsets for cpusets 0 and 1, and for cpuset 2.
 1558  */
 1559 void
 1560 domainset_zero(void)
 1561 {
 1562         struct domainset *dset, *tmp;
 1563 
 1564         mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 1565 
 1566         dset = &domainset0;
 1567         DOMAINSET_COPY(&all_domains, &dset->ds_mask);
 1568         dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
 1569         dset->ds_prefer = -1;
 1570         curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 1571 
 1572         domainset_copy(dset, &domainset2);
 1573         domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
 1574         kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
 1575 
 1576         /* Remove empty domains from the global policies. */
 1577         LIST_FOREACH_SAFE(dset, &cpuset_domains, ds_link, tmp)
 1578                 if (domainset_empty_vm(dset))
 1579                         LIST_REMOVE(dset, ds_link);
 1580 }
 1581 
 1582 /*
 1583  * Creates system-wide cpusets and the cpuset for thread0, including three
 1584  * sets:
 1585  * 
 1586  * 0 - The root set which should represent all valid processors in the
 1587  *     system.  This set is immutable.
 1588  * 1 - The default set which all processes are a member of until changed.
 1589  *     This allows an administrator to move all threads off of given cpus to
 1590  *     dedicate them to high-priority tasks, to save power, etc.
 1591  * 2 - The kernel set which allows restriction and policy to be applied only
 1592  *     to kernel threads and the kernel_object.
 1593  */
 1594 struct cpuset *
 1595 cpuset_thread0(void)
 1596 {
 1597         struct cpuset *set;
 1598         int i;
 1599         int error __unused;
 1600 
 1601         cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 1602             NULL, NULL, UMA_ALIGN_CACHE, 0);
 1603         domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
 1604             NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 1605 
 1606         /*
 1607          * Create the root system set (0) for the whole machine.  Doesn't use
 1608          * cpuset_create() due to NULL parent.
 1609          */
 1610         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1611         CPU_COPY(&all_cpus, &set->cs_mask);
 1612         LIST_INIT(&set->cs_children);
 1613         LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 1614         refcount_init(&set->cs_ref, 1);
 1615         set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY;
 1616         set->cs_domain = &domainset0;
 1617         cpuset_zero = set;
 1618         cpuset_root = &set->cs_mask;
 1619 
 1620         /*
 1621          * Now derive a default (1), modifiable set from that to give out.
 1622          * Now derive a modifiable default set (1) from that to give out.
 1623         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1624         error = cpuset_init(set, cpuset_zero, NULL, NULL, 1);
 1625         KASSERT(error == 0, ("Error creating default set: %d\n", error));
 1626         cpuset_default = set;
 1627         /*
 1628          * Create the kernel set (2).
 1629          */
 1630         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 1631         error = cpuset_init(set, cpuset_zero, NULL, NULL, 2);
 1632         KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
 1633         set->cs_domain = &domainset2;
 1634         cpuset_kernel = set;
 1635 
 1636         /*
 1637          * Initialize the unit allocator. 0 and 1 are allocated above.
 1638          * Initialize the unit allocator.  IDs 0, 1, and 2 are allocated above.
 1639         cpuset_unr = new_unrhdr(3, INT_MAX, NULL);
 1640 
 1641         /*
 1642          * If MD code has not initialized per-domain cpusets, place all
 1643          * CPUs in domain 0.
 1644          */
 1645         for (i = 0; i < MAXMEMDOM; i++)
 1646                 if (!CPU_EMPTY(&cpuset_domain[i]))
 1647                         goto domains_set;
 1648         CPU_COPY(&all_cpus, &cpuset_domain[0]);
 1649 domains_set:
 1650 
 1651         return (cpuset_default);
 1652 }
 1653 
 1654 void
 1655 cpuset_kernthread(struct thread *td)
 1656 {
 1657         struct cpuset *set;
 1658 
 1659         thread_lock(td);
 1660         set = td->td_cpuset;
 1661         td->td_cpuset = cpuset_ref(cpuset_kernel);
 1662         thread_unlock(td);
 1663         cpuset_rel(set);
 1664 }
 1665 
 1666 /*
 1667  * Create a cpuset as cpuset_create() would, but mark the new 'set' as a
 1668  * root set.
 1669  *
 1670  * The thread is not reparented to it; use cpuset_setproc_update_set()
 1671  * for that.
 1672  *
 1673  * On success, returns the set in *setp with a reference held.
 1674  */
 1675 int
 1676 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 1677 {
 1678         struct cpuset *set;
 1679         int error;
 1680 
 1681         KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 1682         KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 1683 
 1684         set = NULL;
 1685         error = cpuset_create(&set, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 1686         if (error)
 1687                 return (error);
 1688 
 1689         KASSERT(set != NULL, ("[%s:%d] cpuset_create returned invalid data",
 1690             __func__, __LINE__));
 1691 
 1692         /* Mark the set as root. */
 1693         set->cs_flags |= CPU_SET_ROOT;
 1694         *setp = set;
 1695 
 1696         return (0);
 1697 }
 1698 
 1699 int
 1700 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 1701 {
 1702         int error;
 1703 
 1704         KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 1705         KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 1706 
 1707         cpuset_ref(set);
 1708         error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
 1709         if (error)
 1710                 return (error);
 1711         cpuset_rel(set);
 1712         return (0);
 1713 }
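
/*
 * Illustrative sketch (hypothetical caller, roughly what the jail code
 * does): give a prison its own root set and reparent a process to it
 * using the two helpers above.
 */
static __unused int
example_reroot_proc(struct prison *pr, struct proc *p)
{
        struct cpuset *set;
        int error;

        error = cpuset_create_root(pr, &set);
        if (error != 0)
                return (error);
        error = cpuset_setproc_update_set(p, set);
        cpuset_rel(set);
        return (error);
}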
 1714 
 1715 /*
 1716  * In Capability mode, the only accesses permitted are to the current
 1717  * thread's and process's CPU and domain sets.
 1718  */
 1719 static int
 1720 cpuset_check_capabilities(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1721     id_t id)
 1722 {
 1723         if (IN_CAPABILITY_MODE(td)) {
 1724                 if (level != CPU_LEVEL_WHICH)
 1725                         return (ECAPMODE);
 1726                 if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 1727                         return (ECAPMODE);
 1728                 if (id != -1 &&
 1729                     !(which == CPU_WHICH_TID && id == td->td_tid) &&
 1730                     !(which == CPU_WHICH_PID && id == td->td_proc->p_pid))
 1731                         return (ECAPMODE);
 1732         }
 1733         return (0);
 1734 }
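
/*
 * Userland sketch (illustrative): after cap_enter(2) only the calling
 * thread or process may be named, and only at CPU_LEVEL_WHICH.  An id of
 * -1 means "current", so this call remains permitted:
 *
 *      cpuset_t mask;
 *
 *      cap_enter();
 *      error = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
 *          sizeof(mask), &mask);
 *
 * Naming any other thread, process, or cpuset fails with ECAPMODE.
 */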
 1735 
 1736 #ifndef _SYS_SYSPROTO_H_
 1737 struct cpuset_args {
 1738         cpusetid_t      *setid;
 1739 };
 1740 #endif
 1741 int
 1742 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 1743 {
 1744         struct cpuset *root;
 1745         struct cpuset *set;
 1746         int error;
 1747 
 1748         thread_lock(td);
 1749         root = cpuset_refroot(td->td_cpuset);
 1750         thread_unlock(td);
 1751         set = NULL;
 1752         error = cpuset_create(&set, root, &root->cs_mask);
 1753         cpuset_rel(root);
 1754         if (error)
 1755                 return (error);
 1756         error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 1757         if (error == 0)
 1758                 error = cpuset_setproc(-1, set, NULL, NULL, false);
 1759         cpuset_rel(set);
 1760         return (error);
 1761 }
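
/*
 * Userland sketch (illustrative): cpuset(2) allocates a new set under
 * the caller's root set and migrates the calling process into it:
 *
 *      cpusetid_t id;
 *
 *      if (cpuset(&id) == 0)
 *              printf("now running in cpuset %d\n", (int)id);
 */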
 1762 
 1763 #ifndef _SYS_SYSPROTO_H_
 1764 struct cpuset_setid_args {
 1765         cpuwhich_t      which;
 1766         id_t            id;
 1767         cpusetid_t      setid;
 1768 };
 1769 #endif
 1770 int
 1771 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 1772 {
 1773 
 1774         return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 1775 }
 1776 
 1777 int
 1778 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
 1779     id_t id, cpusetid_t setid)
 1780 {
 1781         struct cpuset *set;
 1782         int error;
 1783 
 1784         /*
 1785          * Presently we only support per-process sets.
 1786          */
 1787         if (which != CPU_WHICH_PID)
 1788                 return (EINVAL);
 1789         set = cpuset_lookup(setid, td);
 1790         if (set == NULL)
 1791                 return (ESRCH);
 1792         error = cpuset_setproc(id, set, NULL, NULL, false);
 1793         cpuset_rel(set);
 1794         return (error);
 1795 }
 1796 
 1797 #ifndef _SYS_SYSPROTO_H_
 1798 struct cpuset_getid_args {
 1799         cpulevel_t      level;
 1800         cpuwhich_t      which;
 1801         id_t            id;
 1802         cpusetid_t      *setid;
 1803 };
 1804 #endif
 1805 int
 1806 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 1807 {
 1808 
 1809         return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 1810             uap->setid));
 1811 }
 1812 
 1813 int
 1814 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1815     id_t id, cpusetid_t *setid)
 1816 {
 1817         struct cpuset *nset;
 1818         struct cpuset *set;
 1819         struct thread *ttd;
 1820         struct proc *p;
 1821         cpusetid_t tmpid;
 1822         int error;
 1823 
 1824         if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 1825                 return (EINVAL);
 1826         error = cpuset_which(which, id, &p, &ttd, &set);
 1827         if (error)
 1828                 return (error);
 1829         switch (which) {
 1830         case CPU_WHICH_TID:
 1831         case CPU_WHICH_PID:
 1832                 thread_lock(ttd);
 1833                 set = cpuset_refbase(ttd->td_cpuset);
 1834                 thread_unlock(ttd);
 1835                 PROC_UNLOCK(p);
 1836                 break;
 1837         case CPU_WHICH_CPUSET:
 1838         case CPU_WHICH_JAIL:
 1839                 break;
 1840         case CPU_WHICH_IRQ:
 1841         case CPU_WHICH_DOMAIN:
 1842                 return (EINVAL);
 1843         }
 1844         switch (level) {
 1845         case CPU_LEVEL_ROOT:
 1846                 nset = cpuset_refroot(set);
 1847                 cpuset_rel(set);
 1848                 set = nset;
 1849                 break;
 1850         case CPU_LEVEL_CPUSET:
 1851                 break;
 1852         case CPU_LEVEL_WHICH:
 1853                 break;
 1854         }
 1855         tmpid = set->cs_id;
 1856         cpuset_rel(set);
 1857         if (error == 0)
 1858                 error = copyout(&tmpid, setid, sizeof(tmpid));
 1859 
 1860         return (error);
 1861 }
 1862 
 1863 #ifndef _SYS_SYSPROTO_H_
 1864 struct cpuset_getaffinity_args {
 1865         cpulevel_t      level;
 1866         cpuwhich_t      which;
 1867         id_t            id;
 1868         size_t          cpusetsize;
 1869         cpuset_t        *mask;
 1870 };
 1871 #endif
 1872 int
 1873 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 1874 {
 1875 
 1876         return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 1877             uap->id, uap->cpusetsize, uap->mask));
 1878 }
 1879 
 1880 int
 1881 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1882     id_t id, size_t cpusetsize, cpuset_t *maskp)
 1883 {
 1884         struct thread *ttd;
 1885         struct cpuset *nset;
 1886         struct cpuset *set;
 1887         struct proc *p;
 1888         cpuset_t *mask;
 1889         int error;
 1890         size_t size;
 1891 
 1892         if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 1893                 return (ERANGE);
 1894         error = cpuset_check_capabilities(td, level, which, id);
 1895         if (error != 0)
 1896                 return (error);
 1897         size = cpusetsize;
 1898         mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 1899         error = cpuset_which(which, id, &p, &ttd, &set);
 1900         if (error)
 1901                 goto out;
 1902         switch (level) {
 1903         case CPU_LEVEL_ROOT:
 1904         case CPU_LEVEL_CPUSET:
 1905                 switch (which) {
 1906                 case CPU_WHICH_TID:
 1907                 case CPU_WHICH_PID:
 1908                         thread_lock(ttd);
 1909                         set = cpuset_ref(ttd->td_cpuset);
 1910                         thread_unlock(ttd);
 1911                         break;
 1912                 case CPU_WHICH_CPUSET:
 1913                 case CPU_WHICH_JAIL:
 1914                         break;
 1915                 case CPU_WHICH_IRQ:
 1916                 case CPU_WHICH_INTRHANDLER:
 1917                 case CPU_WHICH_ITHREAD:
 1918                 case CPU_WHICH_DOMAIN:
 1919                         error = EINVAL;
 1920                         goto out;
 1921                 }
 1922                 if (level == CPU_LEVEL_ROOT)
 1923                         nset = cpuset_refroot(set);
 1924                 else
 1925                         nset = cpuset_refbase(set);
 1926                 CPU_COPY(&nset->cs_mask, mask);
 1927                 cpuset_rel(nset);
 1928                 break;
 1929         case CPU_LEVEL_WHICH:
 1930                 switch (which) {
 1931                 case CPU_WHICH_TID:
 1932                         thread_lock(ttd);
 1933                         CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 1934                         thread_unlock(ttd);
 1935                         break;
 1936                 case CPU_WHICH_PID:
 1937                         FOREACH_THREAD_IN_PROC(p, ttd) {
 1938                                 thread_lock(ttd);
 1939                                 CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 1940                                 thread_unlock(ttd);
 1941                         }
 1942                         break;
 1943                 case CPU_WHICH_CPUSET:
 1944                 case CPU_WHICH_JAIL:
 1945                         CPU_COPY(&set->cs_mask, mask);
 1946                         break;
 1947                 case CPU_WHICH_IRQ:
 1948                 case CPU_WHICH_INTRHANDLER:
 1949                 case CPU_WHICH_ITHREAD:
 1950                         error = intr_getaffinity(id, which, mask);
 1951                         break;
 1952                 case CPU_WHICH_DOMAIN:
 1953                         if (id < 0 || id >= MAXMEMDOM)
 1954                                 error = ESRCH;
 1955                         else
 1956                                 CPU_COPY(&cpuset_domain[id], mask);
 1957                         break;
 1958                 }
 1959                 break;
 1960         default:
 1961                 error = EINVAL;
 1962                 break;
 1963         }
 1964         if (set)
 1965                 cpuset_rel(set);
 1966         if (p)
 1967                 PROC_UNLOCK(p);
 1968         if (error == 0)
 1969                 error = copyout(mask, maskp, size);
 1970 out:
 1971         free(mask, M_TEMP);
 1972         return (error);
 1973 }
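
/*
 * Userland sketch (illustrative): fetch the calling thread's mask.  A
 * cpusetsize smaller than the kernel's cpuset_t is rejected with ERANGE,
 * so pass sizeof(cpuset_t) from <sys/cpuset.h>:
 *
 *      cpuset_t mask;
 *
 *      error = cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *          sizeof(mask), &mask);
 */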
 1974 
 1975 #ifndef _SYS_SYSPROTO_H_
 1976 struct cpuset_setaffinity_args {
 1977         cpulevel_t      level;
 1978         cpuwhich_t      which;
 1979         id_t            id;
 1980         size_t          cpusetsize;
 1981         const cpuset_t  *mask;
 1982 };
 1983 #endif
 1984 int
 1985 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 1986 {
 1987 
 1988         return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 1989             uap->id, uap->cpusetsize, uap->mask));
 1990 }
 1991 
 1992 int
 1993 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 1994     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 1995 {
 1996         struct cpuset *nset;
 1997         struct cpuset *set;
 1998         struct thread *ttd;
 1999         struct proc *p;
 2000         cpuset_t *mask;
 2001         int error;
 2002 
 2003         if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 2004                 return (ERANGE);
 2005         error = cpuset_check_capabilities(td, level, which, id);
 2006         if (error != 0)
 2007                 return (error);
 2008         mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 2009         error = copyin(maskp, mask, cpusetsize);
 2010         if (error)
 2011                 goto out;
 2012         /*
 2013          * Verify that no high bits are set.
 2014          */
 2015         if (cpusetsize > sizeof(cpuset_t)) {
 2016                 char *end;
 2017                 char *cp;
 2018 
 2019                 end = cp = (char *)&mask->__bits;
 2020                 end += cpusetsize;
 2021                 cp += sizeof(cpuset_t);
 2022                 while (cp != end)
 2023                         if (*cp++ != 0) {
 2024                                 error = EINVAL;
 2025                                 goto out;
 2026                         }
 2027         }
 2028         if (CPU_EMPTY(mask)) {
 2029                 error = EDEADLK;
 2030                 goto out;
 2031         }
 2032         switch (level) {
 2033         case CPU_LEVEL_ROOT:
 2034         case CPU_LEVEL_CPUSET:
 2035                 error = cpuset_which(which, id, &p, &ttd, &set);
 2036                 if (error)
 2037                         break;
 2038                 switch (which) {
 2039                 case CPU_WHICH_TID:
 2040                 case CPU_WHICH_PID:
 2041                         thread_lock(ttd);
 2042                         set = cpuset_ref(ttd->td_cpuset);
 2043                         thread_unlock(ttd);
 2044                         PROC_UNLOCK(p);
 2045                         break;
 2046                 case CPU_WHICH_CPUSET:
 2047                 case CPU_WHICH_JAIL:
 2048                         break;
 2049                 case CPU_WHICH_IRQ:
 2050                 case CPU_WHICH_INTRHANDLER:
 2051                 case CPU_WHICH_ITHREAD:
 2052                 case CPU_WHICH_DOMAIN:
 2053                         error = EINVAL;
 2054                         goto out;
 2055                 }
 2056                 if (level == CPU_LEVEL_ROOT)
 2057                         nset = cpuset_refroot(set);
 2058                 else
 2059                         nset = cpuset_refbase(set);
 2060                 error = cpuset_modify(nset, mask);
 2061                 cpuset_rel(nset);
 2062                 cpuset_rel(set);
 2063                 break;
 2064         case CPU_LEVEL_WHICH:
 2065                 switch (which) {
 2066                 case CPU_WHICH_TID:
 2067                         error = cpuset_setthread(id, mask);
 2068                         break;
 2069                 case CPU_WHICH_PID:
 2070                         error = cpuset_setproc(id, NULL, mask, NULL, false);
 2071                         break;
 2072                 case CPU_WHICH_CPUSET:
 2073                 case CPU_WHICH_JAIL:
 2074                         error = cpuset_which(which, id, &p, &ttd, &set);
 2075                         if (error == 0) {
 2076                                 error = cpuset_modify(set, mask);
 2077                                 cpuset_rel(set);
 2078                         }
 2079                         break;
 2080                 case CPU_WHICH_IRQ:
 2081                 case CPU_WHICH_INTRHANDLER:
 2082                 case CPU_WHICH_ITHREAD:
 2083                         error = intr_setaffinity(id, which, mask);
 2084                         break;
 2085                 default:
 2086                         error = EINVAL;
 2087                         break;
 2088                 }
 2089                 break;
 2090         default:
 2091                 error = EINVAL;
 2092                 break;
 2093         }
 2094 out:
 2095         free(mask, M_TEMP);
 2096         return (error);
 2097 }
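
/*
 * Userland sketch (illustrative): restrict the calling process to CPUs 0
 * and 1.  An empty mask is rejected above with EDEADLK:
 *
 *      cpuset_t mask;
 *
 *      CPU_ZERO(&mask);
 *      CPU_SET(0, &mask);
 *      CPU_SET(1, &mask);
 *      error = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
 *          sizeof(mask), &mask);
 */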
 2098 
 2099 #ifndef _SYS_SYSPROTO_H_
 2100 struct cpuset_getdomain_args {
 2101         cpulevel_t      level;
 2102         cpuwhich_t      which;
 2103         id_t            id;
 2104         size_t          domainsetsize;
 2105         domainset_t     *mask;
 2106         int             *policy;
 2107 };
 2108 #endif
 2109 int
 2110 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
 2111 {
 2112 
 2113         return (kern_cpuset_getdomain(td, uap->level, uap->which,
 2114             uap->id, uap->domainsetsize, uap->mask, uap->policy));
 2115 }
 2116 
 2117 int
 2118 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 2119     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
 2120 {
 2121         struct domainset outset;
 2122         struct thread *ttd;
 2123         struct cpuset *nset;
 2124         struct cpuset *set;
 2125         struct domainset *dset;
 2126         struct proc *p;
 2127         domainset_t *mask;
 2128         int error;
 2129 
 2130         if (domainsetsize < sizeof(domainset_t) ||
 2131             domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 2132                 return (ERANGE);
 2133         error = cpuset_check_capabilities(td, level, which, id);
 2134         if (error != 0)
 2135                 return (error);
 2136         mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 2137         bzero(&outset, sizeof(outset));
 2138         error = cpuset_which(which, id, &p, &ttd, &set);
 2139         if (error)
 2140                 goto out;
 2141         switch (level) {
 2142         case CPU_LEVEL_ROOT:
 2143         case CPU_LEVEL_CPUSET:
 2144                 switch (which) {
 2145                 case CPU_WHICH_TID:
 2146                 case CPU_WHICH_PID:
 2147                         thread_lock(ttd);
 2148                         set = cpuset_ref(ttd->td_cpuset);
 2149                         thread_unlock(ttd);
 2150                         break;
 2151                 case CPU_WHICH_CPUSET:
 2152                 case CPU_WHICH_JAIL:
 2153                         break;
 2154                 case CPU_WHICH_IRQ:
 2155                 case CPU_WHICH_INTRHANDLER:
 2156                 case CPU_WHICH_ITHREAD:
 2157                 case CPU_WHICH_DOMAIN:
 2158                         error = EINVAL;
 2159                         goto out;
 2160                 }
 2161                 if (level == CPU_LEVEL_ROOT)
 2162                         nset = cpuset_refroot(set);
 2163                 else
 2164                         nset = cpuset_refbase(set);
 2165                 domainset_copy(nset->cs_domain, &outset);
 2166                 cpuset_rel(nset);
 2167                 break;
 2168         case CPU_LEVEL_WHICH:
 2169                 switch (which) {
 2170                 case CPU_WHICH_TID:
 2171                         thread_lock(ttd);
 2172                         domainset_copy(ttd->td_cpuset->cs_domain, &outset);
 2173                         thread_unlock(ttd);
 2174                         break;
 2175                 case CPU_WHICH_PID:
 2176                         FOREACH_THREAD_IN_PROC(p, ttd) {
 2177                                 thread_lock(ttd);
 2178                                 dset = ttd->td_cpuset->cs_domain;
 2179                                 /* Show all domains in the proc. */
 2180                                 DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
 2181                                 /* Last policy wins. */
 2182                                 outset.ds_policy = dset->ds_policy;
 2183                                 outset.ds_prefer = dset->ds_prefer;
 2184                                 thread_unlock(ttd);
 2185                         }
 2186                         break;
 2187                 case CPU_WHICH_CPUSET:
 2188                 case CPU_WHICH_JAIL:
 2189                         domainset_copy(set->cs_domain, &outset);
 2190                         break;
 2191                 case CPU_WHICH_IRQ:
 2192                 case CPU_WHICH_INTRHANDLER:
 2193                 case CPU_WHICH_ITHREAD:
 2194                 case CPU_WHICH_DOMAIN:
 2195                         error = EINVAL;
 2196                         break;
 2197                 }
 2198                 break;
 2199         default:
 2200                 error = EINVAL;
 2201                 break;
 2202         }
 2203         if (set)
 2204                 cpuset_rel(set);
 2205         if (p)
 2206                 PROC_UNLOCK(p);
 2207         /*
 2208          * Translate prefer into a set containing only the preferred domain,
 2209          * not the entire fallback set.
 2210          */
 2211         if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
 2212                 DOMAINSET_ZERO(&outset.ds_mask);
 2213                 DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
 2214         }
 2215         DOMAINSET_COPY(&outset.ds_mask, mask);
 2216         if (error == 0)
 2217                 error = copyout(mask, maskp, domainsetsize);
 2218         if (error == 0)
 2219                 if (suword32(policyp, outset.ds_policy) != 0)
 2220                         error = EFAULT;
 2221 out:
 2222         free(mask, M_TEMP);
 2223         return (error);
 2224 }
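
/*
 * Userland sketch (illustrative): read back the calling thread's domain
 * mask and policy.  For DOMAINSET_POLICY_PREFER the returned mask holds
 * only the preferred domain, per the translation above:
 *
 *      domainset_t mask;
 *      int policy;
 *
 *      error = cpuset_getdomain(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
 *          sizeof(mask), &mask, &policy);
 */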
 2225 
 2226 #ifndef _SYS_SYSPROTO_H_
 2227 struct cpuset_setdomain_args {
 2228         cpulevel_t      level;
 2229         cpuwhich_t      which;
 2230         id_t            id;
 2231         size_t          domainsetsize;
 2232         domainset_t     *mask;
 2233         int             policy;
 2234 };
 2235 #endif
 2236 int
 2237 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 2238 {
 2239 
 2240         return (kern_cpuset_setdomain(td, uap->level, uap->which,
 2241             uap->id, uap->domainsetsize, uap->mask, uap->policy));
 2242 }
 2243 
 2244 int
 2245 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 2246     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
 2247 {
 2248         struct cpuset *nset;
 2249         struct cpuset *set;
 2250         struct thread *ttd;
 2251         struct proc *p;
 2252         struct domainset domain;
 2253         domainset_t *mask;
 2254         int error;
 2255 
 2256         if (domainsetsize < sizeof(domainset_t) ||
 2257             domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 2258                 return (ERANGE);
 2259         if (policy <= DOMAINSET_POLICY_INVALID ||
 2260             policy > DOMAINSET_POLICY_MAX)
 2261                 return (EINVAL);
 2262         error = cpuset_check_capabilities(td, level, which, id);
 2263         if (error != 0)
 2264                 return (error);
 2265         memset(&domain, 0, sizeof(domain));
 2266         mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 2267         error = copyin(maskp, mask, domainsetsize);
 2268         if (error)
 2269                 goto out;
 2270         /*
 2271          * Verify that no high bits are set.
 2272          */
 2273         if (domainsetsize > sizeof(domainset_t)) {
 2274                 char *end;
 2275                 char *cp;
 2276 
 2277                 end = cp = (char *)&mask->__bits;
 2278                 end += domainsetsize;
 2279                 cp += sizeof(domainset_t);
 2280                 while (cp != end)
 2281                         if (*cp++ != 0) {
 2282                                 error = EINVAL;
 2283                                 goto out;
 2284                         }
 2285         }
 2286         if (DOMAINSET_EMPTY(mask)) {
 2287                 error = EDEADLK;
 2288                 goto out;
 2289         }
 2290         DOMAINSET_COPY(mask, &domain.ds_mask);
 2291         domain.ds_policy = policy;
 2292 
 2293         /*
 2294          * Sanitize the provided mask.
 2295          */
 2296         if (!DOMAINSET_SUBSET(&all_domains, &domain.ds_mask)) {
 2297                 error = EINVAL;
 2298                 goto out;
 2299         }
 2300 
 2301         /* Translate preferred policy into a mask and fallback. */
 2302         if (policy == DOMAINSET_POLICY_PREFER) {
 2303                 /* Only support a single preferred domain. */
 2304                 if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
 2305                         error = EINVAL;
 2306                         goto out;
 2307                 }
 2308                 domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
 2309                 /* This will be constrained by domainset_shadow(). */
 2310                 DOMAINSET_COPY(&all_domains, &domain.ds_mask);
 2311         }
 2312 
 2313         /*
 2314          * When given an impossible policy, fall back to interleaving
 2315          * across all domains.
 2316          */
 2317         if (domainset_empty_vm(&domain))
 2318                 domainset_copy(&domainset2, &domain);
 2319 
 2320         switch (level) {
 2321         case CPU_LEVEL_ROOT:
 2322         case CPU_LEVEL_CPUSET:
 2323                 error = cpuset_which(which, id, &p, &ttd, &set);
 2324                 if (error)
 2325                         break;
 2326                 switch (which) {
 2327                 case CPU_WHICH_TID:
 2328                 case CPU_WHICH_PID:
 2329                         thread_lock(ttd);
 2330                         set = cpuset_ref(ttd->td_cpuset);
 2331                         thread_unlock(ttd);
 2332                         PROC_UNLOCK(p);
 2333                         break;
 2334                 case CPU_WHICH_CPUSET:
 2335                 case CPU_WHICH_JAIL:
 2336                         break;
 2337                 case CPU_WHICH_IRQ:
 2338                 case CPU_WHICH_INTRHANDLER:
 2339                 case CPU_WHICH_ITHREAD:
 2340                 case CPU_WHICH_DOMAIN:
 2341                         error = EINVAL;
 2342                         goto out;
 2343                 }
 2344                 if (level == CPU_LEVEL_ROOT)
 2345                         nset = cpuset_refroot(set);
 2346                 else
 2347                         nset = cpuset_refbase(set);
 2348                 error = cpuset_modify_domain(nset, &domain);
 2349                 cpuset_rel(nset);
 2350                 cpuset_rel(set);
 2351                 break;
 2352         case CPU_LEVEL_WHICH:
 2353                 switch (which) {
 2354                 case CPU_WHICH_TID:
 2355                         error = _cpuset_setthread(id, NULL, &domain);
 2356                         break;
 2357                 case CPU_WHICH_PID:
 2358                         error = cpuset_setproc(id, NULL, NULL, &domain, false);
 2359                         break;
 2360                 case CPU_WHICH_CPUSET:
 2361                 case CPU_WHICH_JAIL:
 2362                         error = cpuset_which(which, id, &p, &ttd, &set);
 2363                         if (error == 0) {
 2364                                 error = cpuset_modify_domain(set, &domain);
 2365                                 cpuset_rel(set);
 2366                         }
 2367                         break;
 2368                 case CPU_WHICH_IRQ:
 2369                 case CPU_WHICH_INTRHANDLER:
 2370                 case CPU_WHICH_ITHREAD:
 2371                 default:
 2372                         error = EINVAL;
 2373                         break;
 2374                 }
 2375                 break;
 2376         default:
 2377                 error = EINVAL;
 2378                 break;
 2379         }
 2380 out:
 2381         free(mask, M_TEMP);
 2382         return (error);
 2383 }
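
/*
 * Userland sketch (illustrative): prefer domain 0 for the calling
 * process.  DOMAINSET_POLICY_PREFER requires exactly one domain in the
 * mask; the kernel records the preference and widens the mask to all
 * domains as the fallback, per the translation above:
 *
 *      domainset_t mask;
 *
 *      DOMAINSET_ZERO(&mask);
 *      DOMAINSET_SET(0, &mask);
 *      error = cpuset_setdomain(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
 *          sizeof(mask), &mask, DOMAINSET_POLICY_PREFER);
 */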
 2384 
 2385 #ifdef DDB
 2386 
 2387 static void
 2388 ddb_display_bitset(const struct bitset *set, int size)
 2389 {
 2390         int bit, once;
 2391 
 2392         for (once = 0, bit = 0; bit < size; bit++) {
 2393                 if (CPU_ISSET(bit, set)) {
 2394                         if (once == 0) {
 2395                                 db_printf("%d", bit);
 2396                                 once = 1;
 2397                         } else  
 2398                                 db_printf(",%d", bit);
 2399                 }
 2400         }
 2401         if (once == 0)
 2402                 db_printf("<none>");
 2403 }
 2404 
 2405 void
 2406 ddb_display_cpuset(const cpuset_t *set)
 2407 {
 2408         ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE);
 2409 }
 2410 
 2411 static void
 2412 ddb_display_domainset(const domainset_t *set)
 2413 {
 2414         ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE);
 2415 }
 2416 
 2417 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 2418 {
 2419         struct cpuset *set;
 2420 
 2421         LIST_FOREACH(set, &cpuset_ids, cs_link) {
 2422                 db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 2423                     set, set->cs_id, refcount_load(&set->cs_ref), set->cs_flags,
 2424                     (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
 2425                 db_printf("  cpu mask=");
 2426                 ddb_display_cpuset(&set->cs_mask);
 2427                 db_printf("\n");
 2428                 db_printf("  domain policy %d prefer %d mask=",
 2429                     set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
 2430                 ddb_display_domainset(&set->cs_domain->ds_mask);
 2431                 db_printf("\n");
 2432                 if (db_pager_quit)
 2433                         break;
 2434         }
 2435 }
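
/*
 * From the DDB prompt the walker above is invoked as "show cpusets";
 * "show domainsets" below prints the global list of unique domainset
 * policies in the same format.
 */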
 2436 
 2437 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
 2438 {
 2439         struct domainset *set;
 2440 
 2441         LIST_FOREACH(set, &cpuset_domains, ds_link) {
 2442                 db_printf("set=%p policy %d prefer %d cnt %d\n",
 2443                     set, set->ds_policy, set->ds_prefer, set->ds_cnt);
 2444                 db_printf("  mask =");
 2445                 ddb_display_domainset(&set->ds_mask);
 2446                 db_printf("\n");
 2447         }
 2448 }
 2449 #endif /* DDB */
