[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_cpuset.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  TRUSTEDBSD-SEDARWIN  -  TRUSTEDBSD-SEDARWIN7 
Ident_Mode: -  plain  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
  3  * All rights reserved.
  4  * 
  5  * Copyright (c) 2008 Nokia Corporation
  6  * All rights reserved.
  7  *
  8  * Redistribution and use in source and binary forms, with or without
  9  * modification, are permitted provided that the following conditions
 10  * are met:
 11  * 1. Redistributions of source code must retain the above copyright
 12  *    notice unmodified, this list of conditions, and the following
 13  *    disclaimer.
 14  * 2. Redistributions in binary form must reproduce the above copyright
 15  *    notice, this list of conditions and the following disclaimer in the
 16  *    documentation and/or other materials provided with the distribution.
 17  *
 18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 23  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 24  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 25  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 27  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28  *
 29  */
 30 
 31 #include <sys/cdefs.h>
 32 __FBSDID("$FreeBSD: src/sys/kern/kern_cpuset.c,v 1.13 2008/07/07 21:32:02 bz Exp $");
 33 
 34 #include "opt_ddb.h"
 35 
 36 #include <sys/param.h>
 37 #include <sys/systm.h>
 38 #include <sys/sysproto.h>
 39 #include <sys/kernel.h>
 40 #include <sys/lock.h>
 41 #include <sys/malloc.h>
 42 #include <sys/mutex.h>
 43 #include <sys/priv.h>
 44 #include <sys/proc.h>
 45 #include <sys/refcount.h>
 46 #include <sys/sched.h>
 47 #include <sys/smp.h>
 48 #include <sys/syscallsubr.h>
 49 #include <sys/cpuset.h>
 50 #include <sys/sx.h>
 51 #include <sys/refcount.h>
 52 #include <sys/queue.h>
 53 #include <sys/limits.h>
 54 #include <sys/bus.h>
 55 #include <sys/interrupt.h>
 56 
 57 #include <vm/uma.h>
 58 
 59 #ifdef DDB
 60 #include <ddb/ddb.h>
 61 #endif /* DDB */
 62 
 63 /*
 64  * cpusets provide a mechanism for creating and manipulating sets of
 65  * processors for the purpose of constraining the scheduling of threads to
 66  * specific processors.
 67  *
 68  * Each process belongs to an identified set, by default this is set 1.  Each
 69  * thread may further restrict the cpus it may run on to a subset of this
 70  * named set.  This creates an anonymous set which other threads and processes
 71  * may not join by number.
 72  *
 73  * The named set is referred to herein as the 'base' set to avoid ambiguity.
 74  * This set is usually a child of a 'root' set while the anonymous set may
 75  * simply be referred to as a mask.  In the syscall api these are referred to
 76  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
 77  *
 78  * Threads inherit their set from their creator whether it be anonymous or
 79  * not.  This means that anonymous sets are immutable because they may be
 80  * shared.  To modify an anonymous set a new set is created with the desired
 81  * mask and the same parent as the existing anonymous set.  This gives the
  82  * illusion of each thread having a private mask.
 83  *
 84  * Via the syscall apis a user may ask to retrieve or modify the root, base,
 85  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
 86  * modifies all numbered and anonymous child sets to comply with the new mask.
 87  * Modifying a pid or tid's mask applies only to that tid but must still
 88  * exist within the assigned parent set.
 89  *
  90  * A thread may not be assigned to a group separate from other threads in
 91  * the process.  This is to remove ambiguity when the setid is queried with
 92  * a pid argument.  There is no other technical limitation.
 93  *
 94  * This somewhat complex arrangement is intended to make it easy for
 95  * applications to query available processors and bind their threads to
 96  * specific processors while also allowing administrators to dynamically
 97  * reprovision by changing sets which apply to groups of processes.
 98  *
 99  * A simple application should not concern itself with sets at all and
100  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
 101  * meaning 'curthread'.  It may query available cpus for that tid with a
102  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
103  */
/* UMA zone from which every struct cpuset in this file is allocated. */
static uma_zone_t cpuset_zone;
/* Spin mutex held around updates to the set lists and set masks. */
static struct mtx cpuset_lock;
/* List of numbered sets, linked through cs_link; see cpuset_lookup(). */
static struct setlist cpuset_ids;
/* Unit-number allocator handing out ids for newly created sets. */
static struct unrhdr *cpuset_unr;
/* The root set created first in cpuset_thread0(). */
static struct cpuset *cpuset_zero;

/* Exported pointer to the root set's mask (cpuset_zero->cs_mask). */
cpuset_t *cpuset_root;
111 
112 /*
113  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
114  */
115 struct cpuset *
116 cpuset_ref(struct cpuset *set)
117 {
118 
119         refcount_acquire(&set->cs_ref);
120         return (set);
121 }
122 
123 /*
124  * Walks up the tree from 'set' to find the root.  Returns the root
125  * referenced.
126  */
127 static struct cpuset *
128 cpuset_refroot(struct cpuset *set)
129 {
130 
131         for (; set->cs_parent != NULL; set = set->cs_parent)
132                 if (set->cs_flags & CPU_SET_ROOT)
133                         break;
134         cpuset_ref(set);
135 
136         return (set);
137 }
138 
139 /*
140  * Find the first non-anonymous set starting from 'set'.  Returns this set
141  * referenced.  May return the passed in set with an extra ref if it is
142  * not anonymous. 
143  */
144 static struct cpuset *
145 cpuset_refbase(struct cpuset *set)
146 {
147 
148         if (set->cs_id == CPUSET_INVALID)
149                 set = set->cs_parent;
150         cpuset_ref(set);
151 
152         return (set);
153 }
154 
155 /*
156  * Release a reference in a context where it is safe to allocte.
157  */
158 void
159 cpuset_rel(struct cpuset *set)
160 {
161         cpusetid_t id;
162 
163         if (refcount_release(&set->cs_ref) == 0)
164                 return;
165         mtx_lock_spin(&cpuset_lock);
166         LIST_REMOVE(set, cs_siblings);
167         id = set->cs_id;
168         if (id != CPUSET_INVALID)
169                 LIST_REMOVE(set, cs_link);
170         mtx_unlock_spin(&cpuset_lock);
171         cpuset_rel(set->cs_parent);
172         uma_zfree(cpuset_zone, set);
173         if (id != CPUSET_INVALID)
174                 free_unr(cpuset_unr, id);
175 }
176 
177 /*
178  * Deferred release must be used when in a context that is not safe to
179  * allocate/free.  This places any unreferenced sets on the list 'head'.
180  */
181 static void
182 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
183 {
184 
185         if (refcount_release(&set->cs_ref) == 0)
186                 return;
187         mtx_lock_spin(&cpuset_lock);
188         LIST_REMOVE(set, cs_siblings);
189         if (set->cs_id != CPUSET_INVALID)
190                 LIST_REMOVE(set, cs_link);
191         LIST_INSERT_HEAD(head, set, cs_link);
192         mtx_unlock_spin(&cpuset_lock);
193 }
194 
195 /*
196  * Complete a deferred release.  Removes the set from the list provided to
197  * cpuset_rel_defer.
198  */
199 static void
200 cpuset_rel_complete(struct cpuset *set)
201 {
202         LIST_REMOVE(set, cs_link);
203         cpuset_rel(set->cs_parent);
204         uma_zfree(cpuset_zone, set);
205 }
206 
207 /*
208  * Find a set based on an id.  Returns it with a ref.
209  */
210 static struct cpuset *
211 cpuset_lookup(cpusetid_t setid)
212 {
213         struct cpuset *set;
214 
215         if (setid == CPUSET_INVALID)
216                 return (NULL);
217         mtx_lock_spin(&cpuset_lock);
218         LIST_FOREACH(set, &cpuset_ids, cs_link)
219                 if (set->cs_id == setid)
220                         break;
221         if (set)
222                 cpuset_ref(set);
223         mtx_unlock_spin(&cpuset_lock);
224         return (set);
225 }
226 
227 /*
228  * Create a set in the space provided in 'set' with the provided parameters.
229  * The set is returned with a single ref.  May return EDEADLK if the set
230  * will have no valid cpu based on restrictions from the parent.
231  */
232 static int
233 _cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
234     cpusetid_t id)
235 {
236 
237         if (!CPU_OVERLAP(&parent->cs_mask, mask))
238                 return (EDEADLK);
239         CPU_COPY(mask, &set->cs_mask);
240         LIST_INIT(&set->cs_children);
241         refcount_init(&set->cs_ref, 1);
242         set->cs_flags = 0;
243         mtx_lock_spin(&cpuset_lock);
244         CPU_AND(mask, &parent->cs_mask);
245         set->cs_id = id;
246         set->cs_parent = cpuset_ref(parent);
247         LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
248         if (set->cs_id != CPUSET_INVALID)
249                 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
250         mtx_unlock_spin(&cpuset_lock);
251 
252         return (0);
253 }
254 
255 /*
256  * Create a new non-anonymous set with the requested parent and mask.  May
257  * return failures if the mask is invalid or a new number can not be
258  * allocated.
259  */
260 static int
261 cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
262 {
263         struct cpuset *set;
264         cpusetid_t id;
265         int error;
266 
267         id = alloc_unr(cpuset_unr);
268         if (id == -1)
269                 return (ENFILE);
270         *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
271         error = _cpuset_create(set, parent, mask, id);
272         if (error == 0)
273                 return (0);
274         free_unr(cpuset_unr, id);
275         uma_zfree(cpuset_zone, set);
276 
277         return (error);
278 }
279 
280 /*
281  * Recursively check for errors that would occur from applying mask to
282  * the tree of sets starting at 'set'.  Checks for sets that would become
283  * empty as well as RDONLY flags.
284  */
285 static int
286 cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
287 {
288         struct cpuset *nset;
289         cpuset_t newmask;
290         int error;
291 
292         mtx_assert(&cpuset_lock, MA_OWNED);
293         if (set->cs_flags & CPU_SET_RDONLY)
294                 return (EPERM);
295         if (!CPU_OVERLAP(&set->cs_mask, mask))
296                 return (EDEADLK);
297         CPU_COPY(&set->cs_mask, &newmask);
298         CPU_AND(&newmask, mask);
299         error = 0;
300         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
301                 if ((error = cpuset_testupdate(nset, &newmask)) != 0)
302                         break;
303         return (error);
304 }
305 
306 /*
307  * Applies the mask 'mask' without checking for empty sets or permissions.
308  */
309 static void
310 cpuset_update(struct cpuset *set, cpuset_t *mask)
311 {
312         struct cpuset *nset;
313 
314         mtx_assert(&cpuset_lock, MA_OWNED);
315         CPU_AND(&set->cs_mask, mask);
316         LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
317                 cpuset_update(nset, &set->cs_mask);
318 
319         return;
320 }
321 
322 /*
323  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
324  * mask to restrict all children in the tree.  Checks for validity before
325  * applying the changes.
326  */
327 static int
328 cpuset_modify(struct cpuset *set, cpuset_t *mask)
329 {
330         struct cpuset *root;
331         int error;
332 
333         error = priv_check(curthread, PRIV_SCHED_CPUSET);
334         if (error)
335                 return (error);
336         /*
337          * Verify that we have access to this set of
338          * cpus.
339          */
340         root = set->cs_parent;
341         if (root && !CPU_SUBSET(&root->cs_mask, mask))
342                 return (EINVAL);
343         mtx_lock_spin(&cpuset_lock);
344         error = cpuset_testupdate(set, mask);
345         if (error)
346                 goto out;
347         cpuset_update(set, mask);
348         CPU_COPY(mask, &set->cs_mask);
349 out:
350         mtx_unlock_spin(&cpuset_lock);
351 
352         return (error);
353 }
354 
355 /*
356  * Resolve the 'which' parameter of several cpuset apis.
357  *
358  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
359  * checks for permission via p_cansched().
360  *
361  * For WHICH_SET returns a valid set with a new reference.
362  *
363  * -1 may be supplied for any argument to mean the current proc/thread or
364  * the base set of the current thread.  May fail with ESRCH/EPERM.
365  */
366 static int
367 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
368     struct cpuset **setp)
369 {
370         struct cpuset *set;
371         struct thread *td;
372         struct proc *p;
373         int error;
374 
375         *pp = p = NULL;
376         *tdp = td = NULL;
377         *setp = set = NULL;
378         switch (which) {
379         case CPU_WHICH_PID:
380                 if (id == -1) {
381                         PROC_LOCK(curproc);
382                         p = curproc;
383                         break;
384                 }
385                 if ((p = pfind(id)) == NULL)
386                         return (ESRCH);
387                 break;
388         case CPU_WHICH_TID:
389                 if (id == -1) {
390                         PROC_LOCK(curproc);
391                         p = curproc;
392                         td = curthread;
393                         break;
394                 }
395                 sx_slock(&allproc_lock);
396                 FOREACH_PROC_IN_SYSTEM(p) {
397                         PROC_LOCK(p);
398                         FOREACH_THREAD_IN_PROC(p, td)
399                                 if (td->td_tid == id)
400                                         break;
401                         if (td != NULL)
402                                 break;
403                         PROC_UNLOCK(p);
404                 }
405                 sx_sunlock(&allproc_lock);
406                 if (td == NULL)
407                         return (ESRCH);
408                 break;
409         case CPU_WHICH_CPUSET:
410                 if (id == -1) {
411                         thread_lock(curthread);
412                         set = cpuset_refbase(curthread->td_cpuset);
413                         thread_unlock(curthread);
414                 } else
415                         set = cpuset_lookup(id);
416                 if (set) {
417                         *setp = set;
418                         return (0);
419                 }
420                 return (ESRCH);
421         case CPU_WHICH_IRQ:
422                 return (0);
423         default:
424                 return (EINVAL);
425         }
426         error = p_cansched(curthread, p);
427         if (error) {
428                 PROC_UNLOCK(p);
429                 return (error);
430         }
431         if (td == NULL)
432                 td = FIRST_THREAD_IN_PROC(p);
433         *pp = p;
434         *tdp = td;
435         return (0);
436 }
437 
438 /*
439  * Create an anonymous set with the provided mask in the space provided by
440  * 'fset'.  If the passed in set is anonymous we use its parent otherwise
441  * the new set is a child of 'set'.
442  */
443 static int
444 cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
445 {
446         struct cpuset *parent;
447 
448         if (set->cs_id == CPUSET_INVALID)
449                 parent = set->cs_parent;
450         else
451                 parent = set;
452         if (!CPU_SUBSET(&parent->cs_mask, mask))
453                 return (EDEADLK);
454         return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
455 }
456 
457 /*
458  * Handle two cases for replacing the base set or mask of an entire process.
459  *
460  * 1) Set is non-null and mask is null.  This reparents all anonymous sets
461  *    to the provided set and replaces all non-anonymous td_cpusets with the
462  *    provided set.
463  * 2) Mask is non-null and set is null.  This replaces or creates anonymous
464  *    sets for every thread with the existing base as a parent.
465  *
466  * This is overly complicated because we can't allocate while holding a 
467  * spinlock and spinlocks must be held while changing and examining thread
468  * state.
469  */
470 static int
471 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
472 {
473         struct setlist freelist;
474         struct setlist droplist;
475         struct cpuset *tdset;
476         struct cpuset *nset;
477         struct thread *td;
478         struct proc *p;
479         int threads;
480         int nfree;
481         int error;
482         /*
483          * The algorithm requires two passes due to locking considerations.
484          * 
485          * 1) Lookup the process and acquire the locks in the required order.
486          * 2) If enough cpusets have not been allocated release the locks and
487          *    allocate them.  Loop.
488          */
489         LIST_INIT(&freelist);
490         LIST_INIT(&droplist);
491         nfree = 0;
492         for (;;) {
493                 error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
494                 if (error)
495                         goto out;
496                 if (nfree >= p->p_numthreads)
497                         break;
498                 threads = p->p_numthreads;
499                 PROC_UNLOCK(p);
500                 for (; nfree < threads; nfree++) {
501                         nset = uma_zalloc(cpuset_zone, M_WAITOK);
502                         LIST_INSERT_HEAD(&freelist, nset, cs_link);
503                 }
504         }
505         PROC_LOCK_ASSERT(p, MA_OWNED);
506         /*
507          * Now that the appropriate locks are held and we have enough cpusets,
508          * make sure the operation will succeed before applying changes.  The
509          * proc lock prevents td_cpuset from changing between calls.
510          */
511         error = 0;
512         FOREACH_THREAD_IN_PROC(p, td) {
513                 thread_lock(td);
514                 tdset = td->td_cpuset;
515                 /*
516                  * Verify that a new mask doesn't specify cpus outside of
517                  * the set the thread is a member of.
518                  */
519                 if (mask) {
520                         if (tdset->cs_id == CPUSET_INVALID)
521                                 tdset = tdset->cs_parent;
522                         if (!CPU_SUBSET(&tdset->cs_mask, mask))
523                                 error = EDEADLK;
524                 /*
525                  * Verify that a new set won't leave an existing thread
526                  * mask without a cpu to run on.  It can, however, restrict
527                  * the set.
528                  */
529                 } else if (tdset->cs_id == CPUSET_INVALID) {
530                         if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
531                                 error = EDEADLK;
532                 }
533                 thread_unlock(td);
534                 if (error)
535                         goto unlock_out;
536         }
537         /*
538          * Replace each thread's cpuset while using deferred release.  We
539          * must do this because the thread lock must be held while operating
540          * on the thread and this limits the type of operations allowed.
541          */
542         FOREACH_THREAD_IN_PROC(p, td) {
543                 thread_lock(td);
544                 /*
545                  * If we presently have an anonymous set or are applying a
546                  * mask we must create an anonymous shadow set.  That is
547                  * either parented to our existing base or the supplied set.
548                  *
549                  * If we have a base set with no anonymous shadow we simply
550                  * replace it outright.
551                  */
552                 tdset = td->td_cpuset;
553                 if (tdset->cs_id == CPUSET_INVALID || mask) {
554                         nset = LIST_FIRST(&freelist);
555                         LIST_REMOVE(nset, cs_link);
556                         if (mask)
557                                 error = cpuset_shadow(tdset, nset, mask);
558                         else
559                                 error = _cpuset_create(nset, set,
560                                     &tdset->cs_mask, CPUSET_INVALID);
561                         if (error) {
562                                 LIST_INSERT_HEAD(&freelist, nset, cs_link);
563                                 thread_unlock(td);
564                                 break;
565                         }
566                 } else
567                         nset = cpuset_ref(set);
568                 cpuset_rel_defer(&droplist, tdset);
569                 td->td_cpuset = nset;
570                 sched_affinity(td);
571                 thread_unlock(td);
572         }
573 unlock_out:
574         PROC_UNLOCK(p);
575 out:
576         while ((nset = LIST_FIRST(&droplist)) != NULL)
577                 cpuset_rel_complete(nset);
578         while ((nset = LIST_FIRST(&freelist)) != NULL) {
579                 LIST_REMOVE(nset, cs_link);
580                 uma_zfree(cpuset_zone, nset);
581         }
582         return (error);
583 }
584 
585 /*
586  * Apply an anonymous mask to a single thread.
587  */
588 int
589 cpuset_setthread(lwpid_t id, cpuset_t *mask)
590 {
591         struct cpuset *nset;
592         struct cpuset *set;
593         struct thread *td;
594         struct proc *p;
595         int error;
596 
597         nset = uma_zalloc(cpuset_zone, M_WAITOK);
598         error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
599         if (error)
600                 goto out;
601         set = NULL;
602         thread_lock(td);
603         error = cpuset_shadow(td->td_cpuset, nset, mask);
604         if (error == 0) {
605                 set = td->td_cpuset;
606                 td->td_cpuset = nset;
607                 sched_affinity(td);
608                 nset = NULL;
609         }
610         thread_unlock(td);
611         PROC_UNLOCK(p);
612         if (set)
613                 cpuset_rel(set);
614 out:
615         if (nset)
616                 uma_zfree(cpuset_zone, nset);
617         return (error);
618 }
619 
620 /*
621  * Creates the cpuset for thread0.  We make two sets:
622  * 
623  * 0 - The root set which should represent all valid processors in the
624  *     system.  It is initially created with a mask of all processors
625  *     because we don't know what processors are valid until cpuset_init()
626  *     runs.  This set is immutable.
627  * 1 - The default set which all processes are a member of until changed.
628  *     This allows an administrator to move all threads off of given cpus to
629  *     dedicate them to high priority tasks or save power etc.
630  */
631 struct cpuset *
632 cpuset_thread0(void)
633 {
634         struct cpuset *set;
635         int error;
636 
637         cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
638             NULL, NULL, UMA_ALIGN_PTR, 0);
639         mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
640         /*
641          * Create the root system set for the whole machine.  Doesn't use
642          * cpuset_create() due to NULL parent.
643          */
644         set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
645         set->cs_mask.__bits[0] = -1;
646         LIST_INIT(&set->cs_children);
647         LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
648         set->cs_ref = 1;
649         set->cs_flags = CPU_SET_ROOT;
650         cpuset_zero = set;
651         cpuset_root = &set->cs_mask;
652         /*
653          * Now derive a default, modifiable set from that to give out.
654          */
655         set = uma_zalloc(cpuset_zone, M_WAITOK);
656         error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
657         KASSERT(error == 0, ("Error creating default set: %d\n", error));
658         /*
659          * Initialize the unit allocator. 0 and 1 are allocated above.
660          */
661         cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
662 
663         return (set);
664 }
665 
666 /*
667  * This is called once the final set of system cpus is known.  Modifies
668  * the root set and all children and mark the root readonly.  
669  */
670 static void
671 cpuset_init(void *arg)
672 {
673         cpuset_t mask;
674 
675         CPU_ZERO(&mask);
676 #ifdef SMP
677         mask.__bits[0] = all_cpus;
678 #else
679         mask.__bits[0] = 1;
680 #endif
681         if (cpuset_modify(cpuset_zero, &mask))
682                 panic("Can't set initial cpuset mask.\n");
683         cpuset_zero->cs_flags |= CPU_SET_RDONLY;
684 }
685 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
686 
687 #ifndef _SYS_SYSPROTO_H_
688 struct cpuset_args {
689         cpusetid_t      *setid;
690 };
691 #endif
692 int
693 cpuset(struct thread *td, struct cpuset_args *uap)
694 {
695         struct cpuset *root;
696         struct cpuset *set;
697         int error;
698 
699         thread_lock(td);
700         root = cpuset_refroot(td->td_cpuset);
701         thread_unlock(td);
702         error = cpuset_create(&set, root, &root->cs_mask);
703         cpuset_rel(root);
704         if (error)
705                 return (error);
706         error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
707         if (error == 0)
708                 error = cpuset_setproc(-1, set, NULL);
709         cpuset_rel(set);
710         return (error);
711 }
712 
713 #ifndef _SYS_SYSPROTO_H_
714 struct cpuset_setid_args {
715         cpuwhich_t      which;
716         id_t            id;
717         cpusetid_t      setid;
718 };
719 #endif
720 int
721 cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
722 {
723         struct cpuset *set;
724         int error;
725 
726         /*
727          * Presently we only support per-process sets.
728          */
729         if (uap->which != CPU_WHICH_PID)
730                 return (EINVAL);
731         set = cpuset_lookup(uap->setid);
732         if (set == NULL)
733                 return (ESRCH);
734         error = cpuset_setproc(uap->id, set, NULL);
735         cpuset_rel(set);
736         return (error);
737 }
738 
739 #ifndef _SYS_SYSPROTO_H_
740 struct cpuset_getid_args {
741         cpulevel_t      level;
742         cpuwhich_t      which;
743         id_t            id;
744         cpusetid_t      *setid;
745 #endif
746 int
747 cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
748 {
749         struct cpuset *nset;
750         struct cpuset *set;
751         struct thread *ttd;
752         struct proc *p;
753         cpusetid_t id;
754         int error;
755 
756         if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
757                 return (EINVAL);
758         error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
759         if (error)
760                 return (error);
761         switch (uap->which) {
762         case CPU_WHICH_TID:
763         case CPU_WHICH_PID:
764                 thread_lock(ttd);
765                 set = cpuset_refbase(ttd->td_cpuset);
766                 thread_unlock(ttd);
767                 PROC_UNLOCK(p);
768                 break;
769         case CPU_WHICH_CPUSET:
770                 break;
771         case CPU_WHICH_IRQ:
772                 return (EINVAL);
773         }
774         switch (uap->level) {
775         case CPU_LEVEL_ROOT:
776                 nset = cpuset_refroot(set);
777                 cpuset_rel(set);
778                 set = nset;
779                 break;
780         case CPU_LEVEL_CPUSET:
781                 break;
782         case CPU_LEVEL_WHICH:
783                 break;
784         }
785         id = set->cs_id;
786         cpuset_rel(set);
787         if (error == 0)
788                 error = copyout(&id, uap->setid, sizeof(id));
789 
790         return (error);
791 }
792 
793 #ifndef _SYS_SYSPROTO_H_
794 struct cpuset_getaffinity_args {
795         cpulevel_t      level;
796         cpuwhich_t      which;
797         id_t            id;
798         size_t          cpusetsize;
799         cpuset_t        *mask;
800 };
801 #endif
802 int
803 cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
804 {
805         struct thread *ttd;
806         struct cpuset *nset;
807         struct cpuset *set;
808         struct proc *p;
809         cpuset_t *mask;
810         int error;
811         size_t size;
812 
813         if (uap->cpusetsize < sizeof(cpuset_t) ||
814             uap->cpusetsize > CPU_MAXSIZE / NBBY)
815                 return (ERANGE);
816         size = uap->cpusetsize;
817         mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
818         error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
819         if (error)
820                 goto out;
821         switch (uap->level) {
822         case CPU_LEVEL_ROOT:
823         case CPU_LEVEL_CPUSET:
824                 switch (uap->which) {
825                 case CPU_WHICH_TID:
826                 case CPU_WHICH_PID:
827                         thread_lock(ttd);
828                         set = cpuset_ref(ttd->td_cpuset);
829                         thread_unlock(ttd);
830                         break;
831                 case CPU_WHICH_CPUSET:
832                         break;
833                 case CPU_WHICH_IRQ:
834                         error = EINVAL;
835                         goto out;
836                 }
837                 if (uap->level == CPU_LEVEL_ROOT)
838                         nset = cpuset_refroot(set);
839                 else
840                         nset = cpuset_refbase(set);
841                 CPU_COPY(&nset->cs_mask, mask);
842                 cpuset_rel(nset);
843                 break;
844         case CPU_LEVEL_WHICH:
845                 switch (uap->which) {
846                 case CPU_WHICH_TID:
847                         thread_lock(ttd);
848                         CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
849                         thread_unlock(ttd);
850                         break;
851                 case CPU_WHICH_PID:
852                         FOREACH_THREAD_IN_PROC(p, ttd) {
853                                 thread_lock(ttd);
854                                 CPU_OR(mask, &ttd->td_cpuset->cs_mask);
855                                 thread_unlock(ttd);
856                         }
857                         break;
858                 case CPU_WHICH_CPUSET:
859                         CPU_COPY(&set->cs_mask, mask);
860                         break;
861                 case CPU_WHICH_IRQ:
862                         error = intr_getaffinity(uap->id, mask);
863                         break;
864                 }
865                 break;
866         default:
867                 error = EINVAL;
868                 break;
869         }
870         if (set)
871                 cpuset_rel(set);
872         if (p)
873                 PROC_UNLOCK(p);
874         if (error == 0)
875                 error = copyout(mask, uap->mask, size);
876 out:
877         free(mask, M_TEMP);
878         return (error);
879 }
880 
881 #ifndef _SYS_SYSPROTO_H_
882 struct cpuset_setaffinity_args {
883         cpulevel_t      level;
884         cpuwhich_t      which;
885         id_t            id;
886         size_t          cpusetsize;
887         const cpuset_t  *mask;
888 };
889 #endif
890 int
891 cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
892 {
893         struct cpuset *nset;
894         struct cpuset *set;
895         struct thread *ttd;
896         struct proc *p;
897         cpuset_t *mask;
898         int error;
899 
900         if (uap->cpusetsize < sizeof(cpuset_t) ||
901             uap->cpusetsize > CPU_MAXSIZE / NBBY)
902                 return (ERANGE);
903         mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
904         error = copyin(uap->mask, mask, uap->cpusetsize);
905         if (error)
906                 goto out;
907         /*
908          * Verify that no high bits are set.
909          */
910         if (uap->cpusetsize > sizeof(cpuset_t)) {
911                 char *end;
912                 char *cp;
913 
914                 end = cp = (char *)&mask->__bits;
915                 end += uap->cpusetsize;
916                 cp += sizeof(cpuset_t);
917                 while (cp != end)
918                         if (*cp++ != 0) {
919                                 error = EINVAL;
920                                 goto out;
921                         }
922 
923         }
924         switch (uap->level) {
925         case CPU_LEVEL_ROOT:
926         case CPU_LEVEL_CPUSET:
927                 error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
928                 if (error)
929                         break;
930                 switch (uap->which) {
931                 case CPU_WHICH_TID:
932                 case CPU_WHICH_PID:
933                         thread_lock(ttd);
934                         set = cpuset_ref(ttd->td_cpuset);
935                         thread_unlock(ttd);
936                         PROC_UNLOCK(p);
937                         break;
938                 case CPU_WHICH_CPUSET:
939                         break;
940                 case CPU_WHICH_IRQ:
941                         error = EINVAL;
942                         goto out;
943                 }
944                 if (uap->level == CPU_LEVEL_ROOT)
945                         nset = cpuset_refroot(set);
946                 else
947                         nset = cpuset_refbase(set);
948                 error = cpuset_modify(nset, mask);
949                 cpuset_rel(nset);
950                 cpuset_rel(set);
951                 break;
952         case CPU_LEVEL_WHICH:
953                 switch (uap->which) {
954                 case CPU_WHICH_TID:
955                         error = cpuset_setthread(uap->id, mask);
956                         break;
957                 case CPU_WHICH_PID:
958                         error = cpuset_setproc(uap->id, NULL, mask);
959                         break;
960                 case CPU_WHICH_CPUSET:
961                         error = cpuset_which(CPU_WHICH_CPUSET, uap->id, &p,
962                             &ttd, &set);
963                         if (error == 0) {
964                                 error = cpuset_modify(set, mask);
965                                 cpuset_rel(set);
966                         }
967                         break;
968                 case CPU_WHICH_IRQ:
969                         error = intr_setaffinity(uap->id, mask);
970                         break;
971                 default:
972                         error = EINVAL;
973                         break;
974                 }
975                 break;
976         default:
977                 error = EINVAL;
978                 break;
979         }
980 out:
981         free(mask, M_TEMP);
982         return (error);
983 }
984 
985 #ifdef DDB
986 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
987 {
988         struct cpuset *set;
989         int cpu, once;
990 
991         LIST_FOREAC