1 /*-
2 * Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
3 * All rights reserved.
4 *
5 * Copyright (c) 2008 Nokia Corporation
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice unmodified, this list of conditions, and the following
13 * disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include "opt_ddb.h"
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/sysproto.h>
39 #include <sys/kernel.h>
40 #include <sys/lock.h>
41 #include <sys/malloc.h>
42 #include <sys/mutex.h>
43 #include <sys/priv.h>
44 #include <sys/proc.h>
45 #include <sys/refcount.h>
46 #include <sys/sched.h>
47 #include <sys/smp.h>
48 #include <sys/syscallsubr.h>
49 #include <sys/cpuset.h>
50 #include <sys/sx.h>
51 #include <sys/refcount.h>
52 #include <sys/queue.h>
53 #include <sys/limits.h>
54 #include <sys/bus.h>
55 #include <sys/interrupt.h>
56 #include <sys/jail.h> /* Must come after sys/proc.h */
57
58 #include <vm/uma.h>
59
60 #ifdef DDB
61 #include <ddb/ddb.h>
62 #endif /* DDB */
63
64 /*
65 * cpusets provide a mechanism for creating and manipulating sets of
66 * processors for the purpose of constraining the scheduling of threads to
67 * specific processors.
68 *
69 * Each process belongs to an identified set, by default this is set 1. Each
70 * thread may further restrict the cpus it may run on to a subset of this
71 * named set. This creates an anonymous set which other threads and processes
72 * may not join by number.
73 *
74 * The named set is referred to herein as the 'base' set to avoid ambiguity.
75 * This set is usually a child of a 'root' set while the anonymous set may
76 * simply be referred to as a mask. In the syscall api these are referred to
77 * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
78 *
79 * Threads inherit their set from their creator whether it be anonymous or
80 * not. This means that anonymous sets are immutable because they may be
81 * shared. To modify an anonymous set a new set is created with the desired
82 * mask and the same parent as the existing anonymous set. This gives the
83 * illusion of each thread having a private mask.
84 *
85 * Via the syscall apis a user may ask to retrieve or modify the root, base,
86 * or mask that is discovered via a pid, tid, or setid. Modifying a set
87 * modifies all numbered and anonymous child sets to comply with the new mask.
88 * Modifying a pid or tid's mask applies only to that tid but must still
89 * exist within the assigned parent set.
90 *
91 * A thread may not be assigned to a group separate from other threads in
92 * the process. This is to remove ambiguity when the setid is queried with
93 * a pid argument. There is no other technical limitation.
94 *
95 * This somewhat complex arrangement is intended to make it easy for
96 * applications to query available processors and bind their threads to
97 * specific processors while also allowing administrators to dynamically
98 * reprovision by changing sets which apply to groups of processes.
99 *
100 * A simple application should not concern itself with sets at all and
101 * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
102 * meaning 'curthread'. It may query available cpus for that tid with a
103 * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
104 */
105 static uma_zone_t cpuset_zone;
106 static struct mtx cpuset_lock;
107 static struct setlist cpuset_ids;
108 static struct unrhdr *cpuset_unr;
109 static struct cpuset *cpuset_zero;
110
111 cpuset_t *cpuset_root;
112
113 /*
114 * Acquire a reference to a cpuset, all pointers must be tracked with refs.
115 */
116 struct cpuset *
117 cpuset_ref(struct cpuset *set)
118 {
119
120 refcount_acquire(&set->cs_ref);
121 return (set);
122 }
123
124 /*
125 * Walks up the tree from 'set' to find the root. Returns the root
126 * referenced.
127 */
128 static struct cpuset *
129 cpuset_refroot(struct cpuset *set)
130 {
131
132 for (; set->cs_parent != NULL; set = set->cs_parent)
133 if (set->cs_flags & CPU_SET_ROOT)
134 break;
135 cpuset_ref(set);
136
137 return (set);
138 }
139
140 /*
141 * Find the first non-anonymous set starting from 'set'. Returns this set
142 * referenced. May return the passed in set with an extra ref if it is
143 * not anonymous.
144 */
145 static struct cpuset *
146 cpuset_refbase(struct cpuset *set)
147 {
148
149 if (set->cs_id == CPUSET_INVALID)
150 set = set->cs_parent;
151 cpuset_ref(set);
152
153 return (set);
154 }
155
156 /*
157 * Release a reference in a context where it is safe to allocte.
158 */
159 void
160 cpuset_rel(struct cpuset *set)
161 {
162 cpusetid_t id;
163
164 if (refcount_release(&set->cs_ref) == 0)
165 return;
166 mtx_lock_spin(&cpuset_lock);
167 LIST_REMOVE(set, cs_siblings);
168 id = set->cs_id;
169 if (id != CPUSET_INVALID)
170 LIST_REMOVE(set, cs_link);
171 mtx_unlock_spin(&cpuset_lock);
172 cpuset_rel(set->cs_parent);
173 uma_zfree(cpuset_zone, set);
174 if (id != CPUSET_INVALID)
175 free_unr(cpuset_unr, id);
176 }
177
178 /*
179 * Deferred release must be used when in a context that is not safe to
180 * allocate/free. This places any unreferenced sets on the list 'head'.
181 */
182 static void
183 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
184 {
185
186 if (refcount_release(&set->cs_ref) == 0)
187 return;
188 mtx_lock_spin(&cpuset_lock);
189 LIST_REMOVE(set, cs_siblings);
190 if (set->cs_id != CPUSET_INVALID)
191 LIST_REMOVE(set, cs_link);
192 LIST_INSERT_HEAD(head, set, cs_link);
193 mtx_unlock_spin(&cpuset_lock);
194 }
195
196 /*
197 * Complete a deferred release. Removes the set from the list provided to
198 * cpuset_rel_defer.
199 */
200 static void
201 cpuset_rel_complete(struct cpuset *set)
202 {
203 LIST_REMOVE(set, cs_link);
204 cpuset_rel(set->cs_parent);
205 uma_zfree(cpuset_zone, set);
206 }
207
208 /*
209 * Find a set based on an id. Returns it with a ref.
210 */
211 static struct cpuset *
212 cpuset_lookup(cpusetid_t setid, struct thread *td)
213 {
214 struct cpuset *set;
215
216 if (setid == CPUSET_INVALID)
217 return (NULL);
218 mtx_lock_spin(&cpuset_lock);
219 LIST_FOREACH(set, &cpuset_ids, cs_link)
220 if (set->cs_id == setid)
221 break;
222 if (set)
223 cpuset_ref(set);
224 mtx_unlock_spin(&cpuset_lock);
225
226 KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
227 if (set != NULL && jailed(td->td_ucred)) {
228 struct cpuset *rset, *jset;
229 struct prison *pr;
230
231 rset = cpuset_refroot(set);
232
233 pr = td->td_ucred->cr_prison;
234 mtx_lock(&pr->pr_mtx);
235 cpuset_ref(pr->pr_cpuset);
236 jset = pr->pr_cpuset;
237 mtx_unlock(&pr->pr_mtx);
238
239 if (jset->cs_id != rset->cs_id) {
240 cpuset_rel(set);
241 set = NULL;
242 }
243 cpuset_rel(jset);
244 cpuset_rel(rset);
245 }
246
247 return (set);
248 }
249
250 /*
251 * Create a set in the space provided in 'set' with the provided parameters.
252 * The set is returned with a single ref. May return EDEADLK if the set
253 * will have no valid cpu based on restrictions from the parent.
254 */
255 static int
256 _cpuset_create(struct cpuset *set, struct cpuset *parent, cpuset_t *mask,
257 cpusetid_t id)
258 {
259
260 if (!CPU_OVERLAP(&parent->cs_mask, mask))
261 return (EDEADLK);
262 CPU_COPY(mask, &set->cs_mask);
263 LIST_INIT(&set->cs_children);
264 refcount_init(&set->cs_ref, 1);
265 set->cs_flags = 0;
266 mtx_lock_spin(&cpuset_lock);
267 CPU_AND(mask, &parent->cs_mask);
268 set->cs_id = id;
269 set->cs_parent = cpuset_ref(parent);
270 LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
271 if (set->cs_id != CPUSET_INVALID)
272 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
273 mtx_unlock_spin(&cpuset_lock);
274
275 return (0);
276 }
277
278 /*
279 * Create a new non-anonymous set with the requested parent and mask. May
280 * return failures if the mask is invalid or a new number can not be
281 * allocated.
282 */
283 static int
284 cpuset_create(struct cpuset **setp, struct cpuset *parent, cpuset_t *mask)
285 {
286 struct cpuset *set;
287 cpusetid_t id;
288 int error;
289
290 id = alloc_unr(cpuset_unr);
291 if (id == -1)
292 return (ENFILE);
293 *setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
294 error = _cpuset_create(set, parent, mask, id);
295 if (error == 0)
296 return (0);
297 free_unr(cpuset_unr, id);
298 uma_zfree(cpuset_zone, set);
299
300 return (error);
301 }
302
303 /*
304 * Recursively check for errors that would occur from applying mask to
305 * the tree of sets starting at 'set'. Checks for sets that would become
306 * empty as well as RDONLY flags.
307 */
308 static int
309 cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
310 {
311 struct cpuset *nset;
312 cpuset_t newmask;
313 int error;
314
315 mtx_assert(&cpuset_lock, MA_OWNED);
316 if (set->cs_flags & CPU_SET_RDONLY)
317 return (EPERM);
318 if (!CPU_OVERLAP(&set->cs_mask, mask))
319 return (EDEADLK);
320 CPU_COPY(&set->cs_mask, &newmask);
321 CPU_AND(&newmask, mask);
322 error = 0;
323 LIST_FOREACH(nset, &set->cs_children, cs_siblings)
324 if ((error = cpuset_testupdate(nset, &newmask)) != 0)
325 break;
326 return (error);
327 }
328
329 /*
330 * Applies the mask 'mask' without checking for empty sets or permissions.
331 */
332 static void
333 cpuset_update(struct cpuset *set, cpuset_t *mask)
334 {
335 struct cpuset *nset;
336
337 mtx_assert(&cpuset_lock, MA_OWNED);
338 CPU_AND(&set->cs_mask, mask);
339 LIST_FOREACH(nset, &set->cs_children, cs_siblings)
340 cpuset_update(nset, &set->cs_mask);
341
342 return;
343 }
344
345 /*
346 * Modify the set 'set' to use a copy of the mask provided. Apply this new
347 * mask to restrict all children in the tree. Checks for validity before
348 * applying the changes.
349 */
350 static int
351 cpuset_modify(struct cpuset *set, cpuset_t *mask)
352 {
353 struct cpuset *root;
354 int error;
355
356 error = priv_check(curthread, PRIV_SCHED_CPUSET);
357 if (error)
358 return (error);
359 /*
360 * Verify that we have access to this set of
361 * cpus.
362 */
363 root = set->cs_parent;
364 if (root && !CPU_SUBSET(&root->cs_mask, mask))
365 return (EINVAL);
366 mtx_lock_spin(&cpuset_lock);
367 error = cpuset_testupdate(set, mask);
368 if (error)
369 goto out;
370 cpuset_update(set, mask);
371 CPU_COPY(mask, &set->cs_mask);
372 out:
373 mtx_unlock_spin(&cpuset_lock);
374
375 return (error);
376 }
377
378 /*
379 * Resolve the 'which' parameter of several cpuset apis.
380 *
381 * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
382 * checks for permission via p_cansched().
383 *
384 * For WHICH_SET returns a valid set with a new reference.
385 *
386 * -1 may be supplied for any argument to mean the current proc/thread or
387 * the base set of the current thread. May fail with ESRCH/EPERM.
388 */
389 static int
390 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
391 struct cpuset **setp)
392 {
393 struct cpuset *set;
394 struct thread *td;
395 struct proc *p;
396 int error;
397
398 *pp = p = NULL;
399 *tdp = td = NULL;
400 *setp = set = NULL;
401 switch (which) {
402 case CPU_WHICH_PID:
403 if (id == -1) {
404 PROC_LOCK(curproc);
405 p = curproc;
406 break;
407 }
408 if ((p = pfind(id)) == NULL)
409 return (ESRCH);
410 break;
411 case CPU_WHICH_TID:
412 if (id == -1) {
413 PROC_LOCK(curproc);
414 p = curproc;
415 td = curthread;
416 break;
417 }
418 sx_slock(&allproc_lock);
419 FOREACH_PROC_IN_SYSTEM(p) {
420 PROC_LOCK(p);
421 PROC_SLOCK(p);
422 FOREACH_THREAD_IN_PROC(p, td)
423 if (td->td_tid == id)
424 break;
425 PROC_SUNLOCK(p);
426 if (td != NULL)
427 break;
428 PROC_UNLOCK(p);
429 }
430 sx_sunlock(&allproc_lock);
431 if (td == NULL)
432 return (ESRCH);
433 break;
434 case CPU_WHICH_CPUSET:
435 if (id == -1) {
436 thread_lock(curthread);
437 set = cpuset_refbase(curthread->td_cpuset);
438 thread_unlock(curthread);
439 } else
440 set = cpuset_lookup(id, curthread);
441 if (set) {
442 *setp = set;
443 return (0);
444 }
445 return (ESRCH);
446 case CPU_WHICH_JAIL:
447 {
448 /* Find `set' for prison with given id. */
449 struct prison *pr;
450
451 sx_slock(&allprison_lock);
452 pr = prison_find(id);
453 sx_sunlock(&allprison_lock);
454 if (pr == NULL)
455 return (ESRCH);
456 if (jailed(curthread->td_ucred)) {
457 if (curthread->td_ucred->cr_prison == pr) {
458 cpuset_ref(pr->pr_cpuset);
459 set = pr->pr_cpuset;
460 }
461 } else {
462 cpuset_ref(pr->pr_cpuset);
463 set = pr->pr_cpuset;
464 }
465 mtx_unlock(&pr->pr_mtx);
466 if (set) {
467 *setp = set;
468 return (0);
469 }
470 return (ESRCH);
471 }
472 default:
473 return (EINVAL);
474 }
475 error = p_cansched(curthread, p);
476 if (error) {
477 PROC_UNLOCK(p);
478 return (error);
479 }
480 if (td == NULL)
481 td = FIRST_THREAD_IN_PROC(p);
482 *pp = p;
483 *tdp = td;
484 return (0);
485 }
486
487 /*
488 * Create an anonymous set with the provided mask in the space provided by
489 * 'fset'. If the passed in set is anonymous we use its parent otherwise
490 * the new set is a child of 'set'.
491 */
492 static int
493 cpuset_shadow(struct cpuset *set, struct cpuset *fset, cpuset_t *mask)
494 {
495 struct cpuset *parent;
496
497 if (set->cs_id == CPUSET_INVALID)
498 parent = set->cs_parent;
499 else
500 parent = set;
501 if (!CPU_SUBSET(&parent->cs_mask, mask))
502 return (EDEADLK);
503 return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
504 }
505
506 /*
507 * Handle two cases for replacing the base set or mask of an entire process.
508 *
509 * 1) Set is non-null and mask is null. This reparents all anonymous sets
510 * to the provided set and replaces all non-anonymous td_cpusets with the
511 * provided set.
512 * 2) Mask is non-null and set is null. This replaces or creates anonymous
513 * sets for every thread with the existing base as a parent.
514 *
515 * This is overly complicated because we can't allocate while holding a
516 * spinlock and spinlocks must be held while changing and examining thread
517 * state.
518 */
519 static int
520 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
521 {
522 struct setlist freelist;
523 struct setlist droplist;
524 struct cpuset *tdset;
525 struct cpuset *nset;
526 struct thread *td;
527 struct proc *p;
528 int threads;
529 int nfree;
530 int error;
531 /*
532 * The algorithm requires two passes due to locking considerations.
533 *
534 * 1) Lookup the process and acquire the locks in the required order.
535 * 2) If enough cpusets have not been allocated release the locks and
536 * allocate them. Loop.
537 */
538 LIST_INIT(&freelist);
539 LIST_INIT(&droplist);
540 nfree = 0;
541 for (;;) {
542 error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
543 if (error)
544 goto out;
545 PROC_SLOCK(p);
546 if (nfree >= p->p_numthreads)
547 break;
548 threads = p->p_numthreads;
549 PROC_SUNLOCK(p);
550 PROC_UNLOCK(p);
551 for (; nfree < threads; nfree++) {
552 nset = uma_zalloc(cpuset_zone, M_WAITOK);
553 LIST_INSERT_HEAD(&freelist, nset, cs_link);
554 }
555 }
556 PROC_LOCK_ASSERT(p, MA_OWNED);
557 PROC_SLOCK_ASSERT(p, MA_OWNED);
558 /*
559 * Now that the appropriate locks are held and we have enough cpusets,
560 * make sure the operation will succeed before applying changes. The
561 * proc lock prevents td_cpuset from changing between calls.
562 */
563 error = 0;
564 FOREACH_THREAD_IN_PROC(p, td) {
565 thread_lock(td);
566 tdset = td->td_cpuset;
567 /*
568 * Verify that a new mask doesn't specify cpus outside of
569 * the set the thread is a member of.
570 */
571 if (mask) {
572 if (tdset->cs_id == CPUSET_INVALID)
573 tdset = tdset->cs_parent;
574 if (!CPU_SUBSET(&tdset->cs_mask, mask))
575 error = EDEADLK;
576 /*
577 * Verify that a new set won't leave an existing thread
578 * mask without a cpu to run on. It can, however, restrict
579 * the set.
580 */
581 } else if (tdset->cs_id == CPUSET_INVALID) {
582 if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
583 error = EDEADLK;
584 }
585 thread_unlock(td);
586 if (error)
587 goto unlock_out;
588 }
589 /*
590 * Replace each thread's cpuset while using deferred release. We
591 * must do this because the PROC_SLOCK has to be held while traversing
592 * the thread list and this limits the type of operations allowed.
593 */
594 FOREACH_THREAD_IN_PROC(p, td) {
595 thread_lock(td);
596 /*
597 * If we presently have an anonymous set or are applying a
598 * mask we must create an anonymous shadow set. That is
599 * either parented to our existing base or the supplied set.
600 *
601 * If we have a base set with no anonymous shadow we simply
602 * replace it outright.
603 */
604 tdset = td->td_cpuset;
605 if (tdset->cs_id == CPUSET_INVALID || mask) {
606 nset = LIST_FIRST(&freelist);
607 LIST_REMOVE(nset, cs_link);
608 if (mask)
609 error = cpuset_shadow(tdset, nset, mask);
610 else
611 error = _cpuset_create(nset, set,
612 &tdset->cs_mask, CPUSET_INVALID);
613 if (error) {
614 LIST_INSERT_HEAD(&freelist, nset, cs_link);
615 thread_unlock(td);
616 break;
617 }
618 } else
619 nset = cpuset_ref(set);
620 cpuset_rel_defer(&droplist, tdset);
621 td->td_cpuset = nset;
622 sched_affinity(td);
623 thread_unlock(td);
624 }
625 unlock_out:
626 PROC_SUNLOCK(p);
627 PROC_UNLOCK(p);
628 out:
629 while ((nset = LIST_FIRST(&droplist)) != NULL)
630 cpuset_rel_complete(nset);
631 while ((nset = LIST_FIRST(&freelist)) != NULL) {
632 LIST_REMOVE(nset, cs_link);
633 uma_zfree(cpuset_zone, nset);
634 }
635 return (error);
636 }
637
638 /*
639 * Apply an anonymous mask to a single thread.
640 */
641 int
642 cpuset_setthread(lwpid_t id, cpuset_t *mask)
643 {
644 struct cpuset *nset;
645 struct cpuset *set;
646 struct thread *td;
647 struct proc *p;
648 int error;
649
650 nset = uma_zalloc(cpuset_zone, M_WAITOK);
651 error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
652 if (error)
653 goto out;
654 set = NULL;
655 thread_lock(td);
656 error = cpuset_shadow(td->td_cpuset, nset, mask);
657 if (error == 0) {
658 set = td->td_cpuset;
659 td->td_cpuset = nset;
660 sched_affinity(td);
661 nset = NULL;
662 }
663 thread_unlock(td);
664 PROC_UNLOCK(p);
665 if (set)
666 cpuset_rel(set);
667 out:
668 if (nset)
669 uma_zfree(cpuset_zone, nset);
670 return (error);
671 }
672
673 /*
674 * Creates the cpuset for thread0. We make two sets:
675 *
676 * 0 - The root set which should represent all valid processors in the
677 * system. It is initially created with a mask of all processors
678 * because we don't know what processors are valid until cpuset_init()
679 * runs. This set is immutable.
680 * 1 - The default set which all processes are a member of until changed.
681 * This allows an administrator to move all threads off of given cpus to
682 * dedicate them to high priority tasks or save power etc.
683 */
684 struct cpuset *
685 cpuset_thread0(void)
686 {
687 struct cpuset *set;
688 int error;
689
690 cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
691 NULL, NULL, UMA_ALIGN_PTR, 0);
692 mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
693 /*
694 * Create the root system set for the whole machine. Doesn't use
695 * cpuset_create() due to NULL parent.
696 */
697 set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
698 set->cs_mask.__bits[0] = -1;
699 LIST_INIT(&set->cs_children);
700 LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
701 set->cs_ref = 1;
702 set->cs_flags = CPU_SET_ROOT;
703 cpuset_zero = set;
704 cpuset_root = &set->cs_mask;
705 /*
706 * Now derive a default, modifiable set from that to give out.
707 */
708 set = uma_zalloc(cpuset_zone, M_WAITOK);
709 error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
710 KASSERT(error == 0, ("Error creating default set: %d\n", error));
711 /*
712 * Initialize the unit allocator. 0 and 1 are allocated above.
713 */
714 cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
715
716 return (set);
717 }
718
719 /*
720 * Create a cpuset, which would be cpuset_create() but
721 * mark the new 'set' as root.
722 *
723 * We are not going to reparent the td to it. Use cpuset_reparentproc() for that.
724 *
725 * In case of no error, returns the set in *setp locked with a reference.
726 */
727 int
728 cpuset_create_root(struct thread *td, struct cpuset **setp)
729 {
730 struct cpuset *root;
731 struct cpuset *set;
732 int error;
733
734 KASSERT(td != NULL, ("[%s:%d] invalid td", __func__, __LINE__));
735 KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
736
737 thread_lock(td);
738 root = cpuset_refroot(td->td_cpuset);
739 thread_unlock(td);
740
741 error = cpuset_create(setp, td->td_cpuset, &root->cs_mask);
742 cpuset_rel(root);
743 if (error)
744 return (error);
745
746 KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
747 __func__, __LINE__));
748
749 /* Mark the set as root. */
750 set = *setp;
751 set->cs_flags |= CPU_SET_ROOT;
752
753 return (0);
754 }
755
756 int
757 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
758 {
759 int error;
760
761 KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
762 KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
763
764 cpuset_ref(set);
765 error = cpuset_setproc(p->p_pid, set, NULL);
766 if (error)
767 return (error);
768 cpuset_rel(set);
769 return (0);
770 }
771
772 /*
773 * This is called once the final set of system cpus is known. Modifies
774 * the root set and all children and mark the root readonly.
775 */
776 static void
777 cpuset_init(void *arg)
778 {
779 cpuset_t mask;
780
781 CPU_ZERO(&mask);
782 #ifdef SMP
783 mask.__bits[0] = all_cpus;
784 #else
785 mask.__bits[0] = 1;
786 #endif
787 if (cpuset_modify(cpuset_zero, &mask))
788 panic("Can't set initial cpuset mask.\n");
789 cpuset_zero->cs_flags |= CPU_SET_RDONLY;
790 }
791 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
792
793 #ifndef _SYS_SYSPROTO_H_
794 struct cpuset_args {
795 cpusetid_t *setid;
796 };
797 #endif
798 int
799 cpuset(struct thread *td, struct cpuset_args *uap)
800 {
801 struct cpuset *root;
802 struct cpuset *set;
803 int error;
804
805 thread_lock(td);
806 root = cpuset_refroot(td->td_cpuset);
807 thread_unlock(td);
808 error = cpuset_create(&set, root, &root->cs_mask);
809 cpuset_rel(root);
810 if (error)
811 return (error);
812 error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
813 if (error == 0)
814 error = cpuset_setproc(-1, set, NULL);
815 cpuset_rel(set);
816 return (error);
817 }
818
819 #ifndef _SYS_SYSPROTO_H_
820 struct cpuset_setid_args {
821 cpuwhich_t which;
822 id_t id;
823 cpusetid_t setid;
824 };
825 #endif
826 int
827 cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
828 {
829 struct cpuset *set;
830 int error;
831
832 /*
833 * Presently we only support per-process sets.
834 */
835 if (uap->which != CPU_WHICH_PID)
836 return (EINVAL);
837 set = cpuset_lookup(uap->setid, td);
838 if (set == NULL)
839 return (ESRCH);
840 error = cpuset_setproc(uap->id, set, NULL);
841 cpuset_rel(set);
842 return (error);
843 }
844
845 #ifndef _SYS_SYSPROTO_H_
846 struct cpuset_getid_args {
847 cpulevel_t level;
848 cpuwhich_t which;
849 id_t id;
850 cpusetid_t *setid;
851 #endif
852 int
853 cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
854 {
855 struct cpuset *nset;
856 struct cpuset *set;
857 struct thread *ttd;
858 struct proc *p;
859 cpusetid_t id;
860 int error;
861
862 if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
863 return (EINVAL);
864 error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
865 if (error)
866 return (error);
867 switch (uap->which) {
868 case CPU_WHICH_TID:
869 case CPU_WHICH_PID:
870 thread_lock(ttd);
871 set = cpuset_refbase(ttd->td_cpuset);
872 thread_unlock(ttd);
873 PROC_UNLOCK(p);
874 break;
875 case CPU_WHICH_CPUSET:
876 case CPU_WHICH_JAIL:
877 break;
878 }
879 switch (uap->level) {
880 case CPU_LEVEL_ROOT:
881 nset = cpuset_refroot(set);
882 cpuset_rel(set);
883 set = nset;
884 break;
885 case CPU_LEVEL_CPUSET:
886 break;
887 case CPU_LEVEL_WHICH:
888 break;
889 }
890 id = set->cs_id;
891 cpuset_rel(set);
892 if (error == 0)
893 error = copyout(&id, uap->setid, sizeof(id));
894
895 return (error);
896 }
897
898 #ifndef _SYS_SYSPROTO_H_
899 struct cpuset_getaffinity_args {
900 cpulevel_t level;
901 cpuwhich_t which;
902 id_t id;
903 size_t cpusetsize;
904 cpuset_t *mask;
905 };
906 #endif
907 int
908 cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
909 {
910 struct thread *ttd;
911 struct cpuset *nset;
912 struct cpuset *set;
913 struct proc *p;
914 cpuset_t *mask;
915 int error;
916 size_t size;
917
918 if (uap->cpusetsize < sizeof(cpuset_t) ||
919 uap->cpusetsize > CPU_MAXSIZE / NBBY)
920 return (ERANGE);
921 size = uap->cpusetsize;
922 mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
923 error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
924 if (error)
925 goto out;
926 switch (uap->level) {
927 case CPU_LEVEL_ROOT:
928 case CPU_LEVEL_CPUSET:
929 switch (uap->which) {
930 case CPU_WHICH_TID:
931 case CPU_WHICH_PID:
932 thread_lock(ttd);
933 set = cpuset_ref(ttd->td_cpuset);
934 thread_unlock(ttd);
935 break;
936 case CPU_WHICH_CPUSET:
937 case CPU_WHICH_JAIL:
938 break;
939 }
940 if (uap->level == CPU_LEVEL_ROOT)
941 nset = cpuset_refroot(set);
942 else
943 nset = cpuset_refbase(set);
944 CPU_COPY(&nset->cs_mask, mask);
945 cpuset_rel(nset);
946 break;
947 case CPU_LEVEL_WHICH:
948 switch (uap->which) {
949 case CPU_WHICH_TID:
950 thread_lock(ttd);
951 CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
952 thread_unlock(ttd);
953 break;
954 case CPU_WHICH_PID:
955 PROC_SLOCK(p);
956 FOREACH_THREAD_IN_PROC(p, ttd) {
957 thread_lock(ttd);
958 CPU_OR(mask, &ttd->td_cpuset->cs_mask);
959 thread_unlock(ttd);
960 }
961 PROC_SUNLOCK(p);
962 break;
963 case CPU_WHICH_CPUSET:
964 case CPU_WHICH_JAIL:
965 CPU_COPY(&set->cs_mask, mask);
966 break;
967 }
968 break;
969 default:
970 error = EINVAL;
971 break;
972 }
973 if (set)
974 cpuset_rel(set);
975 if (p)
976 PROC_UNLOCK(p);
977 if (error == 0)
978 error = copyout(mask, uap->mask, size);
979 out:
980 free(mask, M_TEMP);
981 return (error);
982 }
983
984 #ifndef _SYS_SYSPROTO_H_
985 struct cpuset_setaffinity_args {
986 cpulevel_t level;
987 cpuwhich_t which;
988 id_t id;
989 size_t cpusetsize;
990 const cpuset_t *mask;
991 };
992 #endif
993 int
994 cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
995 {
996 struct cpuset *nset;
997 struct cpuset *set;
998 struct thread *ttd;
999 struct proc *p;
1000 cpuset_t *mask;
1001 int error;
1002
1003 if (uap->cpusetsize < sizeof(cpuset_t) ||
1004 uap->cpusetsize > CPU_MAXSIZE / NBBY)
1005 return (ERANGE);
1006 mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
1007 error = copyin(uap->mask, mask, uap->cpusetsize);
1008 if (error)
1009 goto out;
1010 /*
1011 * Verify that no high bits are set.
1012 */
1013 if (uap->cpusetsize > sizeof(cpuset_t)) {
1014 char *end;
1015 char *cp;
1016
1017 end = cp = (char *)&mask->__bits;
1018 end += uap->cpusetsize;
1019 cp += sizeof(cpuset_t);
1020 while (cp != end)
1021 if (*cp++ != 0) {
1022 error = EINVAL;
1023 goto out;
1024 }
1025
1026 }
1027 switch (uap->level) {
1028 case CPU_LEVEL_ROOT:
1029 case CPU_LEVEL_CPUSET:
1030 error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
1031 if (error)
1032 break;
1033 switch (uap->which) {
1034 case CPU_WHICH_TID:
1035 case CPU_WHICH_PID:
1036 thread_lock(ttd);
1037 set = cpuset_ref(ttd->td_cpuset);
1038 thread_unlock(ttd);
1039 PROC_UNLOCK(p);
1040 break;
1041 case CPU_WHICH_CPUSET:
1042 case CPU_WHICH_JAIL:
1043 break;
1044 }
1045 if (uap->level == CPU_LEVEL_ROOT)
1046 nset = cpuset_refroot(set);
1047 else
1048 nset = cpuset_refbase(set);
1049 error = cpuset_modify(nset, mask);
1050 cpuset_rel(nset);
1051 cpuset_rel(set);
1052 break;
1053 case CPU_LEVEL_WHICH:
1054 switch (uap->which) {
1055 case CPU_WHICH_TID:
1056 error = cpuset_setthread(uap->id, mask);
1057 break;
1058 case CPU_WHICH_PID:
1059 error = cpuset_setproc(uap->id, NULL, mask);
1060 break;
1061 case CPU_WHICH_CPUSET:
1062 case CPU_WHICH_JAIL:
1063 error = cpuset_which(uap->which, uap->id, &p,
1064 &ttd, &set);
1065 if (error == 0) {
1066 error = cpuset_modify(set, mask);
1067 cpuset_rel(set);
1068 }
1069 break;
1070 default:
1071 error = EINVAL;
1072 break;
1073 }
1074 break;
1075 default:
1076 error = EINVAL;
1077 break;
1078 }
1079 out:
1080 free(mask, M_TEMP);
1081 return (error);
1082 }
1083
#ifdef DDB
/*
 * "show cpusets" debugger command: print one header line and one
 * comma-separated cpu list for every set on cpuset_ids, stopping early
 * when the pager is quit.
 */
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
	struct cpuset *set;
	int cpu, printed;

	LIST_FOREACH(set, &cpuset_ids, cs_link) {
		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
		    set, set->cs_id, set->cs_ref, set->cs_flags,
		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
		db_printf(" mask=");
		printed = 0;
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
			if (!CPU_ISSET(cpu, &set->cs_mask))
				continue;
			/* Only entries after the first get a leading comma. */
			if (printed)
				db_printf(",%d", cpu);
			else
				db_printf("%d", cpu);
			printed = 1;
		}
		db_printf("\n");
		if (db_pager_quit)
			break;
	}
}
#endif /* DDB */
Cache object: a777116fc9a4b49e0d55ae7c489f51d7
|