FreeBSD/Linux Kernel Cross Reference
sys/port/sysproc.c
1 #include "u.h"
2 #include "tos.h"
3 #include "../port/lib.h"
4 #include "mem.h"
5 #include "dat.h"
6 #include "fns.h"
7 #include "../port/error.h"
8 #include "edf.h"
9
10 #include <a.out.h>
11
/* Splits a "#!" interpreter line into words; defined later in this file. */
int	shargs(char *s, int n, char **ap);

/* Page checking routines implemented outside this file. */
extern void	checkpages(void);
extern void	checkpagerefs(void);
16
17 long
18 sysr1(ulong*)
19 {
20 checkpagerefs();
21 return 0;
22 }
23
24 long
25 sysrfork(ulong *arg)
26 {
27 Proc *p;
28 int n, i;
29 Fgrp *ofg;
30 Pgrp *opg;
31 Rgrp *org;
32 Egrp *oeg;
33 ulong pid, flag;
34 Mach *wm;
35
36 flag = arg[0];
37 /* Check flags before we commit */
38 if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
39 error(Ebadarg);
40 if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
41 error(Ebadarg);
42 if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
43 error(Ebadarg);
44
45 if((flag&RFPROC) == 0) {
46 if(flag & (RFMEM|RFNOWAIT))
47 error(Ebadarg);
48 if(flag & (RFFDG|RFCFDG)) {
49 ofg = up->fgrp;
50 if(flag & RFFDG)
51 up->fgrp = dupfgrp(ofg);
52 else
53 up->fgrp = dupfgrp(nil);
54 closefgrp(ofg);
55 }
56 if(flag & (RFNAMEG|RFCNAMEG)) {
57 opg = up->pgrp;
58 up->pgrp = newpgrp();
59 if(flag & RFNAMEG)
60 pgrpcpy(up->pgrp, opg);
61 /* inherit noattach */
62 up->pgrp->noattach = opg->noattach;
63 closepgrp(opg);
64 }
65 if(flag & RFNOMNT)
66 up->pgrp->noattach = 1;
67 if(flag & RFREND) {
68 org = up->rgrp;
69 up->rgrp = newrgrp();
70 closergrp(org);
71 }
72 if(flag & (RFENVG|RFCENVG)) {
73 oeg = up->egrp;
74 up->egrp = smalloc(sizeof(Egrp));
75 up->egrp->ref = 1;
76 if(flag & RFENVG)
77 envcpy(up->egrp, oeg);
78 closeegrp(oeg);
79 }
80 if(flag & RFNOTEG)
81 up->noteid = incref(¬eidalloc);
82 return 0;
83 }
84
85 p = newproc();
86
87 p->fpsave = up->fpsave;
88 p->scallnr = up->scallnr;
89 p->s = up->s;
90 p->nerrlab = 0;
91 p->slash = up->slash;
92 p->dot = up->dot;
93 incref(p->dot);
94
95 memmove(p->note, up->note, sizeof(p->note));
96 p->privatemem = up->privatemem;
97 p->noswap = up->noswap;
98 p->nnote = up->nnote;
99 p->notified = 0;
100 p->lastnote = up->lastnote;
101 p->notify = up->notify;
102 p->ureg = up->ureg;
103 p->dbgreg = 0;
104
105 /* Make a new set of memory segments */
106 n = flag & RFMEM;
107 qlock(&p->seglock);
108 if(waserror()){
109 qunlock(&p->seglock);
110 nexterror();
111 }
112 for(i = 0; i < NSEG; i++)
113 if(up->seg[i])
114 p->seg[i] = dupseg(up->seg, i, n);
115 qunlock(&p->seglock);
116 poperror();
117
118 /* File descriptors */
119 if(flag & (RFFDG|RFCFDG)) {
120 if(flag & RFFDG)
121 p->fgrp = dupfgrp(up->fgrp);
122 else
123 p->fgrp = dupfgrp(nil);
124 }
125 else {
126 p->fgrp = up->fgrp;
127 incref(p->fgrp);
128 }
129
130 /* Process groups */
131 if(flag & (RFNAMEG|RFCNAMEG)) {
132 p->pgrp = newpgrp();
133 if(flag & RFNAMEG)
134 pgrpcpy(p->pgrp, up->pgrp);
135 /* inherit noattach */
136 p->pgrp->noattach = up->pgrp->noattach;
137 }
138 else {
139 p->pgrp = up->pgrp;
140 incref(p->pgrp);
141 }
142 if(flag & RFNOMNT)
143 up->pgrp->noattach = 1;
144
145 if(flag & RFREND)
146 p->rgrp = newrgrp();
147 else {
148 incref(up->rgrp);
149 p->rgrp = up->rgrp;
150 }
151
152 /* Environment group */
153 if(flag & (RFENVG|RFCENVG)) {
154 p->egrp = smalloc(sizeof(Egrp));
155 p->egrp->ref = 1;
156 if(flag & RFENVG)
157 envcpy(p->egrp, up->egrp);
158 }
159 else {
160 p->egrp = up->egrp;
161 incref(p->egrp);
162 }
163 p->hang = up->hang;
164 p->procmode = up->procmode;
165
166 /* Craft a return frame which will cause the child to pop out of
167 * the scheduler in user mode with the return register zero
168 */
169 forkchild(p, up->dbgreg);
170
171 p->parent = up;
172 p->parentpid = up->pid;
173 if(flag&RFNOWAIT)
174 p->parentpid = 0;
175 else {
176 lock(&up->exl);
177 up->nchild++;
178 unlock(&up->exl);
179 }
180 if((flag&RFNOTEG) == 0)
181 p->noteid = up->noteid;
182
183 p->fpstate = up->fpstate;
184 pid = p->pid;
185 memset(p->time, 0, sizeof(p->time));
186 p->time[TReal] = MACHP(0)->ticks;
187
188 kstrdup(&p->text, up->text);
189 kstrdup(&p->user, up->user);
190 /*
191 * since the bss/data segments are now shareable,
192 * any mmu info about this process is now stale
193 * (i.e. has bad properties) and has to be discarded.
194 */
195 flushmmu();
196 p->basepri = up->basepri;
197 p->priority = up->basepri;
198 p->fixedpri = up->fixedpri;
199 p->mp = up->mp;
200 wm = up->wired;
201 if(wm)
202 procwired(p, wm->machno);
203 ready(p);
204 sched();
205 return pid;
206 }
207
208 static ulong
209 l2be(long l)
210 {
211 uchar *cp;
212
213 cp = (uchar*)&l;
214 return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
215 }
216
217 long
218 sysexec(ulong *arg)
219 {
220 Segment *s, *ts;
221 ulong t, d, b;
222 int i;
223 Chan *tc;
224 char **argv, **argp;
225 char *a, *charp, *args, *file;
226 char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
227 ulong ssize, spage, nargs, nbytes, n, bssend;
228 int indir;
229 Exec exec;
230 char line[sizeof(Exec)];
231 Fgrp *f;
232 Image *img;
233 ulong magic, text, entry, data, bss;
234 Tos *tos;
235
236 validaddr(arg[0], 1, 0);
237 file = (char*)arg[0];
238 indir = 0;
239 elem = nil;
240 if(waserror()){
241 free(elem);
242 nexterror();
243 }
244 for(;;){
245 tc = namec(file, Aopen, OEXEC, 0);
246 if(waserror()){
247 cclose(tc);
248 nexterror();
249 }
250 if(!indir)
251 kstrdup(&elem, up->genbuf);
252
253 n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
254 if(n < 2)
255 error(Ebadexec);
256 magic = l2be(exec.magic);
257 text = l2be(exec.text);
258 entry = l2be(exec.entry);
259 if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
260 if(text >= USTKTOP-UTZERO
261 || entry < UTZERO+sizeof(Exec)
262 || entry >= UTZERO+sizeof(Exec)+text)
263 error(Ebadexec);
264 break; /* for binary */
265 }
266
267 /*
268 * Process #! /bin/sh args ...
269 */
270 memmove(line, &exec, sizeof(Exec));
271 if(indir || line[0]!='#' || line[1]!='!')
272 error(Ebadexec);
273 n = shargs(line, n, progarg);
274 if(n == 0)
275 error(Ebadexec);
276 indir = 1;
277 /*
278 * First arg becomes complete file name
279 */
280 progarg[n++] = file;
281 progarg[n] = 0;
282 validaddr(arg[1], BY2WD, 1);
283 arg[1] += BY2WD;
284 file = progarg[0];
285 if(strlen(elem) >= sizeof progelem)
286 error(Ebadexec);
287 strcpy(progelem, elem);
288 progarg[0] = progelem;
289 poperror();
290 cclose(tc);
291 }
292
293 data = l2be(exec.data);
294 bss = l2be(exec.bss);
295 t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
296 d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
297 bssend = t + data + bss;
298 b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
299 if(t >= KZERO || d >= KZERO || b >= KZERO)
300 error(Ebadexec);
301
302 /*
303 * Args: pass 1: count
304 */
305 nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
306 nargs = 0;
307 if(indir){
308 argp = progarg;
309 while(*argp){
310 a = *argp++;
311 nbytes += strlen(a) + 1;
312 nargs++;
313 }
314 }
315 evenaddr(arg[1]);
316 argp = (char**)arg[1];
317 validaddr((ulong)argp, BY2WD, 0);
318 while(*argp){
319 a = *argp++;
320 if(((ulong)argp&(BY2PG-1)) < BY2WD)
321 validaddr((ulong)argp, BY2WD, 0);
322 validaddr((ulong)a, 1, 0);
323 nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
324 nargs++;
325 }
326 ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
327
328 /*
329 * 8-byte align SP for those (e.g. sparc) that need it.
330 * execregs() will subtract another 4 bytes for argc.
331 */
332 if((ssize+4) & 7)
333 ssize += 4;
334 spage = (ssize+(BY2PG-1)) >> PGSHIFT;
335
336 /*
337 * Build the stack segment, putting it in kernel virtual for the moment
338 */
339 if(spage > TSTKSIZ)
340 error(Enovmem);
341
342 qlock(&up->seglock);
343 if(waserror()){
344 qunlock(&up->seglock);
345 nexterror();
346 }
347 up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
348
349 /*
350 * Args: pass 2: assemble; the pages will be faulted in
351 */
352 tos = (Tos*)(TSTKTOP - sizeof(Tos));
353 tos->cyclefreq = m->cyclefreq;
354 cycles((uvlong*)&tos->pcycles);
355 tos->pcycles = -tos->pcycles;
356 tos->kcycles = tos->pcycles;
357 tos->clock = 0;
358 argv = (char**)(TSTKTOP - ssize);
359 charp = (char*)(TSTKTOP - nbytes);
360 args = charp;
361 if(indir)
362 argp = progarg;
363 else
364 argp = (char**)arg[1];
365
366 for(i=0; i<nargs; i++){
367 if(indir && *argp==0) {
368 indir = 0;
369 argp = (char**)arg[1];
370 }
371 *argv++ = charp + (USTKTOP-TSTKTOP);
372 n = strlen(*argp) + 1;
373 memmove(charp, *argp++, n);
374 charp += n;
375 }
376
377 free(up->text);
378 up->text = elem;
379 elem = nil; /* so waserror() won't free elem */
380 USED(elem);
381
382 /* copy args; easiest from new process's stack */
383 n = charp - args;
384 if(n > 128) /* don't waste too much space on huge arg lists */
385 n = 128;
386 a = up->args;
387 up->args = nil;
388 free(a);
389 up->args = smalloc(n);
390 memmove(up->args, args, n);
391 if(n>0 && up->args[n-1]!='\0'){
392 /* make sure last arg is NUL-terminated */
393 /* put NUL at UTF-8 character boundary */
394 for(i=n-1; i>0; --i)
395 if(fullrune(up->args+i, n-i))
396 break;
397 up->args[i] = 0;
398 n = i+1;
399 }
400 up->nargs = n;
401
402 /*
403 * Committed.
404 * Free old memory.
405 * Special segments are maintained across exec
406 */
407 for(i = SSEG; i <= BSEG; i++) {
408 putseg(up->seg[i]);
409 /* prevent a second free if we have an error */
410 up->seg[i] = 0;
411 }
412 for(i = BSEG+1; i < NSEG; i++) {
413 s = up->seg[i];
414 if(s != 0 && (s->type&SG_CEXEC)) {
415 putseg(s);
416 up->seg[i] = 0;
417 }
418 }
419
420 /*
421 * Close on exec
422 */
423 f = up->fgrp;
424 for(i=0; i<=f->maxfd; i++)
425 fdclose(i, CCEXEC);
426
427 /* Text. Shared. Attaches to cache image if possible */
428 /* attachimage returns a locked cache image */
429 img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
430 ts = img->s;
431 up->seg[TSEG] = ts;
432 ts->flushme = 1;
433 ts->fstart = 0;
434 ts->flen = sizeof(Exec)+text;
435 unlock(img);
436
437 /* Data. Shared. */
438 s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
439 up->seg[DSEG] = s;
440
441 /* Attached by hand */
442 incref(img);
443 s->image = img;
444 s->fstart = ts->fstart+ts->flen;
445 s->flen = data;
446
447 /* BSS. Zero fill on demand */
448 up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
449
450 /*
451 * Move the stack
452 */
453 s = up->seg[ESEG];
454 up->seg[ESEG] = 0;
455 up->seg[SSEG] = s;
456 qunlock(&up->seglock);
457 poperror(); /* seglock */
458 poperror(); /* elem */
459 s->base = USTKTOP-USTKSIZE;
460 s->top = USTKTOP;
461 relocateseg(s, USTKTOP-TSTKTOP);
462
463 /*
464 * '/' processes are higher priority (hack to make /ip more responsive).
465 */
466 if(devtab[tc->type]->dc == L'/')
467 up->basepri = PriRoot;
468 up->priority = up->basepri;
469 poperror();
470 cclose(tc);
471
472 /*
473 * At this point, the mmu contains info about the old address
474 * space and needs to be flushed
475 */
476 flushmmu();
477 qlock(&up->debug);
478 up->nnote = 0;
479 up->notify = 0;
480 up->notified = 0;
481 up->privatemem = 0;
482 procsetup(up);
483 qunlock(&up->debug);
484 if(up->hang)
485 up->procctl = Proc_stopme;
486
487 return execregs(entry, ssize, nargs);
488 }
489
/*
 * Split the first line of a "#!" script into words.
 *
 * s points at the n bytes read from the file, beginning with "#!".
 * The line must be terminated by a newline within those n bytes;
 * it is cut there and broken, in place, into words separated by
 * spaces and tabs.  Pointers to the words are stored in ap,
 * followed by a terminating 0 entry.
 *
 * Returns the number of words, or 0 if no newline was found in
 * the data (including the case where nothing follows "#!") or the
 * line holds no words.
 */
int
shargs(char *s, int n, char **ap)
{
	int i;

	s += 2;
	n -= 2;		/* skip #! */

	/* find the newline without looking beyond the n bytes that were read */
	for(i = 0; ; i++){
		if(i >= n)
			return 0;
		if(s[i] == '\n')
			break;
	}
	s[i] = 0;
	*ap = 0;
	i = 0;
	for(;;) {
		while(*s==' ' || *s=='\t')
			s++;
		if(*s == 0)
			break;
		i++;
		*ap++ = s;
		*ap = 0;
		while(*s && *s!=' ' && *s!='\t')
			s++;
		if(*s == 0)
			break;
		else
			*s++ = 0;
	}
	return i;
}
520
/*
 * Always returns 0 and ignores its argument.
 * syssleep passes it to tsleep as the condition function.
 */
int
return0(void*)
{
	return 0;
}
526
527 long
528 syssleep(ulong *arg)
529 {
530
531 int n;
532
533 n = arg[0];
534 if(n <= 0) {
535 if (up->edf && (up->edf->flags & Admitted))
536 edfyield();
537 else
538 yield();
539 return 0;
540 }
541 if(n < TK2MS(1))
542 n = TK2MS(1);
543 tsleep(&up->sleep, return0, 0, n);
544 return 0;
545 }
546
547 long
548 sysalarm(ulong *arg)
549 {
550 return procalarm(arg[0]);
551 }
552
553 long
554 sysexits(ulong *arg)
555 {
556 char *status;
557 char *inval = "invalid exit string";
558 char buf[ERRMAX];
559
560 status = (char*)arg[0];
561 if(status){
562 if(waserror())
563 status = inval;
564 else{
565 validaddr((ulong)status, 1, 0);
566 if(vmemchr(status, 0, ERRMAX) == 0){
567 memmove(buf, status, ERRMAX);
568 buf[ERRMAX-1] = 0;
569 status = buf;
570 }
571 poperror();
572 }
573
574 }
575 pexit(status, 1);
576 return 0; /* not reached */
577 }
578
579 long
580 sys_wait(ulong *arg)
581 {
582 int pid;
583 Waitmsg w;
584 OWaitmsg *ow;
585
586 if(arg[0] == 0)
587 return pwait(nil);
588
589 validaddr(arg[0], sizeof(OWaitmsg), 1);
590 evenaddr(arg[0]);
591 pid = pwait(&w);
592 if(pid >= 0){
593 ow = (OWaitmsg*)arg[0];
594 readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
595 readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
596 readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
597 readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
598 strncpy(ow->msg, w.msg, sizeof(ow->msg));
599 ow->msg[sizeof(ow->msg)-1] = '\0';
600 }
601 return pid;
602 }
603
604 long
605 sysawait(ulong *arg)
606 {
607 int i;
608 int pid;
609 Waitmsg w;
610 ulong n;
611
612 n = arg[1];
613 validaddr(arg[0], n, 1);
614 pid = pwait(&w);
615 if(pid < 0)
616 return -1;
617 i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
618 w.pid,
619 w.time[TUser], w.time[TSys], w.time[TReal],
620 w.msg);
621
622 return i;
623 }
624
625 void
626 werrstr(char *fmt, ...)
627 {
628 va_list va;
629
630 if(up == nil)
631 return;
632
633 va_start(va, fmt);
634 vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
635 va_end(va);
636 }
637
638 static long
639 generrstr(char *buf, uint nbuf)
640 {
641 char tmp[ERRMAX];
642
643 if(nbuf == 0)
644 error(Ebadarg);
645 validaddr((ulong)buf, nbuf, 1);
646 if(nbuf > sizeof tmp)
647 nbuf = sizeof tmp;
648 memmove(tmp, buf, nbuf);
649
650 /* make sure it's NUL-terminated */
651 tmp[nbuf-1] = '\0';
652 memmove(buf, up->syserrstr, nbuf);
653 buf[nbuf-1] = '\0';
654 memmove(up->syserrstr, tmp, nbuf);
655 return 0;
656 }
657
658 long
659 syserrstr(ulong *arg)
660 {
661 return generrstr((char*)arg[0], arg[1]);
662 }
663
664 /* compatibility for old binaries */
665 long
666 sys_errstr(ulong *arg)
667 {
668 return generrstr((char*)arg[0], 64);
669 }
670
671 long
672 sysnotify(ulong *arg)
673 {
674 if(arg[0] != 0)
675 validaddr(arg[0], sizeof(ulong), 0);
676 up->notify = (int(*)(void*, char*))(arg[0]);
677 return 0;
678 }
679
680 long
681 sysnoted(ulong *arg)
682 {
683 if(arg[0]!=NRSTR && !up->notified)
684 error(Egreg);
685 return 0;
686 }
687
688 long
689 syssegbrk(ulong *arg)
690 {
691 int i;
692 ulong addr;
693 Segment *s;
694
695 addr = arg[0];
696 for(i = 0; i < NSEG; i++) {
697 s = up->seg[i];
698 if(s == 0 || addr < s->base || addr >= s->top)
699 continue;
700 switch(s->type&SG_TYPE) {
701 case SG_TEXT:
702 case SG_DATA:
703 case SG_STACK:
704 error(Ebadarg);
705 default:
706 return ibrk(arg[1], i);
707 }
708 }
709
710 error(Ebadarg);
711 return 0; /* not reached */
712 }
713
714 long
715 syssegattach(ulong *arg)
716 {
717 return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
718 }
719
720 long
721 syssegdetach(ulong *arg)
722 {
723 int i;
724 ulong addr;
725 Segment *s;
726
727 qlock(&up->seglock);
728 if(waserror()){
729 qunlock(&up->seglock);
730 nexterror();
731 }
732
733 s = 0;
734 addr = arg[0];
735 for(i = 0; i < NSEG; i++)
736 if(s = up->seg[i]) {
737 qlock(&s->lk);
738 if((addr >= s->base && addr < s->top) ||
739 (s->top == s->base && addr == s->base))
740 goto found;
741 qunlock(&s->lk);
742 }
743
744 error(Ebadarg);
745
746 found:
747 /*
748 * Check we are not detaching the initial stack segment.
749 */
750 if(s == up->seg[SSEG]){
751 qunlock(&s->lk);
752 error(Ebadarg);
753 }
754 up->seg[i] = 0;
755 qunlock(&s->lk);
756 putseg(s);
757 qunlock(&up->seglock);
758 poperror();
759
760 /* Ensure we flush any entries from the lost segment */
761 flushmmu();
762 return 0;
763 }
764
765 long
766 syssegfree(ulong *arg)
767 {
768 Segment *s;
769 ulong from, to;
770
771 from = arg[0];
772 s = seg(up, from, 1);
773 if(s == nil)
774 error(Ebadarg);
775 to = (from + arg[1]) & ~(BY2PG-1);
776 from = PGROUND(from);
777
778 if(to > s->top) {
779 qunlock(&s->lk);
780 error(Ebadarg);
781 }
782
783 mfreeseg(s, from, (to - from) / BY2PG);
784 qunlock(&s->lk);
785 flushmmu();
786
787 return 0;
788 }
789
790 /* For binary compatibility */
791 long
792 sysbrk_(ulong *arg)
793 {
794 return ibrk(arg[0], BSEG);
795 }
796
797 long
798 sysrendezvous(ulong *arg)
799 {
800 uintptr tag, val;
801 Proc *p, **l;
802
803 tag = arg[0];
804 l = &REND(up->rgrp, tag);
805 up->rendval = ~(uintptr)0;
806
807 lock(up->rgrp);
808 for(p = *l; p; p = p->rendhash) {
809 if(p->rendtag == tag) {
810 *l = p->rendhash;
811 val = p->rendval;
812 p->rendval = arg[1];
813
814 while(p->mach != 0)
815 ;
816 ready(p);
817 unlock(up->rgrp);
818 return val;
819 }
820 l = &p->rendhash;
821 }
822
823 /* Going to sleep here */
824 up->rendtag = tag;
825 up->rendval = arg[1];
826 up->rendhash = *l;
827 *l = up;
828 up->state = Rendezvous;
829 unlock(up->rgrp);
830
831 sched();
832
833 return up->rendval;
834 }
835
836 /*
837 * The implementation of semaphores is complicated by needing
838 * to avoid rescheduling in syssemrelease, so that it is safe
839 * to call from real-time processes. This means syssemrelease
840 * cannot acquire any qlocks, only spin locks.
841 *
842 * Semacquire and semrelease must both manipulate the semaphore
843 * wait list. Lock-free linked lists only exist in theory, not
844 * in practice, so the wait list is protected by a spin lock.
845 *
846 * The semaphore value *addr is stored in user memory, so it
847 * cannot be read or written while holding spin locks.
848 *
849 * Thus, we can access the list only when holding the lock, and
850 * we can access the semaphore only when not holding the lock.
851 * This makes things interesting. Note that sleep's condition function
852 * is called while holding two locks - r and up->rlock - so it cannot
853 * access the semaphore value either.
854 *
855 * An acquirer announces its intention to try for the semaphore
856 * by putting a Sema structure onto the wait list and then
857 * setting Sema.waiting. After one last check of semaphore,
858 * the acquirer sleeps until Sema.waiting==0. A releaser of n
859 * must wake up n acquirers who have Sema.waiting set. It does
860 * this by clearing Sema.waiting and then calling wakeup.
861 *
862 * There are three interesting races here.
863
864 * The first is that in this particular sleep/wakeup usage, a single
865 * wakeup can rouse a process from two consecutive sleeps!
866 * The ordering is:
867 *
868 * (a) set Sema.waiting = 1
869 * (a) call sleep
870 * (b) set Sema.waiting = 0
871 * (a) check Sema.waiting inside sleep, return w/o sleeping
872 * (a) try for semaphore, fail
873 * (a) set Sema.waiting = 1
874 * (a) call sleep
875 * (b) call wakeup(a)
876 * (a) wake up again
877 *
878 * This is okay - semacquire will just go around the loop
879 * again. It does mean that at the top of the for(;;) loop in
880 * semacquire, phore.waiting might already be set to 1.
881 *
882 * The second is that a releaser might wake an acquirer who is
883 * interrupted before he can acquire the lock. Since
884 * release(n) issues only n wakeup calls -- only n can be used
885 * anyway -- if the interrupted process is not going to use his
886 * wakeup call he must pass it on to another acquirer.
887 *
888 * The third race is similar to the second but more subtle. An
889 * acquirer sets waiting=1 and then does a final canacquire()
890 * before going to sleep. The opposite order would result in
891 * missing wakeups that happen between canacquire and
892 * waiting=1. (In fact, the whole point of Sema.waiting is to
893 * avoid missing wakeups between canacquire() and sleep().) But
894 * there can be spurious wakeups between a successful
895 * canacquire() and the following semdequeue(). This wakeup is
896 * not useful to the acquirer, since he has already acquired
897 * the semaphore. Like in the previous case, though, the
898 * acquirer must pass the wakeup call along.
899 *
900 * This is all rather subtle. The code below has been verified
901 * with the spin model /sys/src/9/port/semaphore.p. The
902 * original code anticipated the second race but not the first
903 * or third, which were caught only with spin. The first race
904 * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
905 * It was lucky that my abstract model of sleep/wakeup still managed
906 * to preserve that behavior.
907 *
908 * I remain slightly concerned about memory coherence
909 * outside of locks. The spin model does not take
910 * queued processor writes into account so we have to
911 * think hard. The only variables accessed outside locks
912 * are the semaphore value itself and the boolean flag
913 * Sema.waiting. The value is only accessed with cmpswap,
914 * whose job description includes doing the right thing as
915 * far as memory coherence across processors. That leaves
916 * Sema.waiting. To handle it, we call coherence() before each
917 * read and after each write. - rsc
918 */
919
920 /* Add semaphore p with addr a to list in seg. */
921 static void
922 semqueue(Segment *s, long *a, Sema *p)
923 {
924 memset(p, 0, sizeof *p);
925 p->addr = a;
926 lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
927 p->next = &s->sema;
928 p->prev = s->sema.prev;
929 p->next->prev = p;
930 p->prev->next = p;
931 unlock(&s->sema);
932 }
933
934 /* Remove semaphore p from list in seg. */
935 static void
936 semdequeue(Segment *s, Sema *p)
937 {
938 lock(&s->sema);
939 p->next->prev = p->prev;
940 p->prev->next = p->next;
941 unlock(&s->sema);
942 }
943
944 /* Wake up n waiters with addr a on list in seg. */
945 static void
946 semwakeup(Segment *s, long *a, long n)
947 {
948 Sema *p;
949
950 lock(&s->sema);
951 for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
952 if(p->addr == a && p->waiting){
953 p->waiting = 0;
954 coherence();
955 wakeup(p);
956 n--;
957 }
958 }
959 unlock(&s->sema);
960 }
961
962 /* Add delta to semaphore and wake up waiters as appropriate. */
963 static long
964 semrelease(Segment *s, long *addr, long delta)
965 {
966 long value;
967
968 do
969 value = *addr;
970 while(!cmpswap(addr, value, value+delta));
971 semwakeup(s, addr, delta);
972 return value+delta;
973 }
974
/*
 * Try to take one unit from the semaphore at addr with
 * compare-and-swap.  Retries for as long as the value read is
 * positive; returns 1 once a decrement succeeds and 0 as soon as
 * the value is seen to be zero or negative.
 */
static int
canacquire(long *addr)
{
	long seen;

	for(;;){
		seen = *addr;
		if(seen <= 0)
			return 0;
		if(cmpswap(addr, seen, seen-1))
			return 1;
	}
}
986
987 /* Should we wake up? */
988 static int
989 semawoke(void *p)
990 {
991 coherence();
992 return !((Sema*)p)->waiting;
993 }
994
995 /* Acquire semaphore (subtract 1). */
996 static int
997 semacquire(Segment *s, long *addr, int block)
998 {
999 int acquired;
1000 Sema phore;
1001
1002 if(canacquire(addr))
1003 return 1;
1004 if(!block)
1005 return 0;
1006
1007 acquired = 0;
1008 semqueue(s, addr, &phore);
1009 for(;;){
1010 phore.waiting = 1;
1011 coherence();
1012 if(canacquire(addr)){
1013 acquired = 1;
1014 break;
1015 }
1016 if(waserror())
1017 break;
1018 sleep(&phore, semawoke, &phore);
1019 poperror();
1020 }
1021 semdequeue(s, &phore);
1022 coherence(); /* not strictly necessary due to lock in semdequeue */
1023 if(!phore.waiting)
1024 semwakeup(s, addr, 1);
1025 if(!acquired)
1026 nexterror();
1027 return 1;
1028 }
1029
1030 long
1031 syssemacquire(ulong *arg)
1032 {
1033 int block;
1034 long *addr;
1035 Segment *s;
1036
1037 validaddr(arg[0], sizeof(long), 1);
1038 evenaddr(arg[0]);
1039 addr = (long*)arg[0];
1040 block = arg[1];
1041
1042 if((s = seg(up, (ulong)addr, 0)) == nil)
1043 error(Ebadarg);
1044 if(*addr < 0)
1045 error(Ebadarg);
1046 return semacquire(s, addr, block);
1047 }
1048
1049 long
1050 syssemrelease(ulong *arg)
1051 {
1052 long *addr, delta;
1053 Segment *s;
1054
1055 validaddr(arg[0], sizeof(long), 1);
1056 evenaddr(arg[0]);
1057 addr = (long*)arg[0];
1058 delta = arg[1];
1059
1060 if((s = seg(up, (ulong)addr, 0)) == nil)
1061 error(Ebadarg);
1062 if(delta < 0 || *addr < 0)
1063 error(Ebadarg);
1064 return semrelease(s, addr, arg[1]);
1065 }
Cache object: 61130316d2ef380df5f726b7493a194c
|