1 /* $NetBSD: linux_futex.c,v 1.18.4.2 2009/03/16 01:20:37 snj Exp $ */
2
3 /*-
4 * Copyright (c) 2005 Emmanuel Dreyfus, all rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. All advertising materials mentioning features or use of this software
15 * must display the following acknowledgement:
16 * This product includes software developed by Emmanuel Dreyfus
17 * 4. The name of the author may not be used to endorse or promote
18 * products derived from this software without specific prior written
19 * permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS''
22 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 * POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <sys/cdefs.h>
35 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.18.4.2 2009/03/16 01:20:37 snj Exp $");
36
37 #include <sys/param.h>
38 #include <sys/time.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/lwp.h>
42 #include <sys/queue.h>
43 #include <sys/condvar.h>
44 #include <sys/mutex.h>
45 #include <sys/once.h>
46 #include <sys/kmem.h>
47 #include <sys/kernel.h>
48 #include <sys/atomic.h>
49
50 #include <compat/linux/common/linux_types.h>
51 #include <compat/linux/common/linux_emuldata.h>
52 #include <compat/linux/common/linux_exec.h>
53 #include <compat/linux/common/linux_signal.h>
54 #include <compat/linux/common/linux_futex.h>
55 #include <compat/linux/common/linux_ipc.h>
56 #include <compat/linux/common/linux_sem.h>
57 #include <compat/linux/linux_syscallargs.h>
58
59 struct futex;
60
61 struct waiting_proc {
62 lwp_t *wp_l;
63 struct futex *wp_new_futex;
64 kcondvar_t wp_futex_cv;
65 TAILQ_ENTRY(waiting_proc) wp_list;
66 };
67 struct futex {
68 void *f_uaddr;
69 int f_refcount;
70 LIST_ENTRY(futex) f_list;
71 TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc;
72 };
73
74 static LIST_HEAD(futex_list, futex) futex_list;
75 static kmutex_t futex_lock;
76
77 #define FUTEX_LOCK mutex_enter(&futex_lock);
78 #define FUTEX_UNLOCK mutex_exit(&futex_lock);
79
80 #define FUTEX_LOCKED 1
81 #define FUTEX_UNLOCKED 0
82
83 #define FUTEX_SYSTEM_LOCK KERNEL_LOCK(1, NULL);
84 #define FUTEX_SYSTEM_UNLOCK KERNEL_UNLOCK_ONE(0);
85
86 #ifdef DEBUG_LINUX_FUTEX
87 #define FUTEXPRINTF(a) printf a
88 #else
89 #define FUTEXPRINTF(a)
90 #endif
91
92 static ONCE_DECL(futex_once);
93
94 static int
95 futex_init(void)
96 {
97 printf("futex_init: initializing futex\n");
98 mutex_init(&futex_lock, MUTEX_DEFAULT, IPL_NONE);
99 return 0;
100 }
101
102 static struct futex *futex_get(void *, int);
103 static void futex_put(struct futex *);
104 static int futex_sleep(struct futex *, lwp_t *, unsigned long);
105 static int futex_wake(struct futex *, int, struct futex *, int);
106 static int futex_atomic_op(lwp_t *, int, void *);
107
108 int
109 linux_sys_futex(struct lwp *l, const struct linux_sys_futex_args *uap, register_t *retval)
110 {
111 /* {
112 syscallarg(int *) uaddr;
113 syscallarg(int) op;
114 syscallarg(int) val;
115 syscallarg(const struct timespec *) timeout;
116 syscallarg(int *) uaddr2;
117 syscallarg(int) val3;
118 } */
119 int val;
120 int ret;
121 struct timespec timeout = { 0, 0 };
122 int error = 0;
123 struct futex *f;
124 struct futex *newf;
125 int timeout_hz;
126 struct timeval tv = {0, 0};
127 struct futex *f2;
128 int op_ret;
129
130 RUN_ONCE(&futex_once, futex_init);
131
132 /*
133 * Our implementation provides only private futexes. Most of the apps
134 * should use private futexes but don't claim so. Therefore we treat
135 * all futexes as private by clearing the FUTEX_PRIVATE_FLAG. It works
136 * in most cases (ie. when futexes are not shared on file descriptor
137 * or between different processes).
138 */
139 switch (SCARG(uap, op) & ~LINUX_FUTEX_PRIVATE_FLAG) {
140 case LINUX_FUTEX_WAIT:
141 FUTEX_SYSTEM_LOCK;
142
143 if ((error = copyin(SCARG(uap, uaddr),
144 &val, sizeof(val))) != 0) {
145 FUTEX_SYSTEM_UNLOCK;
146 return error;
147 }
148
149 if (val != SCARG(uap, val)) {
150 FUTEX_SYSTEM_UNLOCK;
151 return EWOULDBLOCK;
152 }
153
154 if (SCARG(uap, timeout) != NULL) {
155 if ((error = copyin(SCARG(uap, timeout),
156 &timeout, sizeof(timeout))) != 0) {
157 FUTEX_SYSTEM_UNLOCK;
158 return error;
159 }
160 }
161
162 FUTEXPRINTF(("FUTEX_WAIT %d.%d: val = %d, uaddr = %p, "
163 "*uaddr = %d, timeout = %lld.%09ld\n",
164 l->l_proc->p_pid, l->l_lid, SCARG(uap, val),
165 SCARG(uap, uaddr), val, (long long)timeout.tv_sec,
166 timeout.tv_nsec));
167
168 tv.tv_usec = timeout.tv_sec * 1000000 + timeout.tv_nsec / 1000;
169 timeout_hz = tvtohz(&tv);
170
171 if (timeout.tv_sec == 0 && timeout.tv_nsec == 0)
172 timeout_hz = 0;
173
174 /*
175 * If the user process requests a non null timeout,
176 * make sure we do not turn it into an infinite
177 * timeout because timeout_hz is 0.
178 *
179 * We use a minimal timeout of 1/hz. Maybe it would make
180 * sense to just return ETIMEDOUT without sleeping.
181 */
182 if (((timeout.tv_sec != 0) || (timeout.tv_nsec != 0)) &&
183 (timeout_hz == 0))
184 timeout_hz = 1;
185
186 f = futex_get(SCARG(uap, uaddr), FUTEX_UNLOCKED);
187 ret = futex_sleep(f, l, timeout_hz);
188 futex_put(f);
189
190 FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, "
191 "ret = %d\n", l->l_proc->p_pid, l->l_lid,
192 SCARG(uap, uaddr), ret));
193
194 FUTEX_SYSTEM_UNLOCK;
195 switch (ret) {
196 case EWOULDBLOCK: /* timeout */
197 return ETIMEDOUT;
198 break;
199 case EINTR: /* signal */
200 return EINTR;
201 break;
202 case 0: /* FUTEX_WAKE received */
203 FUTEXPRINTF(("FUTEX_WAIT %d.%d: uaddr = %p, got it\n",
204 l->l_proc->p_pid, l->l_lid, SCARG(uap, uaddr)));
205 return 0;
206 break;
207 default:
208 FUTEXPRINTF(("FUTEX_WAIT: unexpected ret = %d\n", ret));
209 break;
210 }
211
212 /* NOTREACHED */
213 break;
214
215 case LINUX_FUTEX_WAKE:
216 FUTEX_SYSTEM_LOCK;
217 /*
218 * XXX: Linux is able cope with different addresses
219 * corresponding to the same mapped memory in the sleeping
220 * and the waker process(es).
221 */
222 FUTEXPRINTF(("FUTEX_WAKE %d.%d: uaddr = %p, val = %d\n",
223 l->l_proc->p_pid, l->l_lid,
224 SCARG(uap, uaddr), SCARG(uap, val)));
225
226 f = futex_get(SCARG(uap, uaddr), FUTEX_UNLOCKED);
227 *retval = futex_wake(f, SCARG(uap, val), NULL, 0);
228 futex_put(f);
229
230 FUTEX_SYSTEM_UNLOCK;
231
232 break;
233
234 case LINUX_FUTEX_CMP_REQUEUE:
235 FUTEX_SYSTEM_LOCK;
236
237 if ((error = copyin(SCARG(uap, uaddr),
238 &val, sizeof(val))) != 0) {
239 FUTEX_SYSTEM_UNLOCK;
240 return error;
241 }
242
243 if (val != SCARG(uap, val3)) {
244 FUTEX_SYSTEM_UNLOCK;
245 return EAGAIN;
246 }
247
248 f = futex_get(SCARG(uap, uaddr), FUTEX_UNLOCKED);
249 newf = futex_get(SCARG(uap, uaddr2), FUTEX_UNLOCKED);
250 *retval = futex_wake(f, SCARG(uap, val), newf,
251 (int)(unsigned long)SCARG(uap, timeout));
252 futex_put(f);
253 futex_put(newf);
254
255 FUTEX_SYSTEM_UNLOCK;
256 break;
257
258 case LINUX_FUTEX_REQUEUE:
259 FUTEX_SYSTEM_LOCK;
260
261 f = futex_get(SCARG(uap, uaddr), FUTEX_UNLOCKED);
262 newf = futex_get(SCARG(uap, uaddr2), FUTEX_UNLOCKED);
263 *retval = futex_wake(f, SCARG(uap, val), newf,
264 (int)(unsigned long)SCARG(uap, timeout));
265 futex_put(f);
266 futex_put(newf);
267
268 FUTEX_SYSTEM_UNLOCK;
269 break;
270
271 case LINUX_FUTEX_FD:
272 FUTEXPRINTF(("linux_sys_futex: unimplemented op %d\n",
273 SCARG(uap, op)));
274 return ENOSYS;
275 case LINUX_FUTEX_WAKE_OP:
276 FUTEX_SYSTEM_LOCK;
277 f = futex_get(SCARG(uap, uaddr), FUTEX_UNLOCKED);
278 f2 = futex_get(SCARG(uap, uaddr2), FUTEX_UNLOCKED);
279 /*
280 * This function returns positive number as results and
281 * negative as errors
282 */
283 op_ret = futex_atomic_op(l, SCARG(uap, val3), SCARG(uap, uaddr2));
284 if (op_ret < 0) {
285 /* XXX: We don't handle EFAULT yet */
286 if (op_ret != -EFAULT) {
287 futex_put(f);
288 futex_put(f2);
289 FUTEX_SYSTEM_UNLOCK;
290 return -op_ret;
291 }
292 futex_put(f);
293 futex_put(f2);
294 FUTEX_SYSTEM_UNLOCK;
295 return EFAULT;
296 }
297
298 ret = futex_wake(f, SCARG(uap, val), NULL, 0);
299 futex_put(f);
300 if (op_ret > 0) {
301 op_ret = 0;
302 /*
303 * Linux abuses the address of the timespec parameter
304 * as the number of retries
305 */
306 op_ret += futex_wake(f2,
307 (int)(unsigned long)SCARG(uap, timeout), NULL, 0);
308 ret += op_ret;
309 }
310 futex_put(f2);
311 *retval = ret;
312 FUTEX_SYSTEM_UNLOCK;
313 break;
314 default:
315 FUTEXPRINTF(("linux_sys_futex: unknown op %d\n",
316 SCARG(uap, op)));
317 return ENOSYS;
318 }
319 return 0;
320 }
321
322 static struct futex *
323 futex_get(void *uaddr, int locked)
324 {
325 struct futex *f;
326
327 if (locked == FUTEX_UNLOCKED)
328 FUTEX_LOCK;
329
330 LIST_FOREACH(f, &futex_list, f_list) {
331 if (f->f_uaddr == uaddr) {
332 f->f_refcount++;
333 if (locked == FUTEX_UNLOCKED)
334 FUTEX_UNLOCK;
335 return f;
336 }
337 }
338
339 /* Not found, create it */
340 f = kmem_zalloc(sizeof(*f), KM_SLEEP);
341 f->f_uaddr = uaddr;
342 f->f_refcount = 1;
343 TAILQ_INIT(&f->f_waiting_proc);
344 LIST_INSERT_HEAD(&futex_list, f, f_list);
345 if (locked == FUTEX_UNLOCKED)
346 FUTEX_UNLOCK;
347
348 return f;
349 }
350
351 static void
352 futex_put(struct futex *f)
353 {
354
355 FUTEX_LOCK;
356 f->f_refcount--;
357 if (f->f_refcount == 0) {
358 KASSERT(TAILQ_EMPTY(&f->f_waiting_proc));
359 LIST_REMOVE(f, f_list);
360 kmem_free(f, sizeof(*f));
361 }
362 FUTEX_UNLOCK;
363
364 return;
365 }
366
367 static int
368 futex_sleep(struct futex *f, lwp_t *l, unsigned long timeout)
369 {
370 struct waiting_proc *wp;
371 int ret;
372
373 wp = kmem_zalloc(sizeof(*wp), KM_SLEEP);
374 wp->wp_l = l;
375 wp->wp_new_futex = NULL;
376 cv_init(&wp->wp_futex_cv, "futex");
377
378 FUTEX_LOCK;
379 TAILQ_INSERT_TAIL(&f->f_waiting_proc, wp, wp_list);
380 ret = cv_timedwait_sig(&wp->wp_futex_cv, &futex_lock, timeout);
381 TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list);
382 FUTEX_UNLOCK;
383
384 /* if we got woken up in futex_wake */
385 if ((ret == 0) && (wp->wp_new_futex != NULL)) {
386 /* suspend us on the new futex */
387 ret = futex_sleep(wp->wp_new_futex, l, timeout);
388 /* and release the old one */
389 futex_put(wp->wp_new_futex);
390 }
391
392 cv_destroy(&wp->wp_futex_cv);
393 kmem_free(wp, sizeof(*wp));
394 return ret;
395 }
396
397 static int
398 futex_wake(struct futex *f, int n, struct futex *newf, int n2)
399 {
400 struct waiting_proc *wp;
401 int count;
402
403 count = newf ? 0 : 1;
404
405 FUTEX_LOCK;
406 TAILQ_FOREACH(wp, &f->f_waiting_proc, wp_list) {
407 if (count <= n) {
408 cv_signal(&wp->wp_futex_cv);
409 count++;
410 } else {
411 if (newf == NULL)
412 continue;
413 /* futex_put called after tsleep */
414 wp->wp_new_futex = futex_get(newf->f_uaddr,
415 FUTEX_LOCKED);
416 cv_signal(&wp->wp_futex_cv);
417 if (count - n >= n2)
418 break;
419 }
420 }
421 FUTEX_UNLOCK;
422
423 return count;
424 }
425
426 static int
427 futex_atomic_op(lwp_t *l, int encoded_op, void *uaddr)
428 {
429 const int op = (encoded_op >> 28) & 7;
430 const int cmp = (encoded_op >> 24) & 15;
431 const int cmparg = (encoded_op << 20) >> 20;
432 int oparg = (encoded_op << 8) >> 20;
433 int error, oldval, cval;
434
435 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
436 oparg = 1 << oparg;
437
438 /* XXX: linux verifies access here and returns EFAULT */
439
440 if (copyin(uaddr, &cval, sizeof(int)) != 0)
441 return -EFAULT;
442
443 for (;;) {
444 int nval;
445
446 switch (op) {
447 case FUTEX_OP_SET:
448 nval = oparg;
449 break;
450 case FUTEX_OP_ADD:
451 nval = cval + oparg;
452 break;
453 case FUTEX_OP_OR:
454 nval = cval | oparg;
455 break;
456 case FUTEX_OP_ANDN:
457 nval = cval & ~oparg;
458 break;
459 case FUTEX_OP_XOR:
460 nval = cval ^ oparg;
461 break;
462 default:
463 return -ENOSYS;
464 }
465
466 error = ucas_int(uaddr, cval, nval, &oldval);
467 if (oldval == cval || error) {
468 break;
469 }
470 cval = oldval;
471 }
472
473 if (error)
474 return -EFAULT;
475
476 switch (cmp) {
477 case FUTEX_OP_CMP_EQ:
478 return (oldval == cmparg);
479 case FUTEX_OP_CMP_NE:
480 return (oldval != cmparg);
481 case FUTEX_OP_CMP_LT:
482 return (oldval < cmparg);
483 case FUTEX_OP_CMP_GE:
484 return (oldval >= cmparg);
485 case FUTEX_OP_CMP_LE:
486 return (oldval <= cmparg);
487 case FUTEX_OP_CMP_GT:
488 return (oldval > cmparg);
489 default:
490 return -ENOSYS;
491 }
492 }
493
494 int
495 linux_sys_set_robust_list(struct lwp *l,
496 const struct linux_sys_set_robust_list_args *uap, register_t *retval)
497 {
498 struct proc *p = l->l_proc;
499 struct linux_emuldata *led = p->p_emuldata;
500
501 if (SCARG(uap, len) != sizeof(*(led->robust_futexes)))
502 return EINVAL;
503 led->robust_futexes = SCARG(uap, head);
504 *retval = 0;
505 return 0;
506 }
507
508 int
509 linux_sys_get_robust_list(struct lwp *l,
510 const struct linux_sys_get_robust_list_args *uap, register_t *retval)
511 {
512 struct linux_emuldata *led;
513 struct linux_robust_list_head **head;
514 size_t len = sizeof(*led->robust_futexes);
515 int error = 0;
516
517 if (!SCARG(uap, pid)) {
518 led = l->l_proc->p_emuldata;
519 head = &led->robust_futexes;
520 } else {
521 struct proc *p;
522
523 mutex_enter(proc_lock);
524 if ((p = p_find(SCARG(uap, pid), PFIND_LOCKED)) == NULL ||
525 p->p_emul != &emul_linux) {
526 mutex_exit(proc_lock);
527 return ESRCH;
528 }
529 led = p->p_emuldata;
530 head = &led->robust_futexes;
531 mutex_exit(proc_lock);
532 }
533
534 error = copyout(&len, SCARG(uap, len), sizeof(len));
535 if (error)
536 return error;
537 return copyout(head, SCARG(uap, head), sizeof(*head));
538 }
539
540 static int
541 handle_futex_death(void *uaddr, pid_t pid, int pi)
542 {
543 int uval, nval, mval;
544 struct futex *f;
545
546 retry:
547 if (copyin(uaddr, &uval, 4))
548 return EFAULT;
549
550 if ((uval & FUTEX_TID_MASK) == pid) {
551 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
552 nval = atomic_cas_32(uaddr, uval, mval);
553
554 if (nval == -1)
555 return EFAULT;
556
557 if (nval != uval)
558 goto retry;
559
560 if (!pi && (uval & FUTEX_WAITERS)) {
561 f = futex_get(uaddr, FUTEX_UNLOCKED);
562 futex_wake(f, 1, NULL, 0);
563 }
564 }
565
566 return 0;
567 }
568
569 static int
570 fetch_robust_entry(struct linux_robust_list **entry,
571 struct linux_robust_list **head, int *pi)
572 {
573 unsigned long uentry;
574
575 if (copyin((const void *)head, &uentry, sizeof(unsigned long)))
576 return EFAULT;
577
578 *entry = (void *)(uentry & ~1UL);
579 *pi = uentry & 1;
580
581 return 0;
582 }
583
584 /* This walks the list of robust futexes, releasing them. */
585 void
586 release_futexes(struct proc *p)
587 {
588 struct linux_robust_list_head head;
589 struct linux_robust_list *entry, *next_entry, *pending;
590 unsigned int limit = 2048, pi, next_pi, pip;
591 struct linux_emuldata *led;
592 unsigned long futex_offset;
593 int rc;
594
595 led = p->p_emuldata;
596 if (led->robust_futexes == NULL)
597 return;
598
599 if (copyin(led->robust_futexes, &head, sizeof(head)))
600 return;
601
602 if (fetch_robust_entry(&entry, &head.list.next, &pi))
603 return;
604
605 if (copyin(&head.futex_offset, &futex_offset, sizeof(unsigned long)))
606 return;
607
608 if (fetch_robust_entry(&pending, &head.pending_list, &pip))
609 return;
610
611 while (entry != &head.list) {
612 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
613
614 if (entry != pending)
615 if (handle_futex_death((char *)entry + futex_offset,
616 p->p_pid, pi))
617 return;
618
619 if (rc)
620 return;
621
622 entry = next_entry;
623 pi = next_pi;
624
625 if (!--limit)
626 break;
627
628 yield(); /* XXX why? */
629 }
630
631 if (pending)
632 handle_futex_death((char *)pending + futex_offset,
633 p->p_pid, pip);
634 }
Cache object: ebffcb893b5f9d3ab09eded1f8b3fc3e
|