1 /* $NetBSD: kern_descrip.c,v 1.182.6.6 2009/04/04 23:36:27 snj Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1982, 1986, 1989, 1991, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
66 */
67
68 /*
69 * File descriptor management.
70 */
71
72 #include <sys/cdefs.h>
73 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.182.6.6 2009/04/04 23:36:27 snj Exp $");
74
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
99
/* Pool cache constructors / destructors, defined later in this file. */
static int cwdi_ctor(void *, void *, int);
static void cwdi_dtor(void *, void *);
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static int fdfile_ctor(void *, void *, int);
static void fdfile_dtor(void *, void *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);

kmutex_t filelist_lock; /* lock on filehead */
struct filelist filehead; /* head of list of open files */
u_int nfiles; /* actual number of open files */

/* Pool caches backing the descriptor subsystem; see fd_sys_init(). */
static pool_cache_t cwdi_cache;
static pool_cache_t filedesc_cache;
static pool_cache_t file_cache;
static pool_cache_t fdfile_cache;

/*
 * Character device switch: only open is implemented; the remaining
 * entries are the standard no-op stubs.
 */
const struct cdevsw filedesc_cdevsw = {
	filedescopen, noclose, noread, nowrite, noioctl,
	nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};

/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
127
128 /*
129 * Initialize the descriptor system.
130 */
void
fd_sys_init(void)
{

	mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);

	/*
	 * All caches are aligned on coherency_unit, presumably to
	 * avoid false sharing of cache lines between CPUs.
	 */
	file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
	    0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
	KASSERT(file_cache != NULL);

	/* NOTE: PR_LARGECACHE - fdfile_t sees heavy allocation traffic. */
	fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
	    PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
	    NULL);
	KASSERT(fdfile_cache != NULL);

	cwdi_cache = pool_cache_init(sizeof(struct cwdinfo), coherency_unit,
	    0, 0, "cwdi", NULL, IPL_NONE, cwdi_ctor, cwdi_dtor, NULL);
	KASSERT(cwdi_cache != NULL);

	filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
	    0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
	    NULL);
	KASSERT(filedesc_cache != NULL);
}
155
/*
 * fd_next_zero: find the first clear bit at or above `want' in a
 * descriptor bitmap covering `bits' descriptors.  Returns the bit
 * number (descriptor number), or -1 if none is clear.
 */
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
	int i, off, maxoff;
	uint32_t sub;

	KASSERT(mutex_owned(&fdp->fd_lock));

	if (want > bits)
		return -1;

	/* Word index and bit offset of the starting position. */
	off = want >> NDENTRYSHIFT;
	i = want & NDENTRYMASK;
	if (i) {
		/*
		 * Starting mid-word: treat the bits below `want' as
		 * set so they cannot be returned.
		 */
		sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
		if (sub != ~0)
			goto found;
		off++;
	}

	/* Scan remaining whole words for one that is not full. */
	maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
			goto found;
		off++;
	}

	return (-1);

found:
	/* ffs() is 1-based: convert word/bit position to a bit number. */
	return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
188
/*
 * fd_last_set: return the highest allocated descriptor below `last',
 * or -1 if no descriptor is allocated.
 */
static int
fd_last_set(filedesc_t *fd, int last)
{
	int off, i;
	fdfile_t **ofiles = fd->fd_ofiles;
	uint32_t *bitmap = fd->fd_lomap;

	KASSERT(mutex_owned(&fd->fd_lock));

	off = (last - 1) >> NDENTRYSHIFT;

	/* Find the highest low-level bitmap word with any bit set. */
	while (off >= 0 && !bitmap[off])
		off--;

	if (off < 0)
		return (-1);

	/* Start at the top descriptor covered by that word, capped at last-1. */
	i = ((off + 1) << NDENTRYSHIFT) - 1;
	if (i >= last)
		i = last - 1;

	/* Walk down to the first slot that is actually allocated. */
	/* XXX should use bitmap */
	/* XXXAD does not work for fd_copy() */
	while (i > 0 && (ofiles[i] == NULL || !ofiles[i]->ff_allocated))
		i--;

	return (i);
}
217
/*
 * fd_used: mark descriptor `fd' as allocated in the two-level bitmap
 * and on the fdfile itself.  Caller holds fd_lock.
 */
void
fd_used(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);

	ff->ff_allocated = 1;
	fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
	/* If the low-level word just filled up, flag it in the high map. */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) == 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
	}

	/* Track the highest descriptor in use. */
	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
	}

	/*
	 * fd_nused counts only dynamically allocated slots; the first
	 * NDFDFILE fdfile_t's are embedded in the filedesc_t.
	 */
	if (fd >= NDFDFILE) {
		fdp->fd_nused++;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
250
/*
 * fd_unused: mark descriptor `fd' as free again: clear its bits in
 * the two-level bitmap and update freefile/lastfile hints.
 */
void
fd_unused(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;
	fdfile_t *ff;

	ff = fdp->fd_ofiles[fd];

	/*
	 * Don't assert the lock is held here, as we may be copying
	 * the table during exec() and it is not needed there.
	 * procfs and sysctl are locked out by proc::p_reflock.
	 *
	 * KASSERT(mutex_owned(&fdp->fd_lock));
	 */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	/* Remember the lowest free descriptor for fd_alloc(). */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/*
	 * The low-level word was full, so the high map has its bit
	 * set; clear it before clearing the low-level bit.
	 */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
	ff->ff_allocated = 0;

	/* Recompute fd_lastfile if we just freed the highest slot. */
	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
	}

	/* Mirror of the fd_nused accounting in fd_used(). */
	if (fd >= NDFDFILE) {
		KASSERT(fdp->fd_nused > 0);
		fdp->fd_nused--;
	} else {
		KASSERT(ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	}
}
296
297 /*
298 * Custom version of fd_unused() for fd_copy(), where the descriptor
299 * table is not yet fully initialized.
300 */
static inline void
fd_zap(filedesc_t *fdp, unsigned fd)
{
	u_int off = fd >> NDENTRYSHIFT;

	/* Remember the lowest free descriptor for fd_alloc(). */
	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
	}

	/*
	 * Same bitmap maintenance as fd_unused(), minus the fdfile
	 * and lastfile bookkeeping, which fd_copy() handles itself.
	 */
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1 << (off & NDENTRYMASK))) != 0);
		fdp->fd_himap[off >> NDENTRYSHIFT] &=
		    ~(1 << (off & NDENTRYMASK));
	}
	KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
	fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
}
319
320 bool
321 fd_isused(filedesc_t *fdp, unsigned fd)
322 {
323 u_int off = fd >> NDENTRYSHIFT;
324
325 KASSERT(fd < fdp->fd_nfiles);
326
327 return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
328 }
329
330 /*
331 * Look up the file structure corresponding to a file descriptor
332 * and return the file, holding a reference on the descriptor.
333 */
inline file_t *
fd_getfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;

	/* Lock-free fast path against the current LWP's table. */
	fdp = curlwp->l_fd;

	/*
	 * Look up the fdfile structure representing this descriptor.
	 * Ensure that we see fd_nfiles before fd_ofiles since we
	 * are doing this unlocked.  See fd_tryexpand().
	 */
	if (__predict_false(fd >= fdp->fd_nfiles)) {
		return NULL;
	}
	membar_consumer();
	ff = fdp->fd_ofiles[fd];
	/* The first NDFDFILE slots are embedded in the filedesc_t. */
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
		return NULL;
	}

	/*
	 * Now get a reference to the descriptor.  Issue a memory
	 * barrier to ensure that we acquire the file pointer _after_
	 * adding a reference.  If no memory barrier, we could fetch
	 * a stale pointer.
	 */
	atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_enter();
#endif

	/*
	 * If the file is not open or is being closed then put the
	 * reference back.
	 */
	fp = ff->ff_file;
	if (__predict_true(fp != NULL)) {
		return fp;
	}
	fd_putfile(fd);
	return NULL;
}
380
381 /*
382 * Release a reference to a file descriptor acquired with fd_getfile().
383 */
void
fd_putfile(unsigned fd)
{
	filedesc_t *fdp;
	fdfile_t *ff;
	u_int u, v;

	fdp = curlwp->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd < fdp->fd_nfiles);
	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/*
	 * Ensure that any use of the file is complete and globally
	 * visible before dropping the final reference.  If no membar,
	 * the current CPU could still access memory associated with
	 * the file after it has been freed or recycled by another
	 * CPU.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_exit();
#endif

	/*
	 * Be optimistic and start out with the assumption that no other
	 * threads are trying to close the descriptor.  If the CAS fails,
	 * we lost a race and/or it's being closed.
	 */
	for (u = ff->ff_refcnt & FR_MASK;; u = v) {
		v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
		if (__predict_true(u == v)) {
			/* CAS succeeded: reference dropped. */
			return;
		}
		/* Lost a CAS race; retry unless a close is in progress. */
		if (__predict_false((v & FR_CLOSING) != 0)) {
			break;
		}
	}

	/* Another thread is waiting to close the file: join it. */
	(void)fd_close(fd);
}
428
429 /*
430 * Convenience wrapper around fd_getfile() that returns reference
431 * to a vnode.
432 */
433 int
434 fd_getvnode(unsigned fd, file_t **fpp)
435 {
436 vnode_t *vp;
437 file_t *fp;
438
439 fp = fd_getfile(fd);
440 if (__predict_false(fp == NULL)) {
441 return EBADF;
442 }
443 if (__predict_false(fp->f_type != DTYPE_VNODE)) {
444 fd_putfile(fd);
445 return EINVAL;
446 }
447 vp = fp->f_data;
448 if (__predict_false(vp->v_type == VBAD)) {
449 /* XXX Is this case really necessary? */
450 fd_putfile(fd);
451 return EBADF;
452 }
453 *fpp = fp;
454 return 0;
455 }
456
457 /*
458 * Convenience wrapper around fd_getfile() that returns reference
459 * to a socket.
460 */
461 int
462 fd_getsock(unsigned fd, struct socket **sop)
463 {
464 file_t *fp;
465
466 fp = fd_getfile(fd);
467 if (__predict_false(fp == NULL)) {
468 return EBADF;
469 }
470 if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
471 fd_putfile(fd);
472 return ENOTSOCK;
473 }
474 *sop = fp->f_data;
475 return 0;
476 }
477
478 /*
479 * Look up the file structure corresponding to a file descriptor
480 * and return it with a reference held on the file, not the
481 * descriptor.
482 *
483 * This is heavyweight and only used when accessing descriptors
484 * from a foreign process. The caller must ensure that `p' does
485 * not exit or fork across this call.
486 *
487 * To release the file (not descriptor) reference, use closef().
488 */
489 file_t *
490 fd_getfile2(proc_t *p, unsigned fd)
491 {
492 filedesc_t *fdp;
493 fdfile_t *ff;
494 file_t *fp;
495
496 fdp = p->p_fd;
497 mutex_enter(&fdp->fd_lock);
498 if (fd > fdp->fd_nfiles) {
499 mutex_exit(&fdp->fd_lock);
500 return NULL;
501 }
502 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
503 mutex_exit(&fdp->fd_lock);
504 return NULL;
505 }
506 mutex_enter(&ff->ff_lock);
507 if ((fp = ff->ff_file) == NULL) {
508 mutex_exit(&ff->ff_lock);
509 mutex_exit(&fdp->fd_lock);
510 return NULL;
511 }
512 mutex_enter(&fp->f_lock);
513 fp->f_count++;
514 mutex_exit(&fp->f_lock);
515 mutex_exit(&ff->ff_lock);
516 mutex_exit(&fdp->fd_lock);
517
518 return fp;
519 }
520
521 /*
522 * Internal form of close. Must be called with a reference to the
523 * descriptor, and will drop the reference. When all descriptor
524 * references are dropped, releases the descriptor slot and a single
525 * reference to the file structure.
526 */
int
fd_close(unsigned fd)
{
	struct flock lf;
	filedesc_t *fdp;
	fdfile_t *ff;
	file_t *fp;
	proc_t *p;
	lwp_t *l;

	l = curlwp;
	p = l->l_proc;
	fdp = l->l_fd;
	ff = fdp->fd_ofiles[fd];

	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);

	mutex_enter(&ff->ff_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	if (ff->ff_file == NULL) {
		/*
		 * Another user of the file is already closing, and is
		 * waiting for other users of the file to drain.  Release
		 * our reference, and wake up the closer.
		 */
		atomic_dec_uint(&ff->ff_refcnt);
		cv_broadcast(&ff->ff_closing);
		mutex_exit(&ff->ff_lock);

		/*
		 * An application error, so pretend that the descriptor
		 * was already closed.  We can't safely wait for it to
		 * be closed without potentially deadlocking.
		 */
		return (EBADF);
	}
	KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);

	/*
	 * There may be multiple users of this file within the process.
	 * Notify existing and new users that the file is closing.  This
	 * will prevent them from adding additional uses to this file
	 * while we are closing it.
	 */
	fp = ff->ff_file;
	ff->ff_file = NULL;
	ff->ff_exclose = false;

	/*
	 * We expect the caller to hold a descriptor reference - drop it.
	 * The reference count may increase beyond zero at this point due
	 * to an erroneous descriptor reference by an application, but
	 * fd_getfile() will notice that the file is being closed and drop
	 * the reference again.
	 */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
	membar_producer();
#endif
	if (__predict_false(atomic_dec_uint_nv(&ff->ff_refcnt) != 0)) {
		/*
		 * Wait for other references to drain.  This is typically
		 * an application error - the descriptor is being closed
		 * while still in use.
		 */
		atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);

		/*
		 * Remove any knotes attached to the file.  A knote
		 * attached to the descriptor can hold references on it.
		 */
		mutex_exit(&ff->ff_lock);
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			knote_fdclose(fd);
		}

		/* Try to drain out descriptor references. */
		(*fp->f_ops->fo_drain)(fp);
		mutex_enter(&ff->ff_lock);

		/*
		 * We need to see the count drop to zero at least once,
		 * in order to ensure that all pre-existing references
		 * have been drained.  New references past this point are
		 * of no interest.
		 */
		while ((ff->ff_refcnt & FR_MASK) != 0) {
			cv_wait(&ff->ff_closing, &ff->ff_lock);
		}
		atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
	} else {
		/* If no references, there must be no knotes. */
		KASSERT(SLIST_EMPTY(&ff->ff_knlist));
	}
	mutex_exit(&ff->ff_lock);

	/*
	 * POSIX record locking dictates that any close releases ALL
	 * locks owned by this process.  This is handled by setting
	 * a flag in the unlock to free ONLY locks obeying POSIX
	 * semantics, and not to free BSD-style file locks.
	 * If the descriptor was in a message, POSIX-style locks
	 * aren't passed with the descriptor.
	 */
	if ((p->p_flag & PK_ADVLOCK) != 0 && fp->f_type == DTYPE_VNODE) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
	}

	/* Free descriptor slot. */
	mutex_enter(&fdp->fd_lock);
	fd_unused(fdp, fd);
	mutex_exit(&fdp->fd_lock);

	/* Now drop reference to the file itself. */
	return closef(fp);
}
648
649 /*
650 * Duplicate a file descriptor.
651 */
652 int
653 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
654 {
655 proc_t *p;
656 int error;
657
658 p = curproc;
659
660 while ((error = fd_alloc(p, minfd, newp)) != 0) {
661 if (error != ENOSPC) {
662 return error;
663 }
664 fd_tryexpand(p);
665 }
666
667 curlwp->l_fd->fd_ofiles[*newp]->ff_exclose = exclose;
668 fd_affix(p, fp, *newp);
669 return 0;
670 }
671
672 /*
673 * dup2 operation.
674 */
int
fd_dup2(file_t *fp, unsigned new)
{
	filedesc_t *fdp;
	fdfile_t *ff;

	fdp = curlwp->l_fd;

	/*
	 * Ensure there are enough slots in the descriptor table,
	 * and allocate an fdfile_t up front in case we need it.
	 */
	while (new >= fdp->fd_nfiles) {
		fd_tryexpand(curproc);
	}
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);

	/*
	 * If there is already a file open, close it.  If the file is
	 * half open, wait for it to be constructed before closing it.
	 * XXX Potential for deadlock here?
	 */
	mutex_enter(&fdp->fd_lock);
	while (fd_isused(fdp, new)) {
		mutex_exit(&fdp->fd_lock);
		if (fd_getfile(new) != NULL) {
			(void)fd_close(new);
		} else {
			/* XXX Crummy, but unlikely to happen. */
			kpause("dup2", false, 1, NULL);
		}
		mutex_enter(&fdp->fd_lock);
	}
	/* Install the pre-allocated fdfile_t if the slot is empty. */
	if (fdp->fd_ofiles[new] == NULL) {
		KASSERT(new >= NDFDFILE);
		fdp->fd_ofiles[new] = ff;
		ff = NULL;
	}
	fd_used(fdp, new);
	mutex_exit(&fdp->fd_lock);

	/* Slot is now allocated.  Insert copy of the file. */
	fd_affix(curproc, fp, new);
	/* Return the spare fdfile_t if it turned out to be unneeded. */
	if (ff != NULL) {
		pool_cache_put(fdfile_cache, ff);
	}
	return 0;
}
723
724 /*
725 * Drop reference to a file structure.
726 */
int
closef(file_t *fp)
{
	struct flock lf;
	int error;

	/*
	 * Drop reference.  If referenced elsewhere it's still open
	 * and we have nothing more to do.
	 */
	mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
		mutex_exit(&fp->f_lock);
		return 0;
	}
	KASSERT(fp->f_count == 0);
	mutex_exit(&fp->f_lock);

	/* We held the last reference - release locks, close and free. */
	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
		/* FHASLOCK: release the BSD-style (flock) lock. */
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
	}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
	} else {
		/* No file ops attached; nothing further to call. */
		error = 0;
	}
	ffree(fp);

	return error;
}
763
764 /*
765 * Allocate a file descriptor for the process.
766 */
int
fd_alloc(proc_t *p, int want, int *result)
{
	filedesc_t *fdp;
	int i, lim, last, error;
	u_int off, new;
	fdfile_t *ff;

	KASSERT(p == curproc || p == &proc0);

	/* Pre-allocate an fdfile_t outside fd_lock; PR_WAITOK may sleep. */
	fdp = p->p_fd;
	ff = pool_cache_get(fdfile_cache, PR_WAITOK);
	KASSERT(ff->ff_refcnt == 0);
	KASSERT(ff->ff_file == NULL);

	/*
	 * Search for a free descriptor starting at the higher
	 * of want or fd_freefile.
	 */
	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	/* Search is bounded by RLIMIT_NOFILE, maxfiles and table size. */
	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
	last = min(fdp->fd_nfiles, lim);
	for (;;) {
		if ((i = want) < fdp->fd_freefile)
			i = fdp->fd_freefile;
		/* Two-level search: himap finds a lomap word with space. */
		off = i >> NDENTRYSHIFT;
		new = fd_next_zero(fdp, fdp->fd_himap, off,
		    (last + NDENTRIES - 1) >> NDENTRYSHIFT);
		if (new == -1)
			break;
		i = fd_next_zero(fdp, &fdp->fd_lomap[new],
		    new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
		if (i == -1) {
			/*
			 * Free file descriptor in this block was
			 * below want, try again with higher want.
			 */
			want = (new + 1) << NDENTRYSHIFT;
			continue;
		}
		i += (new << NDENTRYSHIFT);
		if (i >= last) {
			break;
		}
		/* Install the pre-allocated fdfile_t if slot is empty. */
		if (fdp->fd_ofiles[i] == NULL) {
			KASSERT(i >= NDFDFILE);
			fdp->fd_ofiles[i] = ff;
		} else {
			pool_cache_put(fdfile_cache, ff);
		}
		KASSERT(fdp->fd_ofiles[i]->ff_file == NULL);
		fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
		}
		*result = i;
		mutex_exit(&fdp->fd_lock);
		KASSERT(i >= NDFDFILE ||
		    fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
		return 0;
	}

	/* No space in current array.  Let the caller expand and retry. */
	error = (fdp->fd_nfiles >= lim) ? EMFILE : ENOSPC;
	mutex_exit(&fdp->fd_lock);
	pool_cache_put(fdfile_cache, ff);
	return error;
}
836
837 /*
838 * Allocate memory for the open files array.
839 */
static fdfile_t **
fd_ofile_alloc(int n)
{
	uintptr_t *ptr, sz;

	KASSERT(n > NDFILE);

	/*
	 * Reserve two hidden uintptr_t slots in front of the array
	 * proper: slot 0 is used to chain discarded arrays onto
	 * fd_discard (see fd_tryexpand()), slot 1 records the
	 * allocation size so fd_ofile_free() can cross-check it.
	 * Assumes sizeof(fdfile_t *) == sizeof(uintptr_t).
	 */
	sz = (n + 2) * sizeof(uintptr_t);
	ptr = kmem_alloc((size_t)sz, KM_SLEEP);
	ptr[1] = sz;

	return (fdfile_t **)(ptr + 2);
}
853
854 /*
855 * Free an open files array.
856 */
static void
fd_ofile_free(int n, fdfile_t **of)
{
	uintptr_t *ptr, sz;

	KASSERT(n > NDFILE);

	/*
	 * Step back over the two hidden slots added by
	 * fd_ofile_alloc() and verify the recorded size.
	 */
	sz = (n + 2) * sizeof(uintptr_t);
	ptr = (uintptr_t *)of - 2;
	KASSERT(ptr[1] == sz);
	kmem_free(ptr, sz);
}
869
870 /*
871 * Allocate descriptor bitmap.
872 */
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
	uint8_t *ptr;
	size_t szlo, szhi;

	KASSERT(n > NDENTRIES);

	/*
	 * Both levels of the bitmap come from one allocation: the
	 * low map first, the high map immediately after it.
	 * fd_map_free() relies on this layout.
	 */
	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
	szhi = NDHISLOTS(n) * sizeof(uint32_t);
	ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
	*lo = (uint32_t *)ptr;
	*hi = (uint32_t *)(ptr + szlo);
}
887
888 /*
889 * Free descriptor bitmap.
890 */
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
	size_t szlo, szhi;

	KASSERT(n > NDENTRIES);

	szlo = NDLOSLOTS(n) * sizeof(uint32_t);
	szhi = NDHISLOTS(n) * sizeof(uint32_t);
	/* Verify the single-allocation layout set up by fd_map_alloc(). */
	KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
	kmem_free(lo, szlo + szhi);
}
903
904 /*
905 * Expand a process' descriptor table.
906 */
void
fd_tryexpand(proc_t *p)
{
	filedesc_t *fdp;
	int i, numfiles, oldnfiles;
	fdfile_t **newofile;
	uint32_t *newhimap, *newlomap;

	KASSERT(p == curproc || p == &proc0);

	fdp = p->p_fd;
	newhimap = NULL;
	newlomap = NULL;
	oldnfiles = fdp->fd_nfiles;

	/* Grow to NDEXTENT initially, then double each time. */
	if (oldnfiles < NDEXTENT)
		numfiles = NDEXTENT;
	else
		numfiles = 2 * oldnfiles;

	/* Allocate new structures before taking fd_lock; may sleep. */
	newofile = fd_ofile_alloc(numfiles);
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		fd_map_alloc(numfiles, &newlomap, &newhimap);
	}

	mutex_enter(&fdp->fd_lock);
	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
	if (fdp->fd_nfiles != oldnfiles) {
		/* fdp changed; caller must retry */
		mutex_exit(&fdp->fd_lock);
		fd_ofile_free(numfiles, newofile);
		if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
			fd_map_free(numfiles, newlomap, newhimap);
		}
		return;
	}

	/* Copy the existing ofile array and zero the new portion. */
	i = sizeof(fdfile_t *) * fdp->fd_nfiles;
	memcpy(newofile, fdp->fd_ofiles, i);
	memset((uint8_t *)newofile + i, 0, numfiles * sizeof(fdfile_t *) - i);

	/*
	 * Link old ofiles array into list to be discarded.  We defer
	 * freeing until process exit if the descriptor table is visible
	 * to other threads.
	 */
	if (oldnfiles > NDFILE) {
		if ((fdp->fd_refcnt | p->p_nlwps) > 1) {
			/* [-2] is the hidden slot from fd_ofile_alloc(). */
			fdp->fd_ofiles[-2] = (void *)fdp->fd_discard;
			fdp->fd_discard = fdp->fd_ofiles - 2;
		} else {
			fd_ofile_free(oldnfiles, fdp->fd_ofiles);
		}
	}

	/* Carry the bitmaps over if they need to grow too. */
	if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
		i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newhimap, fdp->fd_himap, i);
		memset((uint8_t *)newhimap + i, 0,
		    NDHISLOTS(numfiles) * sizeof(uint32_t) - i);

		i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
		memcpy(newlomap, fdp->fd_lomap, i);
		memset((uint8_t *)newlomap + i, 0,
		    NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);

		if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
			fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
		}
		fdp->fd_himap = newhimap;
		fdp->fd_lomap = newlomap;
	}

	/*
	 * All other modifications must become globally visible before
	 * the change to fd_nfiles.  See fd_getfile().
	 */
	fdp->fd_ofiles = newofile;
	membar_producer();
	fdp->fd_nfiles = numfiles;
	mutex_exit(&fdp->fd_lock);

	KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
}
992
993 /*
994 * Create a new open file structure and allocate a file descriptor
995 * for the current process.
996 */
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
	file_t *fp;
	proc_t *p;
	int error;

	p = curproc;

	/* Allocate a descriptor slot, expanding the table as needed. */
	while ((error = fd_alloc(p, 0, resultfd)) != 0) {
		if (error != ENOSPC) {
			return error;
		}
		fd_tryexpand(p);
	}

	fp = pool_cache_get(file_cache, PR_WAITOK);
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_msgcount == 0);
	KASSERT(fp->f_unpcount == 0);
	fp->f_cred = kauth_cred_get();
	kauth_cred_hold(fp->f_cred);

	/* Enforce the global open-file limit. */
	if (__predict_false(atomic_inc_uint_nv(&nfiles) >= maxfiles)) {
		fd_abort(p, fp, *resultfd);
		tablefull("file", "increase kern.maxfiles or MAXFILES");
		return ENFILE;
	}

	/*
	 * Don't allow recycled files to be scanned.
	 */
	if ((fp->f_flag & FSCAN) != 0) {
		mutex_enter(&fp->f_lock);
		atomic_and_uint(&fp->f_flag, ~FSCAN);
		mutex_exit(&fp->f_lock);
	}

	/* Reset per-open state left over from a previous user. */
	fp->f_advice = 0;
	fp->f_msgcount = 0;
	fp->f_offset = 0;
	*resultfp = fp;

	return 0;
}
1042
1043 /*
1044 * Successful creation of a new descriptor: make visible to the process.
1045 */
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
	fdfile_t *ff;
	filedesc_t *fdp;

	KASSERT(p == curproc || p == &proc0);

	/* Add a reference to the file structure. */
	mutex_enter(&fp->f_lock);
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/*
	 * Insert the new file into the descriptor slot.
	 *
	 * The memory barriers provided by lock activity in this routine
	 * ensure that any updates to the file structure become globally
	 * visible before the file becomes visible to other LWPs in the
	 * current process.
	 */
	fdp = p->p_fd;
	ff = fdp->fd_ofiles[fd];

	/* The slot must have been reserved by fd_alloc()/fd_dup2(). */
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE ||
	    fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);

	/* No need to lock in order to make file initially visible. */
	ff->ff_file = fp;
}
1080
1081 /*
1082 * Abort creation of a new descriptor: free descriptor slot and file.
1083 */
1084 void
1085 fd_abort(proc_t *p, file_t *fp, unsigned fd)
1086 {
1087 filedesc_t *fdp;
1088 fdfile_t *ff;
1089
1090 KASSERT(p == curproc || p == &proc0);
1091
1092 fdp = p->p_fd;
1093 ff = fdp->fd_ofiles[fd];
1094
1095 KASSERT(fd >= NDFDFILE ||
1096 fdp->fd_ofiles[fd] == (fdfile_t *)fdp->fd_dfdfile[fd]);
1097
1098 mutex_enter(&fdp->fd_lock);
1099 KASSERT(fd_isused(fdp, fd));
1100 fd_unused(fdp, fd);
1101 mutex_exit(&fdp->fd_lock);
1102
1103 if (fp != NULL) {
1104 ffree(fp);
1105 }
1106 }
1107
1108 /*
1109 * Free a file descriptor.
1110 */
1111 void
1112 ffree(file_t *fp)
1113 {
1114
1115 KASSERT(fp->f_count == 0);
1116
1117 atomic_dec_uint(&nfiles);
1118 kauth_cred_free(fp->f_cred);
1119 pool_cache_put(file_cache, fp);
1120 }
1121
1122 /*
1123 * Create an initial cwdinfo structure, using the same current and root
1124 * directories as curproc.
1125 */
1126 struct cwdinfo *
1127 cwdinit(void)
1128 {
1129 struct cwdinfo *cwdi;
1130 struct cwdinfo *copy;
1131
1132 cwdi = pool_cache_get(cwdi_cache, PR_WAITOK);
1133 copy = curproc->p_cwdi;
1134
1135 rw_enter(©->cwdi_lock, RW_READER);
1136 cwdi->cwdi_cdir = copy->cwdi_cdir;
1137 if (cwdi->cwdi_cdir)
1138 VREF(cwdi->cwdi_cdir);
1139 cwdi->cwdi_rdir = copy->cwdi_rdir;
1140 if (cwdi->cwdi_rdir)
1141 VREF(cwdi->cwdi_rdir);
1142 cwdi->cwdi_edir = copy->cwdi_edir;
1143 if (cwdi->cwdi_edir)
1144 VREF(cwdi->cwdi_edir);
1145 cwdi->cwdi_cmask = copy->cwdi_cmask;
1146 cwdi->cwdi_refcnt = 1;
1147 rw_exit(©->cwdi_lock);
1148
1149 return (cwdi);
1150 }
1151
1152 static int
1153 cwdi_ctor(void *arg, void *obj, int flags)
1154 {
1155 struct cwdinfo *cwdi = obj;
1156
1157 rw_init(&cwdi->cwdi_lock);
1158
1159 return 0;
1160 }
1161
1162 static void
1163 cwdi_dtor(void *arg, void *obj)
1164 {
1165 struct cwdinfo *cwdi = obj;
1166
1167 rw_destroy(&cwdi->cwdi_lock);
1168 }
1169
/*
 * Pool cache constructor for file_t: zero the structure, initialize
 * its lock, and link it onto the global list of files.
 */
static int
file_ctor(void *arg, void *obj, int flags)
{
	file_t *fp = obj;

	memset(fp, 0, sizeof(*fp));
	mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);

	/* Every constructed file_t is visible on filehead until dtor. */
	mutex_enter(&filelist_lock);
	LIST_INSERT_HEAD(&filehead, fp, f_list);
	mutex_exit(&filelist_lock);

	return 0;
}
1184
/*
 * Pool cache destructor for file_t: unlink from the global file list
 * and destroy the lock set up by file_ctor().
 */
static void
file_dtor(void *arg, void *obj)
{
	file_t *fp = obj;

	mutex_enter(&filelist_lock);
	LIST_REMOVE(fp, f_list);
	mutex_exit(&filelist_lock);

	mutex_destroy(&fp->f_lock);
}
1196
/*
 * Pool cache constructor for fdfile_t: zero the structure and set up
 * the per-descriptor lock and the "fdclose" condition variable used
 * to wait for a descriptor being closed.
 */
static int
fdfile_ctor(void *arg, void *obj, int flags)
{
	fdfile_t *ff = obj;

	memset(ff, 0, sizeof(*ff));
	mutex_init(&ff->ff_lock, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&ff->ff_closing, "fdclose");

	return 0;
}
1208
1209 static void
1210 fdfile_dtor(void *arg, void *obj)
1211 {
1212 fdfile_t *ff = obj;
1213
1214 mutex_destroy(&ff->ff_lock);
1215 cv_destroy(&ff->ff_closing);
1216 }
1217
1218 file_t *
1219 fgetdummy(void)
1220 {
1221 file_t *fp;
1222
1223 fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
1224 if (fp != NULL) {
1225 memset(fp, 0, sizeof(*fp));
1226 mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
1227 }
1228 return fp;
1229 }
1230
/*
 * Release a file_t obtained from fgetdummy(): destroy its lock and
 * free the backing memory.
 */
void
fputdummy(file_t *fp)
{

	mutex_destroy(&fp->f_lock);
	kmem_free(fp, sizeof(*fp));
}
1238
1239 /*
1240 * Make p2 share p1's cwdinfo.
1241 */
1242 void
1243 cwdshare(struct proc *p2)
1244 {
1245 struct cwdinfo *cwdi;
1246
1247 cwdi = curproc->p_cwdi;
1248
1249 atomic_inc_uint(&cwdi->cwdi_refcnt);
1250 p2->p_cwdi = cwdi;
1251 }
1252
1253 /*
1254 * Release a cwdinfo structure.
1255 */
1256 void
1257 cwdfree(struct cwdinfo *cwdi)
1258 {
1259
1260 if (atomic_dec_uint_nv(&cwdi->cwdi_refcnt) > 0)
1261 return;
1262
1263 vrele(cwdi->cwdi_cdir);
1264 if (cwdi->cwdi_rdir)
1265 vrele(cwdi->cwdi_rdir);
1266 if (cwdi->cwdi_edir)
1267 vrele(cwdi->cwdi_edir);
1268 pool_cache_put(cwdi_cache, cwdi);
1269 }
1270
1271 /*
1272 * Create an initial filedesc structure.
1273 */
1274 filedesc_t *
1275 fd_init(filedesc_t *fdp)
1276 {
1277 unsigned fd;
1278
1279 if (fdp == NULL) {
1280 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1281 } else {
1282 filedesc_ctor(NULL, fdp, PR_WAITOK);
1283 }
1284
1285 fdp->fd_refcnt = 1;
1286 fdp->fd_ofiles = fdp->fd_dfiles;
1287 fdp->fd_nfiles = NDFILE;
1288 fdp->fd_himap = fdp->fd_dhimap;
1289 fdp->fd_lomap = fdp->fd_dlomap;
1290 KASSERT(fdp->fd_lastfile == -1);
1291 KASSERT(fdp->fd_lastkqfile == -1);
1292 KASSERT(fdp->fd_knhash == NULL);
1293
1294 memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
1295 offsetof(filedesc_t, fd_startzero));
1296 for (fd = 0; fd < NDFDFILE; fd++) {
1297 fdp->fd_ofiles[fd] = (fdfile_t *)fdp->fd_dfdfile[fd];
1298 }
1299
1300 return fdp;
1301 }
1302
1303 /*
1304 * Initialize a file descriptor table.
1305 */
1306 static int
1307 filedesc_ctor(void *arg, void *obj, int flag)
1308 {
1309 filedesc_t *fdp = obj;
1310 int i;
1311
1312 memset(fdp, 0, sizeof(*fdp));
1313 mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
1314 fdp->fd_lastfile = -1;
1315 fdp->fd_lastkqfile = -1;
1316
1317 CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
1318 for (i = 0; i < NDFDFILE; i++) {
1319 fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
1320 }
1321
1322 return 0;
1323 }
1324
1325 static void
1326 filedesc_dtor(void *arg, void *obj)
1327 {
1328 filedesc_t *fdp = obj;
1329 int i;
1330
1331 for (i = 0; i < NDFDFILE; i++) {
1332 fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
1333 }
1334
1335 mutex_destroy(&fdp->fd_lock);
1336 }
1337
1338 /*
1339 * Make p2 share p1's filedesc structure.
1340 */
1341 void
1342 fd_share(struct proc *p2)
1343 {
1344 filedesc_t *fdp;
1345
1346 fdp = curlwp->l_fd;
1347 p2->p_fd = fdp;
1348 atomic_inc_uint(&fdp->fd_refcnt);
1349 }
1350
1351 /*
1352 * Copy a filedesc structure.
1353 */
1354 filedesc_t *
1355 fd_copy(void)
1356 {
1357 filedesc_t *newfdp, *fdp;
1358 fdfile_t *ff, *fflist, **ffp, **nffp, *ff2;
1359 int i, nused, numfiles, lastfile, j, newlast;
1360 file_t *fp;
1361
1362 fdp = curproc->p_fd;
1363 newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1364 newfdp->fd_refcnt = 1;
1365
1366 KASSERT(newfdp->fd_knhash == NULL);
1367 KASSERT(newfdp->fd_knhashmask == 0);
1368 KASSERT(newfdp->fd_discard == NULL);
1369
1370 for (;;) {
1371 numfiles = fdp->fd_nfiles;
1372 lastfile = fdp->fd_lastfile;
1373
1374 /*
1375 * If the number of open files fits in the internal arrays
1376 * of the open file structure, use them, otherwise allocate
1377 * additional memory for the number of descriptors currently
1378 * in use.
1379 */
1380 if (lastfile < NDFILE) {
1381 i = NDFILE;
1382 newfdp->fd_ofiles = newfdp->fd_dfiles;
1383 } else {
1384 /*
1385 * Compute the smallest multiple of NDEXTENT needed
1386 * for the file descriptors currently in use,
1387 * allowing the table to shrink.
1388 */
1389 i = numfiles;
1390 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
1391 i /= 2;
1392 }
1393 newfdp->fd_ofiles = fd_ofile_alloc(i);
1394 KASSERT(i > NDFILE);
1395 }
1396 if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
1397 newfdp->fd_himap = newfdp->fd_dhimap;
1398 newfdp->fd_lomap = newfdp->fd_dlomap;
1399 } else {
1400 fd_map_alloc(i, &newfdp->fd_lomap,
1401 &newfdp->fd_himap);
1402 }
1403
1404 /*
1405 * Allocate and string together fdfile structures.
1406 * We abuse fdfile_t::ff_file here, but it will be
1407 * cleared before this routine returns.
1408 */
1409 nused = fdp->fd_nused;
1410 fflist = NULL;
1411 for (j = nused; j != 0; j--) {
1412 ff = pool_cache_get(fdfile_cache, PR_WAITOK);
1413 ff->ff_file = (void *)fflist;
1414 fflist = ff;
1415 }
1416
1417 mutex_enter(&fdp->fd_lock);
1418 if (numfiles == fdp->fd_nfiles && nused == fdp->fd_nused &&
1419 lastfile == fdp->fd_lastfile) {
1420 break;
1421 }
1422 mutex_exit(&fdp->fd_lock);
1423 if (i > NDFILE) {
1424 fd_ofile_free(i, newfdp->fd_ofiles);
1425 }
1426 if (NDHISLOTS(i) > NDHISLOTS(NDFILE)) {
1427 fd_map_free(i, newfdp->fd_lomap, newfdp->fd_himap);
1428 }
1429 while (fflist != NULL) {
1430 ff = fflist;
1431 fflist = (void *)ff->ff_file;
1432 ff->ff_file = NULL;
1433 pool_cache_put(fdfile_cache, ff);
1434 }
1435 }
1436
1437 newfdp->fd_nfiles = i;
1438 newfdp->fd_freefile = fdp->fd_freefile;
1439 newfdp->fd_exclose = fdp->fd_exclose;
1440
1441 /*
1442 * Clear the entries that will not be copied over.
1443 * Avoid calling memset with 0 size.
1444 */
1445 if (lastfile < (i-1)) {
1446 memset(newfdp->fd_ofiles + lastfile + 1, 0,
1447 (i - lastfile - 1) * sizeof(file_t **));
1448 }
1449 if (i < NDENTRIES * NDENTRIES) {
1450 i = NDENTRIES * NDENTRIES; /* size of inlined bitmaps */
1451 }
1452 memcpy(newfdp->fd_himap, fdp->fd_himap, NDHISLOTS(i)*sizeof(uint32_t));
1453 memcpy(newfdp->fd_lomap, fdp->fd_lomap, NDLOSLOTS(i)*sizeof(uint32_t));
1454
1455 ffp = fdp->fd_ofiles;
1456 nffp = newfdp->fd_ofiles;
1457 j = imax(lastfile, (NDFDFILE - 1));
1458 newlast = -1;
1459 KASSERT(j < fdp->fd_nfiles);
1460 for (i = 0; i <= j; i++, ffp++, *nffp++ = ff2) {
1461 ff = *ffp;
1462 /* Install built-in fdfiles even if unused here. */
1463 if (i < NDFDFILE) {
1464 ff2 = (fdfile_t *)newfdp->fd_dfdfile[i];
1465 } else {
1466 ff2 = NULL;
1467 }
1468 /* Determine if descriptor is active in parent. */
1469 if (ff == NULL || !fd_isused(fdp, i)) {
1470 KASSERT(ff != NULL || i >= NDFDFILE);
1471 continue;
1472 }
1473 mutex_enter(&ff->ff_lock);
1474 fp = ff->ff_file;
1475 if (fp == NULL) {
1476 /* Descriptor is half-open: free slot. */
1477 fd_zap(newfdp, i);
1478 mutex_exit(&ff->ff_lock);
1479 continue;
1480 }
1481 if (fp->f_type == DTYPE_KQUEUE) {
1482 /* kqueue descriptors cannot be copied. */
1483 fd_zap(newfdp, i);
1484 mutex_exit(&ff->ff_lock);
1485 continue;
1486 }
1487 /* It's active: add a reference to the file. */
1488 mutex_enter(&fp->f_lock);
1489 fp->f_count++;
1490 mutex_exit(&fp->f_lock);
1491 /* Consume one fdfile_t to represent it. */
1492 if (i >= NDFDFILE) {
1493 ff2 = fflist;
1494 fflist = (void *)ff2->ff_file;
1495 }
1496 ff2->ff_file = fp;
1497 ff2->ff_exclose = ff->ff_exclose;
1498 ff2->ff_allocated = true;
1499 mutex_exit(&ff->ff_lock);
1500 if (i > newlast) {
1501 newlast = i;
1502 }
1503 }
1504 mutex_exit(&fdp->fd_lock);
1505
1506 /* Discard unused fdfile_t structures. */
1507 while (__predict_false(fflist != NULL)) {
1508 ff = fflist;
1509 fflist = (void *)ff->ff_file;
1510 ff->ff_file = NULL;
1511 pool_cache_put(fdfile_cache, ff);
1512 nused--;
1513 }
1514 KASSERT(nused >= 0);
1515 KASSERT(newfdp->fd_ofiles[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
1516
1517 newfdp->fd_nused = nused;
1518 newfdp->fd_lastfile = newlast;
1519
1520 return (newfdp);
1521 }
1522
1523 /*
1524 * Release a filedesc structure.
1525 */
1526 void
1527 fd_free(void)
1528 {
1529 filedesc_t *fdp;
1530 fdfile_t *ff;
1531 file_t *fp;
1532 int fd, lastfd;
1533 void **discard;
1534
1535 fdp = curlwp->l_fd;
1536
1537 KASSERT(fdp->fd_ofiles[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
1538
1539 if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
1540 return;
1541
1542 /*
1543 * Close any files that the process holds open.
1544 */
1545 for (fd = 0, lastfd = fdp->fd_nfiles - 1; fd <= lastfd; fd++) {
1546 ff = fdp->fd_ofiles[fd];
1547 KASSERT(fd >= NDFDFILE ||
1548 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1549 if ((ff = fdp->fd_ofiles[fd]) == NULL)
1550 continue;
1551 if ((fp = ff->ff_file) != NULL) {
1552 /*
1553 * Must use fd_close() here as kqueue holds
1554 * long term references to descriptors.
1555 */
1556 ff->ff_refcnt++;
1557 fd_close(fd);
1558 }
1559 KASSERT(ff->ff_refcnt == 0);
1560 KASSERT(ff->ff_file == NULL);
1561 KASSERT(!ff->ff_exclose);
1562 KASSERT(!ff->ff_allocated);
1563 if (fd >= NDFDFILE) {
1564 pool_cache_put(fdfile_cache, ff);
1565 }
1566 }
1567
1568 /*
1569 * Clean out the descriptor table for the next user and return
1570 * to the cache.
1571 */
1572 while ((discard = fdp->fd_discard) != NULL) {
1573 fdp->fd_discard = discard[0];
1574 kmem_free(discard, (uintptr_t)discard[1]);
1575 }
1576 if (NDHISLOTS(fdp->fd_nfiles) > NDHISLOTS(NDFILE)) {
1577 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
1578 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
1579 fd_map_free(fdp->fd_nfiles, fdp->fd_lomap, fdp->fd_himap);
1580 }
1581 if (fdp->fd_nfiles > NDFILE) {
1582 KASSERT(fdp->fd_ofiles != fdp->fd_dfiles);
1583 fd_ofile_free(fdp->fd_nfiles, fdp->fd_ofiles);
1584 }
1585 if (fdp->fd_knhash != NULL) {
1586 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
1587 fdp->fd_knhash = NULL;
1588 fdp->fd_knhashmask = 0;
1589 } else {
1590 KASSERT(fdp->fd_knhashmask == 0);
1591 }
1592 fdp->fd_lastkqfile = -1;
1593 pool_cache_put(filedesc_cache, fdp);
1594 }
1595
1596 /*
1597 * File Descriptor pseudo-device driver (/dev/fd/).
1598 *
1599 * Opening minor device N dup()s the file (if any) connected to file
1600 * descriptor N belonging to the calling process. Note that this driver
1601 * consists of only the ``open()'' routine, because all subsequent
1602 * references to this file will be direct to the other driver.
1603 */
1604 static int
1605 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1606 {
1607
1608 /*
1609 * XXX Kludge: set dupfd to contain the value of the
1610 * the file descriptor being sought for duplication. The error
1611 * return ensures that the vnode for this device will be released
1612 * by vn_open. Open will detect this special error and take the
1613 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
1614 * will simply report the error.
1615 */
1616 l->l_dupfd = minor(dev); /* XXX */
1617 return EDUPFD;
1618 }
1619
1620 /*
1621 * Duplicate the specified descriptor to a free descriptor.
1622 */
1623 int
1624 fd_dupopen(int old, int *new, int mode, int error)
1625 {
1626 filedesc_t *fdp;
1627 fdfile_t *ff;
1628 file_t *fp;
1629
1630 if ((fp = fd_getfile(old)) == NULL) {
1631 return EBADF;
1632 }
1633 fdp = curlwp->l_fd;
1634 ff = fdp->fd_ofiles[old];
1635
1636 /*
1637 * There are two cases of interest here.
1638 *
1639 * For EDUPFD simply dup (dfd) to file descriptor
1640 * (indx) and return.
1641 *
1642 * For EMOVEFD steal away the file structure from (dfd) and
1643 * store it in (indx). (dfd) is effectively closed by
1644 * this operation.
1645 *
1646 * Any other error code is just returned.
1647 */
1648 switch (error) {
1649 case EDUPFD:
1650 /*
1651 * Check that the mode the file is being opened for is a
1652 * subset of the mode of the existing descriptor.
1653 */
1654 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
1655 error = EACCES;
1656 break;
1657 }
1658
1659 /* Copy it. */
1660 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1661 break;
1662
1663 case EMOVEFD:
1664 /* Copy it. */
1665 error = fd_dup(fp, 0, new, fdp->fd_ofiles[old]->ff_exclose);
1666 if (error != 0) {
1667 break;
1668 }
1669
1670 /* Steal away the file pointer from 'old'. */
1671 (void)fd_close(old);
1672 return 0;
1673 }
1674
1675 fd_putfile(old);
1676 return error;
1677 }
1678
1679 /*
1680 * Close open files on exec.
1681 */
1682 void
1683 fd_closeexec(void)
1684 {
1685 struct cwdinfo *cwdi;
1686 proc_t *p;
1687 filedesc_t *fdp;
1688 fdfile_t *ff;
1689 lwp_t *l;
1690 int fd;
1691
1692 l = curlwp;
1693 p = l->l_proc;
1694 fdp = p->p_fd;
1695 cwdi = p->p_cwdi;
1696
1697 if (cwdi->cwdi_refcnt > 1) {
1698 cwdi = cwdinit();
1699 cwdfree(p->p_cwdi);
1700 p->p_cwdi = cwdi;
1701 }
1702 if (p->p_cwdi->cwdi_edir) {
1703 vrele(p->p_cwdi->cwdi_edir);
1704 }
1705
1706 if (fdp->fd_refcnt > 1) {
1707 fdp = fd_copy();
1708 fd_free();
1709 p->p_fd = fdp;
1710 l->l_fd = fdp;
1711 }
1712 if (!fdp->fd_exclose) {
1713 return;
1714 }
1715 fdp->fd_exclose = false;
1716
1717 for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
1718 if ((ff = fdp->fd_ofiles[fd]) == NULL) {
1719 KASSERT(fd >= NDFDFILE);
1720 continue;
1721 }
1722 KASSERT(fd >= NDFDFILE ||
1723 ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
1724 if (ff->ff_file == NULL)
1725 continue;
1726 if (ff->ff_exclose) {
1727 /*
1728 * We need a reference to close the file.
1729 * No other threads can see the fdfile_t at
1730 * this point, so don't bother locking.
1731 */
1732 KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
1733 ff->ff_refcnt++;
1734 fd_close(fd);
1735 }
1736 }
1737 }
1738
1739 /*
1740 * It is unsafe for set[ug]id processes to be started with file
1741 * descriptors 0..2 closed, as these descriptors are given implicit
1742 * significance in the Standard C library. fdcheckstd() will create a
1743 * descriptor referencing /dev/null for each of stdin, stdout, and
1744 * stderr that is not already open.
1745 */
1746 #define CHECK_UPTO 3
1747 int
1748 fd_checkstd(void)
1749 {
1750 struct proc *p;
1751 struct nameidata nd;
1752 filedesc_t *fdp;
1753 file_t *fp;
1754 struct proc *pp;
1755 int fd, i, error, flags = FREAD|FWRITE;
1756 char closed[CHECK_UPTO * 3 + 1], which[3 + 1];
1757
1758 p = curproc;
1759 closed[0] = '\0';
1760 if ((fdp = p->p_fd) == NULL)
1761 return (0);
1762 for (i = 0; i < CHECK_UPTO; i++) {
1763 KASSERT(i >= NDFDFILE ||
1764 fdp->fd_ofiles[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
1765 if (fdp->fd_ofiles[i]->ff_file != NULL)
1766 continue;
1767 snprintf(which, sizeof(which), ",%d", i);
1768 strlcat(closed, which, sizeof(closed));
1769 if ((error = fd_allocfile(&fp, &fd)) != 0)
1770 return (error);
1771 KASSERT(fd < CHECK_UPTO);
1772 NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/dev/null");
1773 if ((error = vn_open(&nd, flags, 0)) != 0) {
1774 fd_abort(p, fp, fd);
1775 return (error);
1776 }
1777 fp->f_data = nd.ni_vp;
1778 fp->f_flag = flags;
1779 fp->f_ops = &vnops;
1780 fp->f_type = DTYPE_VNODE;
1781 VOP_UNLOCK(nd.ni_vp, 0);
1782 fd_affix(p, fp, fd);
1783 }
1784 if (closed[0] != '\0') {
1785 mutex_enter(proc_lock);
1786 pp = p->p_pptr;
1787 mutex_enter(pp->p_lock);
1788 log(LOG_WARNING, "set{u,g}id pid %d (%s) "
1789 "was invoked by uid %d ppid %d (%s) "
1790 "with fd %s closed\n",
1791 p->p_pid, p->p_comm, kauth_cred_geteuid(pp->p_cred),
1792 pp->p_pid, pp->p_comm, &closed[1]);
1793 mutex_exit(pp->p_lock);
1794 mutex_exit(proc_lock);
1795 }
1796 return (0);
1797 }
1798 #undef CHECK_UPTO
1799
1800 /*
1801 * Sets descriptor owner. If the owner is a process, 'pgid'
1802 * is set to positive value, process ID. If the owner is process group,
1803 * 'pgid' is set to -pg_id.
1804 */
1805 int
1806 fsetown(pid_t *pgid, u_long cmd, const void *data)
1807 {
1808 int id = *(const int *)data;
1809 int error;
1810
1811 switch (cmd) {
1812 case TIOCSPGRP:
1813 if (id < 0)
1814 return (EINVAL);
1815 id = -id;
1816 break;
1817 default:
1818 break;
1819 }
1820
1821 if (id > 0 && !pfind(id))
1822 return (ESRCH);
1823 else if (id < 0 && (error = pgid_in_session(curproc, -id)))
1824 return (error);
1825
1826 *pgid = id;
1827 return (0);
1828 }
1829
1830 /*
1831 * Return descriptor owner information. If the value is positive,
1832 * it's process ID. If it's negative, it's process group ID and
1833 * needs the sign removed before use.
1834 */
1835 int
1836 fgetown(pid_t pgid, u_long cmd, void *data)
1837 {
1838
1839 switch (cmd) {
1840 case TIOCGPGRP:
1841 *(int *)data = -pgid;
1842 break;
1843 default:
1844 *(int *)data = pgid;
1845 break;
1846 }
1847 return (0);
1848 }
1849
1850 /*
1851 * Send signal to descriptor owner, either process or process group.
1852 */
1853 void
1854 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
1855 {
1856 ksiginfo_t ksi;
1857
1858 KASSERT(!cpu_intr_p());
1859
1860 if (pgid == 0) {
1861 return;
1862 }
1863
1864 KSI_INIT(&ksi);
1865 ksi.ksi_signo = signo;
1866 ksi.ksi_code = code;
1867 ksi.ksi_band = band;
1868
1869 mutex_enter(proc_lock);
1870 if (pgid > 0) {
1871 struct proc *p1;
1872
1873 p1 = p_find(pgid, PFIND_LOCKED);
1874 if (p1 != NULL) {
1875 kpsignal(p1, &ksi, fdescdata);
1876 }
1877 } else {
1878 struct pgrp *pgrp;
1879
1880 KASSERT(pgid < 0);
1881 pgrp = pg_find(-pgid, PFIND_LOCKED);
1882 if (pgrp != NULL) {
1883 kpgsignal(pgrp, &ksi, fdescdata, 0);
1884 }
1885 }
1886 mutex_exit(proc_lock);
1887 }
1888
/*
 * Attach flags, ops and private data to a pre-allocated file, affix
 * it to descriptor 'fd' of curproc, record the descriptor in
 * l_dupfd, and return EMOVEFD so the open path hands the descriptor
 * back to the caller (see fd_dupopen()).
 */
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
	 void *data)
{

	fp->f_flag = flag;
	fp->f_type = DTYPE_MISC;
	fp->f_ops = fops;
	fp->f_data = data;
	curlwp->l_dupfd = fd;
	fd_affix(curproc, fp, fd);

	return EMOVEFD;
}
1903
1904 int
1905 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
1906 {
1907
1908 if (cmd == F_SETFL)
1909 return 0;
1910
1911 return EOPNOTSUPP;
1912 }
1913
/*
 * Null poll method: always returns 0 (no events reported).
 */
int
fnullop_poll(file_t *fp, int which)
{

	return 0;
}
1920
/*
 * Null kqfilter method: always succeeds without attaching a knote.
 */
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{

	return 0;
}
1927
/*
 * Null drain method: nothing to do.
 */
void
fnullop_drain(file_t *fp)
{

}
1933
/*
 * Bad-op read method: always fails with EOPNOTSUPP.
 */
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
	    kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1941
/*
 * Bad-op write method: always fails with EOPNOTSUPP.
 */
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
	     kauth_cred_t cred, int flags)
{

	return EOPNOTSUPP;
}
1949
/*
 * Bad-op ioctl method: always fails with EOPNOTSUPP.
 */
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{

	return EOPNOTSUPP;
}
1956
/*
 * Bad-op stat method: always fails with EOPNOTSUPP.
 */
int
fbadop_stat(file_t *fp, struct stat *sb)
{

	return EOPNOTSUPP;
}
1963
/*
 * Bad-op close method: always fails with EOPNOTSUPP.
 */
int
fbadop_close(file_t *fp)
{

	return EOPNOTSUPP;
}
/* Cache object: c5fa7bda68f354837fbcdfa166158e41 */