[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_descrip.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  TRUSTEDBSD-SEDARWIN  -  TRUSTEDBSD-SEDARWIN7 
Ident_Mode: -  plain  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  3  *      The Regents of the University of California.  All rights reserved.
  4  * (c) UNIX System Laboratories, Inc.
  5  * All or some portions of this file are derived from material licensed
  6  * to the University of California by American Telephone and Telegraph
  7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  8  * the permission of UNIX System Laboratories, Inc.
  9  *
 10  * Redistribution and use in source and binary forms, with or without
 11  * modification, are permitted provided that the following conditions
 12  * are met:
 13  * 1. Redistributions of source code must retain the above copyright
 14  *    notice, this list of conditions and the following disclaimer.
 15  * 2. Redistributions in binary form must reproduce the above copyright
 16  *    notice, this list of conditions and the following disclaimer in the
 17  *    documentation and/or other materials provided with the distribution.
 18  * 4. Neither the name of the University nor the names of its contributors
 19  *    may be used to endorse or promote products derived from this software
 20  *    without specific prior written permission.
 21  *
 22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 32  * SUCH DAMAGE.
 33  *
 34  *      @(#)kern_descrip.c      8.6 (Berkeley) 4/19/94
 35  */
 36 
 37 #include <sys/cdefs.h>
 38 __FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.337 2008/08/20 08:31:58 ed Exp $");
 39 
 40 #include "opt_compat.h"
 41 #include "opt_ddb.h"
 42 #include "opt_ktrace.h"
 43 
 44 #include <sys/param.h>
 45 #include <sys/systm.h>
 46 
 47 #include <sys/conf.h>
 48 #include <sys/domain.h>
 49 #include <sys/fcntl.h>
 50 #include <sys/file.h>
 51 #include <sys/filedesc.h>
 52 #include <sys/filio.h>
 53 #include <sys/jail.h>
 54 #include <sys/kernel.h>
 55 #include <sys/limits.h>
 56 #include <sys/lock.h>
 57 #include <sys/malloc.h>
 58 #include <sys/mount.h>
 59 #include <sys/mqueue.h>
 60 #include <sys/mutex.h>
 61 #include <sys/namei.h>
 62 #include <sys/priv.h>
 63 #include <sys/proc.h>
 64 #include <sys/protosw.h>
 65 #include <sys/resourcevar.h>
 66 #include <sys/signalvar.h>
 67 #include <sys/socketvar.h>
 68 #include <sys/stat.h>
 69 #include <sys/sx.h>
 70 #include <sys/syscallsubr.h>
 71 #include <sys/sysctl.h>
 72 #include <sys/sysproto.h>
 73 #include <sys/tty.h>
 74 #include <sys/unistd.h>
 75 #include <sys/user.h>
 76 #include <sys/vnode.h>
 77 #ifdef KTRACE
 78 #include <sys/ktrace.h>
 79 #endif
 80 
 81 #include <security/audit/audit.h>
 82 
 83 #include <vm/uma.h>
 84 
 85 #include <ddb/ddb.h>
 86 
/*
 * malloc(9) types for the memory allocated by this file.
 */
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
                     "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");

/* NOTE(review): presumably the UMA zone backing struct file -- confirm. */
static uma_zone_t file_zone;


/* Flags for do_dup() */
#define DUP_FIXED       0x1     /* Force fixed allocation */
#define DUP_FCNTL       0x2     /* fcntl()-style errors */

/* Forward declarations of file-local helpers. */
static int do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval);
static int      fd_first_free(struct filedesc *, int, int);
static int      fd_last_used(struct filedesc *, int, int);
static void     fdgrowtable(struct filedesc *, int);
static void     fdunused(struct filedesc *fdp, int fd);
static void     fdused(struct filedesc *fdp, int fd);
106 
/*
 * A process is initially started out with NDFILE descriptors stored within
 * this structure, selected to be enough for typical applications based on
 * the historical limit of 20 open files (and the usage of descriptors by
 * shells).  If these descriptors are exhausted, a larger descriptor table
 * may be allocated, up to a process' resource limit; the internal arrays
 * are then unused.
 *
 * The ND* macros below describe the descriptor bitmap, an array of
 * NDSLOTTYPE words holding one bit per descriptor:
 *
 *   NDSLOTSIZE   bytes in one bitmap word
 *   NDENTRIES    bits (descriptors) tracked by one bitmap word
 *   NDSLOT(x)    index of the word that holds the bit for descriptor x
 *   NDBIT(x)     mask selecting descriptor x's bit inside its word
 *   NDSLOTS(x)   number of words needed to track x descriptors (rounds up)
 */
#define NDFILE          20
#define NDSLOTSIZE      sizeof(NDSLOTTYPE)
#define NDENTRIES       (NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x)       ((x) / NDENTRIES)
#define NDBIT(x)        ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define NDSLOTS(x)      (((x) + NDENTRIES - 1) / NDENTRIES)

/*
 * Storage required per open file descriptor: one struct file pointer plus
 * one byte of per-descriptor flags.
 */
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
126 
/*
 * Basic allocation of descriptors:
 * one of the above, plus arrays for NDFILE descriptors.
 */
struct filedesc0 {
        struct  filedesc fd_fd;         /* the generic descriptor table */
        /*
         * These arrays are used when the number of open files is
         * <= NDFILE, and are then pointed to by the pointers above.
         */
        struct  file *fd_dfiles[NDFILE];        /* file pointer per slot */
        char    fd_dfileflags[NDFILE];          /* flag byte per slot */
        NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];    /* bitmap words for NDFILE fds */
};
141 
/*
 * Descriptor management: global state shared by the routines in this file.
 */
volatile int openfiles;                 /* actual number of open files */
struct mtx sigio_lock;          /* mtx to protect pointers to sigio */
/*
 * Hook invoked by do_dup() when the descriptor being replaced refers to a
 * DTYPE_MQUEUE file.
 */
void    (*mq_fdclose)(struct thread *td, int fd, struct file *fp);

/* A mutex to protect the association between a proc and filedesc. */
static struct mtx       fdesc_mtx;
151 
152 /*
153  * Find the first zero bit in the given bitmap, starting at low and not
154  * exceeding size - 1.
155  */
156 static int
157 fd_first_free(struct filedesc *fdp, int low, int size)
158 {
159         NDSLOTTYPE *map = fdp->fd_map;
160         NDSLOTTYPE mask;
161         int off, maxoff;
162 
163         if (low >= size)
164                 return (low);
165 
166         off = NDSLOT(low);
167         if (low % NDENTRIES) {
168                 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
169                 if ((mask &= ~map[off]) != 0UL)
170                         return (off * NDENTRIES + ffsl(mask) - 1);
171                 ++off;
172         }
173         for (maxoff = NDSLOTS(size); off < maxoff; ++off)
174                 if (map[off] != ~0UL)
175                         return (off * NDENTRIES + ffsl(~map[off]) - 1);
176         return (size);
177 }
178 
179 /*
180  * Find the highest non-zero bit in the given bitmap, starting at low and
181  * not exceeding size - 1.
182  */
183 static int
184 fd_last_used(struct filedesc *fdp, int low, int size)
185 {
186         NDSLOTTYPE *map = fdp->fd_map;
187         NDSLOTTYPE mask;
188         int off, minoff;
189 
190         if (low >= size)
191                 return (-1);
192 
193         off = NDSLOT(size);
194         if (size % NDENTRIES) {
195                 mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
196                 if ((mask &= map[off]) != 0)
197                         return (off * NDENTRIES + flsl(mask) - 1);
198                 --off;
199         }
200         for (minoff = NDSLOT(low); off >= minoff; --off)
201                 if (map[off] != 0)
202                         return (off * NDENTRIES + flsl(map[off]) - 1);
203         return (low - 1);
204 }
205 
206 static int
207 fdisused(struct filedesc *fdp, int fd)
208 {
209         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
210             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
211         return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
212 }
213 
214 /*
215  * Mark a file descriptor as used.
216  */
217 static void
218 fdused(struct filedesc *fdp, int fd)
219 {
220 
221         FILEDESC_XLOCK_ASSERT(fdp);
222         KASSERT(!fdisused(fdp, fd),
223             ("fd already used"));
224 
225         fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
226         if (fd > fdp->fd_lastfile)
227                 fdp->fd_lastfile = fd;
228         if (fd == fdp->fd_freefile)
229                 fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
230 }
231 
232 /*
233  * Mark a file descriptor as unused.
234  */
235 static void
236 fdunused(struct filedesc *fdp, int fd)
237 {
238 
239         FILEDESC_XLOCK_ASSERT(fdp);
240         KASSERT(fdisused(fdp, fd),
241             ("fd is already unused"));
242         KASSERT(fdp->fd_ofiles[fd] == NULL,
243             ("fd is still in use"));
244 
245         fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
246         if (fd < fdp->fd_freefile)
247                 fdp->fd_freefile = fd;
248         if (fd == fdp->fd_lastfile)
249                 fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
250 }
251 
252 /*
253  * System calls on descriptors.
254  */
255 #ifndef _SYS_SYSPROTO_H_
256 struct getdtablesize_args {
257         int     dummy;
258 };
259 #endif
260 /* ARGSUSED */
261 int
262 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
263 {
264         struct proc *p = td->td_proc;
265 
266         PROC_LOCK(p);
267         td->td_retval[0] =
268             min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
269         PROC_UNLOCK(p);
270         return (0);
271 }
272 
273 /*
274  * Duplicate a file descriptor to a particular value.
275  *
276  * Note: keep in mind that a potential race condition exists when closing
277  * descriptors from a shared descriptor table (via rfork).
278  */
279 #ifndef _SYS_SYSPROTO_H_
280 struct dup2_args {
281         u_int   from;
282         u_int   to;
283 };
284 #endif
285 /* ARGSUSED */
286 int
287 dup2(struct thread *td, struct dup2_args *uap)
288 {
289 
290         return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
291                     td->td_retval));
292 }
293 
294 /*
295  * Duplicate a file descriptor.
296  */
297 #ifndef _SYS_SYSPROTO_H_
298 struct dup_args {
299         u_int   fd;
300 };
301 #endif
302 /* ARGSUSED */
303 int
304 dup(struct thread *td, struct dup_args *uap)
305 {
306 
307         return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
308 }
309 
310 /*
311  * The file control system call.
312  */
313 #ifndef _SYS_SYSPROTO_H_
314 struct fcntl_args {
315         int     fd;
316         int     cmd;
317         long    arg;
318 };
319 #endif
320 /* ARGSUSED */
321 int
322 fcntl(struct thread *td, struct fcntl_args *uap)
323 {
324         struct flock fl;
325         struct oflock ofl;
326         intptr_t arg;
327         int error;
328         int cmd;
329 
330         error = 0;
331         cmd = uap->cmd;
332         switch (uap->cmd) {
333         case F_OGETLK:
334         case F_OSETLK:
335         case F_OSETLKW:
336                 /*
337                  * Convert old flock structure to new.
338                  */
339                 error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
340                 fl.l_start = ofl.l_start;
341                 fl.l_len = ofl.l_len;
342                 fl.l_pid = ofl.l_pid;
343                 fl.l_type = ofl.l_type;
344                 fl.l_whence = ofl.l_whence;
345                 fl.l_sysid = 0;
346 
347                 switch (uap->cmd) {
348                 case F_OGETLK:
349                     cmd = F_GETLK;
350                     break;
351                 case F_OSETLK:
352                     cmd = F_SETLK;
353                     break;
354                 case F_OSETLKW:
355                     cmd = F_SETLKW;
356                     break;
357                 }
358                 arg = (intptr_t)&fl;
359                 break;
360         case F_GETLK:
361         case F_SETLK:
362         case F_SETLKW:
363         case F_SETLK_REMOTE:
364                 error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
365                 arg = (intptr_t)&fl;
366                 break;
367         default:
368                 arg = uap->arg;
369                 break;
370         }
371         if (error)
372                 return (error);
373         error = kern_fcntl(td, uap->fd, cmd, arg);
374         if (error)
375                 return (error);
376         if (uap->cmd == F_OGETLK) {
377                 ofl.l_start = fl.l_start;
378                 ofl.l_len = fl.l_len;
379                 ofl.l_pid = fl.l_pid;
380                 ofl.l_type = fl.l_type;
381                 ofl.l_whence = fl.l_whence;
382                 error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
383         } else if (uap->cmd == F_GETLK) {
384                 error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
385         }
386         return (error);
387 }
388 
389 static inline struct file *
390 fdtofp(int fd, struct filedesc *fdp)
391 {
392         struct file *fp;
393 
394         FILEDESC_LOCK_ASSERT(fdp);
395         if ((unsigned)fd >= fdp->fd_nfiles ||
396             (fp = fdp->fd_ofiles[fd]) == NULL)
397                 return (NULL);
398         return (fp);
399 }
400 
/*
 * Kernel-side implementation of the fcntl commands.
 *
 * td  - calling thread; the descriptor table of its process is used.
 * fd  - descriptor the command applies to.
 * cmd - F_* command.  Commands not listed in the switch below (including
 *       the F_O*LK compatibility commands, which fcntl() translates before
 *       calling here) yield EINVAL.
 * arg - command argument.  For F_GETLK, F_SETLK, F_SETLKW and
 *       F_SETLK_REMOTE it is dereferenced directly as a struct flock
 *       pointer, and l_start may be rewritten when l_whence is SEEK_CUR.
 *
 * Returns 0 or an errno value.  F_DUPFD, F_DUP2FD, F_GETFD, F_GETFL and
 * F_GETOWN deliver their result through td->td_retval[0].
 */
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
        struct filedesc *fdp;
        struct flock *flp;
        struct file *fp;
        struct proc *p;
        char *pop;
        struct vnode *vp;
        int error, flg, tmp;
        int vfslocked;

        vfslocked = 0;
        error = 0;
        flg = F_POSIX;          /* default advisory-lock flags */
        p = td->td_proc;
        fdp = p->p_fd;

        switch (cmd) {
        case F_DUPFD:
                /* Duplicate with fcntl()-style error reporting. */
                tmp = arg;
                error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
                break;

        case F_DUP2FD:
                /* Duplicate onto exactly descriptor 'arg'. */
                tmp = arg;
                error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
                break;

        case F_GETFD:
                /* Report the close-on-exec flag of the descriptor. */
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                pop = &fdp->fd_ofileflags[fd];
                td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
                FILEDESC_SUNLOCK(fdp);
                break;

        case F_SETFD:
                /*
                 * Set or clear close-on-exec; the per-descriptor flag byte
                 * is modified, so the table is locked exclusively.
                 */
                FILEDESC_XLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_XUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                pop = &fdp->fd_ofileflags[fd];
                *pop = (*pop &~ UF_EXCLOSE) |
                    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
                FILEDESC_XUNLOCK(fdp);
                break;

        case F_GETFL:
                /* Report the file status flags in open(2) form. */
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                td->td_retval[0] = OFLAGS(fp->f_flag);
                FILEDESC_SUNLOCK(fdp);
                break;

        case F_SETFL:
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                /*
                 * fhold()/fdrop() bracket the use of fp after the table
                 * lock has been released.
                 */
                fhold(fp);
                FILEDESC_SUNLOCK(fdp);
                /*
                 * Replace the FCNTLFLAGS bits of f_flag with the requested
                 * ones (the access mode bits of arg are ignored), retrying
                 * if f_flag changed underneath us.
                 */
                do {
                        tmp = flg = fp->f_flag;
                        tmp &= ~FCNTLFLAGS;
                        tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
                } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
                /* Propagate the non-blocking state to the object. */
                tmp = fp->f_flag & FNONBLOCK;
                error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
                if (error) {
                        fdrop(fp, td);
                        break;
                }
                /* Propagate the async state to the object. */
                tmp = fp->f_flag & FASYNC;
                error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
                if (error == 0) {
                        fdrop(fp, td);
                        break;
                }
                /*
                 * FIOASYNC failed: clear FNONBLOCK again and tell the
                 * object; the FIOASYNC error is what gets returned.
                 */
                atomic_clear_int(&fp->f_flag, FNONBLOCK);
                tmp = 0;
                (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
                fdrop(fp, td);
                break;

        case F_GETOWN:
                /* Query the owner via the object's FIOGETOWN ioctl. */
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                fhold(fp);
                FILEDESC_SUNLOCK(fdp);
                error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
                if (error == 0)
                        td->td_retval[0] = tmp;
                fdrop(fp, td);
                break;

        case F_SETOWN:
                /* Set the owner via the object's FIOSETOWN ioctl. */
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                fhold(fp);
                FILEDESC_SUNLOCK(fdp);
                tmp = arg;
                error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
                fdrop(fp, td);
                break;

        case F_SETLK_REMOTE:
                /*
                 * Requires PRIV_NFS_LOCKD; the lock flags become F_REMOTE
                 * instead of F_POSIX.
                 */
                error = priv_check(td, PRIV_NFS_LOCKD);
                if (error)
                        return (error);
                flg = F_REMOTE;
                goto do_setlk;

        case F_SETLKW:
                /* Same as F_SETLK, with F_WAIT added to the lock flags. */
                flg |= F_WAIT;
                /* FALLTHROUGH F_SETLK */

        case F_SETLK:
        do_setlk:
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                /* Only vnode-backed files are accepted here. */
                if (fp->f_type != DTYPE_VNODE) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                flp = (struct flock *)arg;
                if (flp->l_whence == SEEK_CUR) {
                        /*
                         * Add the current file offset to l_start, refusing
                         * a negative offset or a sum beyond OFF_MAX.
                         */
                        if (fp->f_offset < 0 ||
                            (flp->l_start > 0 &&
                             fp->f_offset > OFF_MAX - flp->l_start)) {
                                FILEDESC_SUNLOCK(fdp);
                                error = EOVERFLOW;
                                break;
                        }
                        flp->l_start += fp->f_offset;
                }

                /*
                 * VOP_ADVLOCK() may block.
                 */
                fhold(fp);
                FILEDESC_SUNLOCK(fdp);
                vp = fp->f_vnode;
                vfslocked = VFS_LOCK_GIANT(vp->v_mount);
                switch (flp->l_type) {
                case F_RDLCK:
                        /* A read lock needs the file open for reading. */
                        if ((fp->f_flag & FREAD) == 0) {
                                error = EBADF;
                                break;
                        }
                        /* Flag the process leader as holding advisory locks. */
                        PROC_LOCK(p->p_leader);
                        p->p_leader->p_flag |= P_ADVLOCK;
                        PROC_UNLOCK(p->p_leader);
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
                            flp, flg);
                        break;
                case F_WRLCK:
                        /* A write lock needs the file open for writing. */
                        if ((fp->f_flag & FWRITE) == 0) {
                                error = EBADF;
                                break;
                        }
                        PROC_LOCK(p->p_leader);
                        p->p_leader->p_flag |= P_ADVLOCK;
                        PROC_UNLOCK(p->p_leader);
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
                            flp, flg);
                        break;
                case F_UNLCK:
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
                            flp, flg);
                        break;
                case F_UNLCKSYS:
                        /*
                         * Temporary api for testing remote lock
                         * infrastructure.
                         */
                        if (flg != F_REMOTE) {
                                error = EINVAL;
                                break;
                        }
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
                            F_UNLCKSYS, flp, flg);
                        break;
                default:
                        error = EINVAL;
                        break;
                }
                VFS_UNLOCK_GIANT(vfslocked);
                vfslocked = 0;
                /*
                 * Check for race with close: if the descriptor no longer
                 * refers to fp, issue an F_UNLCK request with
                 * l_whence = SEEK_SET, l_start = 0 and l_len = 0 on the
                 * vnode.  The error computed above is still returned.
                 */
                FILEDESC_SLOCK(fdp);
                if ((unsigned) fd >= fdp->fd_nfiles ||
                    fp != fdp->fd_ofiles[fd]) {
                        FILEDESC_SUNLOCK(fdp);
                        flp->l_whence = SEEK_SET;
                        flp->l_start = 0;
                        flp->l_len = 0;
                        flp->l_type = F_UNLCK;
                        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
                        (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
                                           F_UNLCK, flp, F_POSIX);
                        VFS_UNLOCK_GIANT(vfslocked);
                        vfslocked = 0;
                } else
                        FILEDESC_SUNLOCK(fdp);
                fdrop(fp, td);
                break;

        case F_GETLK:
                FILEDESC_SLOCK(fdp);
                if ((fp = fdtofp(fd, fdp)) == NULL) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                /* Only vnode-backed files are accepted here. */
                if (fp->f_type != DTYPE_VNODE) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EBADF;
                        break;
                }
                flp = (struct flock *)arg;
                if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
                    flp->l_type != F_UNLCK) {
                        FILEDESC_SUNLOCK(fdp);
                        error = EINVAL;
                        break;
                }
                if (flp->l_whence == SEEK_CUR) {
                        /*
                         * Add the current file offset to l_start, refusing
                         * a sum outside [OFF_MIN, OFF_MAX].
                         */
                        if ((flp->l_start > 0 &&
                            fp->f_offset > OFF_MAX - flp->l_start) ||
                            (flp->l_start < 0 &&
                             fp->f_offset < OFF_MIN - flp->l_start)) {
                                FILEDESC_SUNLOCK(fdp);
                                error = EOVERFLOW;
                                break;
                        }
                        flp->l_start += fp->f_offset;
                }
                /*
                 * VOP_ADVLOCK() may block.
                 */
                fhold(fp);
                FILEDESC_SUNLOCK(fdp);
                vp = fp->f_vnode;
                vfslocked = VFS_LOCK_GIANT(vp->v_mount);
                error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
                    F_POSIX);
                VFS_UNLOCK_GIANT(vfslocked);
                vfslocked = 0;
                fdrop(fp, td);
                break;
        default:
                error = EINVAL;
                break;
        }
        /*
         * NOTE(review): every path that reaches this point has already
         * reset vfslocked to 0 after its own VFS_UNLOCK_GIANT().
         */
        VFS_UNLOCK_GIANT(vfslocked);
        return (error);
}
684 
/*
 * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
 *
 * td     - calling thread; the descriptor table of its process is used.
 * flags  - DUP_FIXED: 'new' is the exact descriptor to duplicate onto, and
 *          any file already open there is closed.  Without it, 'new' is
 *          handed to fdalloc() (NOTE(review): presumably as the lowest
 *          acceptable descriptor -- confirm against fdalloc()).
 *          DUP_FCNTL: an out-of-range 'new' is reported as EINVAL instead
 *          of EBADF / EMFILE.
 * old    - descriptor to duplicate.
 * new    - requested target descriptor (see flags).
 * retval - receives the resulting descriptor number on success.
 *
 * Returns 0 on success, otherwise EBADF, EINVAL, EMFILE or the error from
 * fdalloc().  The duplicate never has UF_EXCLOSE set.
 */
static int
do_dup(struct thread *td, int flags, int old, int new,
    register_t *retval)
{
        struct filedesc *fdp;
        struct proc *p;
        struct file *fp;
        struct file *delfp;
        int error, holdleaders, maxfd;

        p = td->td_proc;
        fdp = p->p_fd;

        /*
         * Verify we have a valid descriptor to dup from and possibly to
         * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
         * return EINVAL when the new descriptor is out of bounds.
         */
        if (old < 0)
                return (EBADF);
        if (new < 0)
                return (flags & DUP_FCNTL ? EINVAL : EBADF);
        /* Upper bound: the RLIMIT_NOFILE soft limit or maxfilesperproc. */
        PROC_LOCK(p);
        maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
        PROC_UNLOCK(p);
        if (new >= maxfd)
                return (flags & DUP_FCNTL ? EINVAL : EMFILE);

        FILEDESC_XLOCK(fdp);
        if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
                FILEDESC_XUNLOCK(fdp);
                return (EBADF);
        }
        /* Duplicating a valid descriptor onto itself is a successful no-op. */
        if (flags & DUP_FIXED && old == new) {
                *retval = new;
                FILEDESC_XUNLOCK(fdp);
                return (0);
        }
        /* Every exit below either installs fp in the table or fdrop()s it. */
        fp = fdp->fd_ofiles[old];
        fhold(fp);

        /*
         * If the caller specified a file descriptor, make sure the file
         * table is large enough to hold it, and grab it.  Otherwise, just
         * allocate a new descriptor the usual way.  Since the filedesc
         * lock may be temporarily dropped in the process, we have to look
         * out for a race.
         */
        if (flags & DUP_FIXED) {
                if (new >= fdp->fd_nfiles)
                        fdgrowtable(fdp, new + 1);
                /* An empty target slot is marked used in the bitmap now. */
                if (fdp->fd_ofiles[new] == NULL)
                        fdused(fdp, new);
        } else {
                if ((error = fdalloc(td, new, &new)) != 0) {
                        FILEDESC_XUNLOCK(fdp);
                        fdrop(fp, td);
                        return (error);
                }
        }

        /*
         * If the old file changed out from under us then treat it as a
         * bad file descriptor.  Userland should do its own locking to
         * avoid this case.
         */
        if (fdp->fd_ofiles[old] != fp) {
                /* we've allocated a descriptor which we won't use */
                if (fdp->fd_ofiles[new] == NULL)
                        fdunused(fdp, new);
                FILEDESC_XUNLOCK(fdp);
                fdrop(fp, td);
                return (EBADF);
        }
        KASSERT(old != new,
            ("new fd is same as old"));

        /*
         * Save info on the descriptor being overwritten.  We cannot close
         * it without introducing an ownership race for the slot, since we
         * need to drop the filedesc lock to call closef().
         *
         * XXX this duplicates parts of close().
         */
        delfp = fdp->fd_ofiles[new];
        holdleaders = 0;
        if (delfp != NULL) {
                if (td->td_proc->p_fdtol != NULL) {
                        /*
                         * Ask fdfree() to sleep to ensure that all relevant
                         * process leaders can be traversed in closef().
                         */
                        fdp->fd_holdleaderscount++;
                        holdleaders = 1;
                }
        }

        /*
         * Duplicate the source descriptor.  The per-descriptor flags are
         * copied with UF_EXCLOSE cleared.
         */
        fdp->fd_ofiles[new] = fp;
        fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
        if (new > fdp->fd_lastfile)
                fdp->fd_lastfile = new;
        *retval = new;

        /*
         * If we dup'd over a valid file, we now own the reference to it
         * and must dispose of it using closef() semantics (as if a
         * close() were performed on it).
         *
         * XXX this duplicates parts of close().
         */
        if (delfp != NULL) {
                knote_fdclose(td, new);
                if (delfp->f_type == DTYPE_MQUEUE)
                        mq_fdclose(td, new, delfp);
                /* closef() is called with the table lock released. */
                FILEDESC_XUNLOCK(fdp);
                (void) closef(delfp, td);
                if (holdleaders) {
                        /*
                         * Drop our hold on the leaders and, if we were the
                         * last holder and a wakeup was requested, wake the
                         * sleeper on fd_holdleaderscount.
                         */
                        FILEDESC_XLOCK(fdp);
                        fdp->fd_holdleaderscount--;
                        if (fdp->fd_holdleaderscount == 0 &&
                            fdp->fd_holdleaderswakeup != 0) {
                                fdp->fd_holdleaderswakeup = 0;
                                wakeup(&fdp->fd_holdleaderscount);
                        }
                        FILEDESC_XUNLOCK(fdp);
                }
        } else {
                FILEDESC_XUNLOCK(fdp);
        }
        return (0);
}
822 
823 /*
824  * If sigio is on the list associated with a process or process group,
825  * disable signalling from the device, remove sigio from the list and
826  * free sigio.
827  */
828 void
829 funsetown(struct sigio **sigiop)
830 {
831         struct sigio *sigio;
832 
833         SIGIO_LOCK();
834         sigio = *sigiop;
835         if (sigio == NULL) {
836                 SIGIO_UNLOCK();
837                 return;
838         }
839         *(sigio->sio_myref) = NULL;
840         if ((sigio)->sio_pgid < 0) {
841                 struct pgrp *pg = (sigio)->sio_pgrp;
842                 PGRP_LOCK(pg);
843                 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
844                              sigio, sio_pgsigio);
845                 PGRP_UNLOCK(pg);
846         } else {
847                 struct proc *p = (sigio)->sio_proc;
848                 PROC_LOCK(p);
849                 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
850                              sigio, sio_pgsigio);
851                 PROC_UNLOCK(p);
852         }
853         SIGIO_UNLOCK();
854         crfree(sigio->sio_ucred);
855         FREE(sigio, M_SIGIO);
856 }
857 
858 /*
859  * Free a list of sigio structures.
860  * We only need to lock the SIGIO_LOCK because we have made ourselves
861  * inaccessible to callers of fsetown and therefore do not need to lock
862  * the proc or pgrp struct for the list manipulation.
863  */
864 void
865 funsetownlst(struct sigiolst *sigiolst)
866 {
867         struct proc *p;
868         struct pgrp *pg;
869         struct sigio *sigio;
870 
871         sigio = SLIST_FIRST(sigiolst);
872         if (sigio == NULL)
873                 return;
874         p = NULL;
875         pg = NULL;
876 
877         /*
878          * Every entry of the list should belong
879          * to a single proc or pgrp.
880          */
881         if (sigio->sio_pgid < 0) {
882                 pg = sigio->sio_pgrp;
883                 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
884         } else /* if (sigio->sio_pgid > 0) */ {
885                 p = sigio->sio_proc;
886                 PROC_LOCK_ASSERT(p, MA_NOTOWNED);
887         }
888 
889         SIGIO_LOCK();
890         while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
891                 *(sigio->sio_myref) = NULL;
892                 if (pg != NULL) {
893                         KASSERT(sigio->sio_pgid < 0,
894                             ("Proc sigio in pgrp sigio list"));
895                         KASSERT(sigio->sio_pgrp == pg,
896                             ("Bogus pgrp in sigio list"));
897                         PGRP_LOCK(pg);
898                         SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
899                             sio_pgsigio);
900                         PGRP_UNLOCK(pg);
901                 } else /* if (p != NULL) */ {
902                         KASSERT(sigio->sio_pgid > 0,
903                             ("Pgrp sigio in proc sigio list"));
904                         KASSERT(sigio->sio_proc == p,
905                             ("Bogus proc in sigio list"));
906                         PROC_LOCK(p);
907                         SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
908                             sio_pgsigio);
909                         PROC_UNLOCK(p);
910                 }
911                 SIGIO_UNLOCK();
912                 crfree(sigio->sio_ucred);
913                 FREE(sigio, M_SIGIO);
914                 SIGIO_LOCK();
915         }
916         SIGIO_UNLOCK();
917 }
918 
919 /*
920  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
921  *
922  * After permission checking, add a sigio structure to the sigio list for
923  * the process or process group.
924  */
925 int
926 fsetown(pid_t pgid, struct sigio **sigiop)
927 {
928         struct proc *proc;
929         struct pgrp *pgrp;
930         struct sigio *sigio;
931         int ret;
932 
933         if (pgid == 0) {
934                 funsetown(sigiop);
935                 return (0);
936         }
937 
938         ret = 0;
939 
940         /* Allocate and fill in the new sigio out of locks. */
941         MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
942         sigio->sio_pgid = pgid;
943         sigio->sio_ucred = crhold(curthread->td_ucred);
944         sigio->sio_myref = sigiop;
945 
946         sx_slock(&proctree_lock);
947         if (pgid > 0) {
948                 proc = pfind(pgid);
949                 if (proc == NULL) {
950                         ret = ESRCH;
951                         goto fail;
952                 }
953 
954                 /*
955                  * Policy - Don't allow a process to FSETOWN a process
956                  * in another session.
957                  *
958                  * Remove this test to allow maximum flexibility or
959                  * restrict FSETOWN to the current process or process
960                  * group for maximum safety.
961                  */
962                 PROC_UNLOCK(proc);
963                 if (proc->p_session != curthread->td_proc->p_session) {
964                         ret = EPERM;
965                         goto fail;
966                 }
967 
968                 pgrp = NULL;
969         } else /* if (pgid < 0) */ {
970                 pgrp = pgfind(-pgid);
971                 if (pgrp ==