FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California. All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/5.2/sys/kern/vfs_vnops.c 120743 2003-10-04 14:35:22Z jeff $");

#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mac.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>

static fo_rdwr_t        vn_read;
static fo_rdwr_t        vn_write;
static fo_ioctl_t       vn_ioctl;
static fo_poll_t        vn_poll;
static fo_kqfilter_t    vn_kqfilter;
static fo_stat_t        vn_statfile;
static fo_close_t       vn_closefile;

struct fileops vnops = {
        .fo_read = vn_read,
        .fo_write = vn_write,
        .fo_ioctl = vn_ioctl,
        .fo_poll = vn_poll,
        .fo_kqfilter = vn_kqfilter,
        .fo_stat = vn_statfile,
        .fo_close = vn_closefile,
        .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};

int
vn_open(ndp, flagp, cmode, fdidx)
        struct nameidata *ndp;
        int *flagp, cmode, fdidx;
{
        struct thread *td = ndp->ni_cnd.cn_thread;

        return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
}

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fdidx)
        struct nameidata *ndp;
        int *flagp, cmode;
        struct ucred *cred;
        int fdidx;
{
        struct vnode *vp;
        struct mount *mp;
        struct thread *td = ndp->ni_cnd.cn_thread;
        struct vattr vat;
        struct vattr *vap = &vat;
        int mode, fmode, error;
#ifdef LOOKUP_SHARED
        int exclusive;          /* The current intended lock state */

        exclusive = 0;
#endif

restart:
        fmode = *flagp;
        if (fmode & O_CREAT) {
                ndp->ni_cnd.cn_nameiop = CREATE;
                ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
                if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
                        ndp->ni_cnd.cn_flags |= FOLLOW;
                bwillwrite();
                if ((error = namei(ndp)) != 0)
                        return (error);
                if (ndp->ni_vp == NULL) {
                        VATTR_NULL(vap);
                        vap->va_type = VREG;
                        vap->va_mode = cmode;
                        if (fmode & O_EXCL)
                                vap->va_vaflags |= VA_EXCLUSIVE;
                        if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
                                NDFREE(ndp, NDF_ONLY_PNBUF);
                                vput(ndp->ni_dvp);
                                if ((error = vn_start_write(NULL, &mp,
                                    V_XSLEEP | PCATCH)) != 0)
                                        return (error);
                                goto restart;
                        }
#ifdef MAC
                        error = mac_check_vnode_create(cred, ndp->ni_dvp,
                            &ndp->ni_cnd, vap);
                        if (error == 0) {
#endif
                                VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
                                error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
                                    &ndp->ni_cnd, vap);
#ifdef MAC
                        }
#endif
                        vput(ndp->ni_dvp);
                        vn_finished_write(mp);
                        if (error) {
                                NDFREE(ndp, NDF_ONLY_PNBUF);
                                return (error);
                        }
                        ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
                        ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
                        fmode &= ~O_TRUNC;
                        vp = ndp->ni_vp;
#ifdef LOOKUP_SHARED
                        exclusive = 1;
#endif
                } else {
                        if (ndp->ni_dvp == ndp->ni_vp)
                                vrele(ndp->ni_dvp);
                        else
                                vput(ndp->ni_dvp);
                        ndp->ni_dvp = NULL;
                        vp = ndp->ni_vp;
                        if (fmode & O_EXCL) {
                                error = EEXIST;
                                goto bad;
                        }
                        fmode &= ~O_CREAT;
                }
        } else {
                ndp->ni_cnd.cn_nameiop = LOOKUP;
#ifdef LOOKUP_SHARED
                ndp->ni_cnd.cn_flags =
                    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
                    LOCKSHARED | LOCKLEAF;
#else
                ndp->ni_cnd.cn_flags =
                    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
#endif
                if ((error = namei(ndp)) != 0)
                        return (error);
                vp = ndp->ni_vp;
        }
        if (vp->v_type == VLNK) {
                error = EMLINK;
                goto bad;
        }
        if (vp->v_type == VSOCK) {
                error = EOPNOTSUPP;
                goto bad;
        }
        mode = 0;
        if (fmode & (FWRITE | O_TRUNC)) {
                if (vp->v_type == VDIR) {
                        error = EISDIR;
                        goto bad;
                }
                mode |= VWRITE;
        }
        if (fmode & FREAD)
                mode |= VREAD;
        if (fmode & O_APPEND)
                mode |= VAPPEND;
#ifdef MAC
        error = mac_check_vnode_open(cred, vp, mode);
        if (error)
                goto bad;
#endif
        if ((fmode & O_CREAT) == 0) {
                if (mode & VWRITE) {
                        error = vn_writechk(vp);
                        if (error)
                                goto bad;
                }
                if (mode) {
                        error = VOP_ACCESS(vp, mode, cred, td);
                        if (error)
                                goto bad;
                }
        }
        if ((error = VOP_GETATTR(vp, vap, cred, td)) == 0) {
                vp->v_cachedfs = vap->va_fsid;
                vp->v_cachedid = vap->va_fileid;
        }
        if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
                goto bad;
        /*
         * Make sure that a VM object is created for VMIO support.
         */
        if (vn_canvmio(vp) == TRUE) {
#ifdef LOOKUP_SHARED
                int flock;

                if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
                        VOP_LOCK(vp, LK_UPGRADE, td);
                /*
                 * In cases where the object is marked as dead object_create
                 * will unlock and relock exclusive. It is safe to call in
                 * here with a shared lock because we only examine fields that
                 * the shared lock guarantees will be stable. In the UPGRADE
                 * case it is not likely that anyone has used this vnode yet
                 * so there will be no contention. The logic after this call
                 * restores the requested locking state.
                 */
#endif
                if ((error = vfs_object_create(vp, td, cred)) != 0) {
                        VOP_UNLOCK(vp, 0, td);
                        VOP_CLOSE(vp, fmode, cred, td);
                        NDFREE(ndp, NDF_ONLY_PNBUF);
                        vrele(vp);
                        *flagp = fmode;
                        return (error);
                }
#ifdef LOOKUP_SHARED
                flock = VOP_ISLOCKED(vp, td);
                if (!exclusive && flock == LK_EXCLUSIVE)
                        VOP_LOCK(vp, LK_DOWNGRADE, td);
#endif
        }

        if (fmode & FWRITE)
                vp->v_writecount++;
        *flagp = fmode;
        ASSERT_VOP_LOCKED(vp, "vn_open_cred");
        return (0);
bad:
        NDFREE(ndp, NDF_ONLY_PNBUF);
        vput(vp);
        *flagp = fmode;
        ndp->ni_vp = NULL;
        return (error);
}
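
/*
 * Illustrative sketch (not part of the original file): a minimal in-kernel
 * open/close cycle built on vn_open(). The function name and the -1 fdidx
 * (meaning "no file descriptor slot") follow what other in-kernel callers
 * of this era appear to do, and are assumptions, not verbatim source.
 */
#if 0
static int
example_kernel_open(struct thread *td, const char *path)
{
        struct nameidata nd;
        int flags, error;

        flags = FREAD;
        NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
        if ((error = vn_open(&nd, &flags, 0, -1)) != 0)
                return (error);
        /* vn_open does not free the pathname buffer on success. */
        NDFREE(&nd, NDF_ONLY_PNBUF);
        /* nd.ni_vp is returned locked; use it, then unlock and close. */
        VOP_UNLOCK(nd.ni_vp, 0, td);
        return (vn_close(nd.ni_vp, FREAD, td->td_ucred, td));
}
#endif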

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
        register struct vnode *vp;
{

        ASSERT_VOP_LOCKED(vp, "vn_writechk");
        /*
         * If there's shared text associated with
         * the vnode, try to free it up once. If
         * we fail, we can't allow writing.
         */
        if (vp->v_vflag & VV_TEXT)
                return (ETXTBSY);

        return (0);
}

/*
 * Vnode close call
 */
int
vn_close(vp, flags, file_cred, td)
        register struct vnode *vp;
        int flags;
        struct ucred *file_cred;
        struct thread *td;
{
        int error;

        if (flags & FWRITE)
                vp->v_writecount--;
        error = VOP_CLOSE(vp, flags, file_cred, td);
        /*
         * XXX - In certain instances VOP_CLOSE has to do the vrele
         * itself. If the vrele has been done, it will return EAGAIN
         * to indicate that the vrele should not be done again. When
         * this happens, we just return success. The correct thing to
         * do would be to have all VOP_CLOSE instances do the vrele.
         */
        if (error == EAGAIN)
                return (0);
        vrele(vp);
        return (error);
}

/*
 * Sequential heuristic - detect sequential operation
 */
static __inline
int
sequential_heuristic(struct uio *uio, struct file *fp)
{

        if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
            uio->uio_offset == fp->f_nextoff) {
                /*
                 * XXX we assume that the filesystem block size is
                 * the default. Not true, but still gives us a pretty
                 * good indicator of how sequential the read operations
                 * are.
                 */
                fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
                if (fp->f_seqcount > IO_SEQMAX)
                        fp->f_seqcount = IO_SEQMAX;
                return (fp->f_seqcount << IO_SEQSHIFT);
        }

        /*
         * Not sequential, quick draw-down of seqcount
         */
        if (fp->f_seqcount > 1)
                fp->f_seqcount = 1;
        else
                fp->f_seqcount = 0;
        return (0);
}
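
/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * read routine can recover the hint that sequential_heuristic() packed
 * into the high bits of ioflag. The helper name is hypothetical; ffs
 * performs a comparable shift to scale its read-ahead.
 */
#if 0
static int
example_seq_hint(int ioflag)
{

        /* The hint was clamped to IO_SEQMAX, so the shift recovers it. */
        return (ioflag >> IO_SEQSHIFT);
}
#endif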

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
        enum uio_rw rw;
        struct vnode *vp;
        caddr_t base;
        int len;
        off_t offset;
        enum uio_seg segflg;
        int ioflg;
        struct ucred *active_cred;
        struct ucred *file_cred;
        int *aresid;
        struct thread *td;
{
        struct uio auio;
        struct iovec aiov;
        struct mount *mp;
        struct ucred *cred;
        int error;

        if ((ioflg & IO_NODELOCKED) == 0) {
                mp = NULL;
                if (rw == UIO_WRITE) {
                        if (vp->v_type != VCHR &&
                            (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
                            != 0)
                                return (error);
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
                } else {
                        /*
                         * XXX This should be LK_SHARED but I don't trust VFS
                         * enough to leave it like that until it has been
                         * reviewed further.
                         */
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
                }

        }
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        aiov.iov_base = base;
        aiov.iov_len = len;
        auio.uio_resid = len;
        auio.uio_offset = offset;
        auio.uio_segflg = segflg;
        auio.uio_rw = rw;
        auio.uio_td = td;
        error = 0;
#ifdef MAC
        if ((ioflg & IO_NOMACCHECK) == 0) {
                if (rw == UIO_READ)
                        error = mac_check_vnode_read(active_cred, file_cred,
                            vp);
                else
                        error = mac_check_vnode_write(active_cred, file_cred,
                            vp);
        }
#endif
        if (error == 0) {
                if (file_cred)
                        cred = file_cred;
                else
                        cred = active_cred;
                if (rw == UIO_READ)
                        error = VOP_READ(vp, &auio, ioflg, cred);
                else
                        error = VOP_WRITE(vp, &auio, ioflg, cred);
        }
        if (aresid)
                *aresid = auio.uio_resid;
        else
                if (auio.uio_resid && error == 0)
                        error = EIO;
        if ((ioflg & IO_NODELOCKED) == 0) {
                if (rw == UIO_WRITE)
                        vn_finished_write(mp);
                VOP_UNLOCK(vp, 0, td);
        }
        return (error);
}
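
/*
 * Illustrative sketch (not part of the original file): reading the first
 * bytes of a file into a kernel buffer with vn_rdwr(). IO_NODELOCKED is
 * not set, so vn_rdwr() takes and drops the vnode lock itself. The name
 * and buffer size are hypothetical.
 */
#if 0
static int
example_read_header(struct vnode *vp, struct thread *td)
{
        char buf[64];
        int resid, error;

        error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
            UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
        /* On success, sizeof(buf) - resid bytes of buf are valid. */
        return (error);
}
#endif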

/*
 * Package up an I/O request on a vnode into a uio and do it. The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr(). We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
        enum uio_rw rw;
        struct vnode *vp;
        caddr_t base;
        int len;
        off_t offset;
        enum uio_seg segflg;
        int ioflg;
        struct ucred *active_cred;
        struct ucred *file_cred;
        int *aresid;
        struct thread *td;
{
        int error = 0;

        do {
                int chunk = (len > MAXBSIZE) ? MAXBSIZE : len;

                if (rw != UIO_READ && vp->v_type == VREG)
                        bwillwrite();
                error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
                    ioflg, active_cred, file_cred, aresid, td);
                len -= chunk;   /* aresid calc already includes length */
                if (error)
                        break;
                offset += chunk;
                base += chunk;
                uio_yield();
        } while (len);
        if (aresid)
                *aresid += len;
        return (error);
}
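
/*
 * Illustrative sketch (not part of the original file): pushing a large
 * kernel buffer out in MAXBSIZE chunks, roughly the way a coredump
 * writer would use this helper. The function name is hypothetical.
 */
#if 0
static int
example_write_big(struct vnode *vp, caddr_t buf, int len, struct thread *td)
{

        return (vn_rdwr_inchunks(UIO_WRITE, vp, buf, len, (off_t)0,
            UIO_SYSSPACE, IO_UNIT, td->td_ucred, NOCRED, NULL, td));
}
#endif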

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
        struct file *fp;
        struct uio *uio;
        struct ucred *active_cred;
        struct thread *td;
        int flags;
{
        struct vnode *vp;
        int error, ioflag;

        mtx_lock(&Giant);
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
        vp = fp->f_vnode;
        ioflag = 0;
        if (fp->f_flag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fp->f_flag & O_DIRECT)
                ioflag |= IO_DIRECT;
        VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
        /*
         * According to McKusick the vn lock is protecting f_offset here.
         * Once this field has its own lock we can acquire this shared.
         */
        if ((flags & FOF_OFFSET) == 0) {
                vn_lock(vp, LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td);
                uio->uio_offset = fp->f_offset;
        } else
                vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);

        ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
        error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
        if (error == 0)
#endif
                error = VOP_READ(vp, uio, ioflag, fp->f_cred);
        if ((flags & FOF_OFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0, td);
        mtx_unlock(&Giant);
        return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
        struct file *fp;
        struct uio *uio;
        struct ucred *active_cred;
        struct thread *td;
        int flags;
{
        struct vnode *vp;
        struct mount *mp;
        int error, ioflag;

        mtx_lock(&Giant);
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
        vp = fp->f_vnode;
        if (vp->v_type == VREG)
                bwillwrite();
        ioflag = IO_UNIT;
        if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
                ioflag |= IO_APPEND;
        if (fp->f_flag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fp->f_flag & O_DIRECT)
                ioflag |= IO_DIRECT;
        if ((fp->f_flag & O_FSYNC) ||
            (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
                ioflag |= IO_SYNC;
        mp = NULL;
        if (vp->v_type != VCHR &&
            (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
                mtx_unlock(&Giant);
                return (error);
        }
        VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        if ((flags & FOF_OFFSET) == 0)
                uio->uio_offset = fp->f_offset;
        ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
        error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
        if (error == 0)
#endif
                error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
        if ((flags & FOF_OFFSET) == 0)
                fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0, td);
        vn_finished_write(mp);
        mtx_unlock(&Giant);
        return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
        struct file *fp;
        struct stat *sb;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp = fp->f_vnode;
        int error;

        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
        VOP_UNLOCK(vp, 0, td);

        return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
        struct vnode *vp;
        register struct stat *sb;
        struct ucred *active_cred;
        struct ucred *file_cred;
        struct thread *td;
{
        struct vattr vattr;
        register struct vattr *vap;
        int error;
        u_short mode;

#ifdef MAC
        error = mac_check_vnode_stat(active_cred, file_cred, vp);
        if (error)
                return (error);
#endif

        vap = &vattr;
        error = VOP_GETATTR(vp, vap, active_cred, td);
        if (error)
                return (error);

        vp->v_cachedfs = vap->va_fsid;
        vp->v_cachedid = vap->va_fileid;

        /*
         * Zero the spare stat fields
         */
        bzero(sb, sizeof *sb);

        /*
         * Copy from vattr table
         */
        if (vap->va_fsid != VNOVAL)
                sb->st_dev = vap->va_fsid;
        else
                sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
        sb->st_ino = vap->va_fileid;
        mode = vap->va_mode;
        switch (vap->va_type) {
        case VREG:
                mode |= S_IFREG;
                break;
        case VDIR:
                mode |= S_IFDIR;
                break;
        case VBLK:
                mode |= S_IFBLK;
                break;
        case VCHR:
                mode |= S_IFCHR;
                break;
        case VLNK:
                mode |= S_IFLNK;
                /* This is a cosmetic change, symlinks do not have a mode. */
                if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
                        sb->st_mode &= ~ACCESSPERMS;    /* 0000 */
                else
                        sb->st_mode |= ACCESSPERMS;     /* 0777 */
                break;
        case VSOCK:
                mode |= S_IFSOCK;
                break;
        case VFIFO:
                mode |= S_IFIFO;
                break;
        default:
                return (EBADF);
        }
        sb->st_mode = mode;
        sb->st_nlink = vap->va_nlink;
        sb->st_uid = vap->va_uid;
        sb->st_gid = vap->va_gid;
        sb->st_rdev = vap->va_rdev;
        if (vap->va_size > OFF_MAX)
                return (EOVERFLOW);
        sb->st_size = vap->va_size;
        sb->st_atimespec = vap->va_atime;
        sb->st_mtimespec = vap->va_mtime;
        sb->st_ctimespec = vap->va_ctime;
        sb->st_birthtimespec = vap->va_birthtime;

        /*
         * According to www.opengroup.org, the meaning of st_blksize is
         * "a filesystem-specific preferred I/O block size for this
         * object. In some filesystem types, this may vary from file
         * to file"
         * Default to PAGE_SIZE after much discussion.
         */

        if (vap->va_type == VREG) {
                sb->st_blksize = vap->va_blocksize;
        } else if (vn_isdisk(vp, NULL)) {
                sb->st_blksize = vp->v_rdev->si_bsize_best;
                if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
                        sb->st_blksize = vp->v_rdev->si_bsize_phys;
                if (sb->st_blksize < BLKDEV_IOSIZE)
                        sb->st_blksize = BLKDEV_IOSIZE;
        } else {
                sb->st_blksize = PAGE_SIZE;
        }

        sb->st_flags = vap->va_flags;
        if (suser(td))
                sb->st_gen = 0;
        else
                sb->st_gen = vap->va_gen;

#if (S_BLKSIZE == 512)
        /* Optimize this case */
        sb->st_blocks = vap->va_bytes >> 9;
#else
        sb->st_blocks = vap->va_bytes / S_BLKSIZE;
#endif
        return (0);
}

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
        struct file *fp;
        u_long com;
        void *data;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp = fp->f_vnode;
        struct vnode *vpold;
        struct vattr vattr;
        int error;

        switch (vp->v_type) {

        case VREG:
        case VDIR:
                if (com == FIONREAD) {
                        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
                        error = VOP_GETATTR(vp, &vattr, active_cred, td);
                        VOP_UNLOCK(vp, 0, td);
                        if (error)
                                return (error);
                        *(int *)data = vattr.va_size - fp->f_offset;
                        return (0);
                }
                if (com == FIONBIO || com == FIOASYNC)  /* XXX */
                        return (0);                     /* XXX */
                /* FALLTHROUGH */

        default:
#if 0
                return (ENOTTY);
#endif
        case VFIFO:
        case VCHR:
        case VBLK:
                if (com == FIODTYPE) {
                        if (vp->v_type != VCHR && vp->v_type != VBLK)
                                return (ENOTTY);
                        *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
                        return (0);
                }
                error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td);
                if (error == ENOIOCTL) {
#ifdef DIAGNOSTIC
                        Debugger("ENOIOCTL leaked through");
#endif
                        error = ENOTTY;
                }
                if (error == 0 && com == TIOCSCTTY) {

                        /* Do nothing if reassigning same control tty */
                        sx_slock(&proctree_lock);
                        if (td->td_proc->p_session->s_ttyvp == vp) {
                                sx_sunlock(&proctree_lock);
                                return (0);
                        }

                        vpold = td->td_proc->p_session->s_ttyvp;
                        VREF(vp);
                        SESS_LOCK(td->td_proc->p_session);
                        td->td_proc->p_session->s_ttyvp = vp;
                        SESS_UNLOCK(td->td_proc->p_session);

                        sx_sunlock(&proctree_lock);

                        /* Get rid of reference to old control tty */
                        if (vpold)
                                vrele(vpold);
                }
                return (error);
        }
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
        struct file *fp;
        int events;
        struct ucred *active_cred;
        struct thread *td;
{
        struct vnode *vp;
#ifdef MAC
        int error;
#endif

        vp = fp->f_vnode;
#ifdef MAC
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
        VOP_UNLOCK(vp, 0, td);
        if (error)
                return (error);
#endif

        return (VOP_POLL(vp, events, fp->f_cred, td));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef DEBUG_LOCKS
vn_lock(vp, flags, td)
#else
debug_vn_lock(vp, flags, td, filename, line)
#endif
        struct vnode *vp;
        int flags;
        struct thread *td;
#ifdef DEBUG_LOCKS
        const char *filename;
        int line;
#endif
{
        int error;

        do {
                if ((flags & LK_INTERLOCK) == 0)
                        VI_LOCK(vp);
                if ((vp->v_iflag & VI_XLOCK) && vp->v_vxproc != curthread) {
                        if ((flags & LK_NOWAIT) != 0) {
                                VI_UNLOCK(vp);
                                return (ENOENT);
                        }
                        vp->v_iflag |= VI_XWANT;
                        msleep(vp, VI_MTX(vp), PINOD, "vn_lock", 0);
                        if ((flags & LK_RETRY) == 0) {
                                VI_UNLOCK(vp);
                                return (ENOENT);
                        }
                }
#ifdef DEBUG_LOCKS
                vp->filename = filename;
                vp->line = line;
#endif
                /*
                 * lockmgr drops interlock before it will return for
                 * any reason. So force the code above to relock it.
                 */
                error = VOP_LOCK(vp, flags | LK_NOPAUSE | LK_INTERLOCK, td);
                flags &= ~LK_INTERLOCK;
        } while (flags & LK_RETRY && error != 0);
        return (error);
}
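
/*
 * Illustrative sketch (not part of the original file): the usual caller
 * pattern around vn_lock() when the vnode may be reclaimed concurrently.
 * Without LK_RETRY, vn_lock() returns ENOENT for a doomed vnode and the
 * caller must back out rather than touch it.
 */
#if 0
static int
example_locked_op(struct vnode *vp, struct thread *td)
{
        int error;

        if ((error = vn_lock(vp, LK_EXCLUSIVE, td)) != 0)
                return (error);         /* e.g. ENOENT: vnode went away */
        /* ... operate on the locked vnode here ... */
        VOP_UNLOCK(vp, 0, td);
        return (0);
}
#endif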

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
        struct file *fp;
        struct thread *td;
{

        fp->f_ops = &badfileops;
        return (vn_close(fp->f_vnode, fp->f_flag, fp->f_cred, td));
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
        struct vnode *vp;
        struct mount **mpp;
        int flags;
{
        struct mount *mp;
        int error;

        /*
         * If a vnode is provided, get and return the mount point to
         * which it will write.
         */
        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
                        *mpp = NULL;
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        if ((mp = *mpp) == NULL)
                return (0);
        /*
         * Check on status of suspension.
         */
        while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
                if (flags & V_NOWAIT)
                        return (EWOULDBLOCK);
                error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
                    "suspfs", 0);
                if (error)
                        return (error);
        }
        if (flags & V_XSLEEP)
                return (0);
        mp->mnt_writeopcount++;
        return (0);
}
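
/*
 * Illustrative sketch (not part of the original file): the canonical
 * bracket callers place around a modifying VOP so that a filesystem
 * suspension can drain in-flight writers; compare vn_rdwr() and
 * vn_write() above. The function name is hypothetical.
 */
#if 0
static int
example_write_bracket(struct vnode *vp, struct thread *td)
{
        struct mount *mp;
        int error;

        if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
                return (error);
        vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        /* ... issue VOP_WRITE() or another modifying VOP here ... */
        VOP_UNLOCK(vp, 0, td);
        vn_finished_write(mp);
        return (0);
}
#endif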

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
        struct vnode *vp;
        struct mount *mp;
        int flags;
{
        int error;

        if (vp != NULL) {
                if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
                        if (error != EOPNOTSUPP)
                                return (error);
                        return (0);
                }
        }
        /*
         * If we are not suspended or have not yet reached suspended
         * mode, then let the operation proceed.
         */
        if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
                return (0);
        if (flags & V_NOWAIT)
                return (EWOULDBLOCK);
        /*
         * Wait for the suspension to finish.
         */
        return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
            "suspfs", 0));
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
        struct mount *mp;
{

        if (mp == NULL)
                return;
        mp->mnt_writeopcount--;
        if (mp->mnt_writeopcount < 0)
                panic("vn_finished_write: neg cnt");
        if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
            mp->mnt_writeopcount <= 0)
                wakeup(&mp->mnt_writeopcount);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
        struct mount *mp;
{
        struct thread *td = curthread;
        int error;

        if (mp->mnt_kern_flag & MNTK_SUSPEND)
                return (0);
        mp->mnt_kern_flag |= MNTK_SUSPEND;
        if (mp->mnt_writeopcount > 0)
                (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
        if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) {
                vfs_write_resume(mp);
                return (error);
        }
        mp->mnt_kern_flag |= MNTK_SUSPENDED;
        return (0);
}

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
        struct mount *mp;
{

        if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
                return;
        mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
        wakeup(&mp->mnt_writeopcount);
        wakeup(&mp->mnt_flag);
}
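
/*
 * Illustrative sketch (not part of the original file): quiescing a
 * filesystem around a consistency-critical operation, in the style of
 * the snapshot code that drives this pair. Details of the critical
 * section are elided.
 */
#if 0
static int
example_quiesce(struct mount *mp)
{
        int error;

        if ((error = vfs_write_suspend(mp)) != 0)
                return (error);
        /* ... no writes are in progress on mp at this point ... */
        vfs_write_resume(mp);
        return (0);
}
#endif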

/*
 * Implement kqueues for files by translating them to a vnode operation.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{

        return (VOP_KQFILTER(fp->f_vnode, kn));
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
        struct uio auio;
        struct iovec iov;
        int error;

        iov.iov_len = *buflen;
        iov.iov_base = buf;

        auio.uio_iov = &iov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_READ;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_offset = 0;
        auio.uio_resid = *buflen;

        if ((ioflg & IO_NODELOCKED) == 0)
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);

        /* authorize attribute retrieval as kernel */
        error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
            td);

        if ((ioflg & IO_NODELOCKED) == 0)
                VOP_UNLOCK(vp, 0, td);

        if (error == 0) {
                *buflen = *buflen - auio.uio_resid;
        }

        return (error);
}
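
/*
 * Illustrative sketch (not part of the original file): fetching a
 * system-namespace extended attribute into a stack buffer. The
 * attribute name "example" is hypothetical.
 */
#if 0
static int
example_get_attr(struct vnode *vp, struct thread *td)
{
        char buf[128];
        int buflen, error;

        buflen = sizeof(buf);
        error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
            "example", &buflen, buf, td);
        /* On success, buflen now holds the number of bytes returned. */
        return (error);
}
#endif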

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
        struct uio auio;
        struct iovec iov;
        struct mount *mp;
        int error;

        iov.iov_len = buflen;
        iov.iov_base = buf;

        auio.uio_iov = &iov;
        auio.uio_iovcnt = 1;
        auio.uio_rw = UIO_WRITE;
        auio.uio_segflg = UIO_SYSSPACE;
        auio.uio_td = td;
        auio.uio_offset = 0;
        auio.uio_resid = buflen;

        if ((ioflg & IO_NODELOCKED) == 0) {
                if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
                        return (error);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        }

        /* authorize attribute setting as kernel */
        error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_finished_write(mp);
                VOP_UNLOCK(vp, 0, td);
        }

        return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
        struct mount *mp;
        int error;

        if ((ioflg & IO_NODELOCKED) == 0) {
                if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
                        return (error);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        }

        /* authorize attribute removal as kernel */
        error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
        if (error == EOPNOTSUPP)
                error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
                    NULL, td);

        if ((ioflg & IO_NODELOCKED) == 0) {
                vn_finished_write(mp);
                VOP_UNLOCK(vp, 0, td);
        }

        return (error);
}