FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c
1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39
40 #include "opt_mac.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/fcntl.h>
45 #include <sys/file.h>
46 #include <sys/kdb.h>
47 #include <sys/stat.h>
48 #include <sys/proc.h>
49 #include <sys/limits.h>
50 #include <sys/lock.h>
51 #include <sys/mac.h>
52 #include <sys/mount.h>
53 #include <sys/mutex.h>
54 #include <sys/namei.h>
55 #include <sys/vnode.h>
56 #include <sys/bio.h>
57 #include <sys/buf.h>
58 #include <sys/filio.h>
59 #include <sys/sx.h>
60 #include <sys/ttycom.h>
61 #include <sys/conf.h>
62 #include <sys/syslog.h>
63 #include <sys/unistd.h>
64
/* Forward declarations of the vnode-backed file operation methods. */
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_ioctl_t vn_ioctl;
static fo_poll_t vn_poll;
static fo_kqfilter_t vn_kqfilter;
static fo_stat_t vn_statfile;
static fo_close_t vn_closefile;

/* File operations vector installed on struct files that refer to vnodes. */
struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
83
84 int
85 vn_open(ndp, flagp, cmode, fdidx)
86 struct nameidata *ndp;
87 int *flagp, cmode, fdidx;
88 {
89 struct thread *td = ndp->ni_cnd.cn_thread;
90
91 return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
92 }
93
/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 *
 * On success the vnode is returned exclusively locked in ndp->ni_vp
 * and *flagp holds the effective open mode; on failure ndp->ni_vp is
 * set to NULL and an errno value is returned.
 */
int
vn_open_cred(ndp, flagp, cmode, cred, fdidx)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct ucred *cred;
	int fdidx;
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int mode, fmode, error;
	int vfslocked;

restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE | AUDITVNODE1;
		/* With O_EXCL the final symlink component is not chased. */
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			/* Target does not exist: create it. */
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				/*
				 * Filesystem is suspended: drop every lock
				 * we hold, wait for the suspension to end,
				 * and redo the entire lookup.
				 */
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_check_vnode_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0) {
#endif
				VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
#ifdef MAC
			}
#endif
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			/* A freshly created file needs no truncation. */
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			/* Target already exists. */
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		/* Plain open of an existing file. */
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE | AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		/* O_NOFOLLOW lookup resolved to a symlink itself. */
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	/* Translate the open flags into a VOP_ACCESS() request mask. */
	mode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		mode |= VWRITE;
	}
	if (fmode & FREAD)
		mode |= VREAD;
	if (fmode & O_APPEND)
		mode |= VAPPEND;
#ifdef MAC
	error = mac_check_vnode_open(cred, vp, mode);
	if (error)
		goto bad;
#endif
	/* A just-created file needs no access check. */
	if ((fmode & O_CREAT) == 0) {
		if (mode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (mode) {
			error = VOP_ACCESS(vp, mode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
	/*
	 * When the open is associated with a file descriptor
	 * (fdidx != -1) Giant is left held for the caller to drop.
	 */
	if (fdidx == -1)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}
244
245 /*
246 * Check for write permissions on the specified vnode.
247 * Prototype text segments cannot be written.
248 */
249 int
250 vn_writechk(vp)
251 register struct vnode *vp;
252 {
253
254 ASSERT_VOP_LOCKED(vp, "vn_writechk");
255 /*
256 * If there's shared text associated with
257 * the vnode, try to free it up once. If
258 * we fail, we can't allow writing.
259 */
260 if (vp->v_vflag & VV_TEXT)
261 return (ETXTBSY);
262
263 return (0);
264 }
265
/*
 * Vnode close call
 *
 * Releases the vnode reference taken at open time, undoing the
 * writecount bump for FWRITE opens; "file_cred" is the credential
 * handed to VOP_CLOSE().
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	/* Undo the v_writecount increment done in vn_open_cred(). */
	if (flags & FWRITE)
		vp->v_writecount--;
	error = VOP_CLOSE(vp, flags, file_cred, td);
	/* vput() drops both the vnode lock and the reference. */
	vput(vp);
	vn_finished_write(mp);
	return (error);
}
290
291 /*
292 * Sequential heuristic - detect sequential operation
293 */
294 static __inline
295 int
296 sequential_heuristic(struct uio *uio, struct file *fp)
297 {
298
299 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
300 uio->uio_offset == fp->f_nextoff) {
301 /*
302 * XXX we assume that the filesystem block size is
303 * the default. Not true, but still gives us a pretty
304 * good indicator of how sequential the read operations
305 * are.
306 */
307 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
308 if (fp->f_seqcount > IO_SEQMAX)
309 fp->f_seqcount = IO_SEQMAX;
310 return(fp->f_seqcount << IO_SEQSHIFT);
311 }
312
313 /*
314 * Not sequential, quick draw-down of seqcount
315 */
316 if (fp->f_seqcount > 1)
317 fp->f_seqcount = 1;
318 else
319 fp->f_seqcount = 0;
320 return(0);
321 }
322
/*
 * Package up an I/O request on a vnode into a uio and do it.
 *
 * "active_cred" is used for the MAC checks; the I/O itself is issued
 * with "file_cred" when that is non-NULL, otherwise with
 * "active_cred".  If "aresid" is NULL, a partial transfer is turned
 * into EIO; otherwise the residual count is returned through it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			/* VCHR writes bypass suspension accounting. */
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		} else {
			/*
			 * XXX This should be LK_SHARED but I don't trust VFS
			 * enough to leave it like that until it has been
			 * reviewed further.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		}

	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	/* Build a single-segment uio describing the transfer. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_check_vnode_read(active_cred, file_cred,
			    vp);
		else
			error = mac_check_vnode_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else
		/* Caller expects the full transfer; flag anything less. */
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE && vp->v_type != VCHR)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0, td);
	}
	return (error);
}
410
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	caddr_t base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		/* Deduct before the error check; see aresid note below. */
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base += chunk;
		/* Let other threads contend for the vnode lock. */
		uio_yield();
	} while (len);
	if (aresid)
		/* Untransferred bytes: untried chunks + last chunk's rest. */
		*aresid = len + iaresid;
	return (error);
}
468
/*
 * File table vnode read routine.
 *
 * Unless FOF_OFFSET is given, the read starts at fp->f_offset, whose
 * use is serialized between implicit-offset readers with the
 * FOFFSET_LOCKED hand-off below.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		/* Sleep until no other reader holds FOFFSET_LOCKED. */
		FILE_LOCK(fp);
		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
		FILE_UNLOCK(fp);
		vn_lock(vp, LK_SHARED | LK_RETRY, td);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY, td);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		/* Publish the new offset, then release FOFFSET_LOCKED. */
		fp->f_offset = uio->uio_offset;
		FILE_LOCK(fp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		FILE_UNLOCK(fp);
	}
	/* Remember where this transfer ended for the heuristic. */
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
531
/*
 * File table vnode write routine.
 *
 * Translates the file flags into IO_* flags, accounts the write
 * against filesystem suspension (except for VCHR), and updates
 * fp->f_offset unless FOF_OFFSET is given.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	/* Throttle before taking locks if the buffer cache is dirty. */
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	/* VCHR writes bypass filesystem suspension accounting. */
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;
	VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0, td);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
588
589 /*
590 * File table vnode stat routine.
591 */
592 static int
593 vn_statfile(fp, sb, active_cred, td)
594 struct file *fp;
595 struct stat *sb;
596 struct ucred *active_cred;
597 struct thread *td;
598 {
599 struct vnode *vp = fp->f_vnode;
600 int vfslocked;
601 int error;
602
603 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
604 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
605 error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
606 VOP_UNLOCK(vp, 0, td);
607 VFS_UNLOCK_GIANT(vfslocked);
608
609 return (error);
610 }
611
612 /*
613 * Stat a vnode; implementation for the stat syscall
614 */
615 int
616 vn_stat(vp, sb, active_cred, file_cred, td)
617 struct vnode *vp;
618 register struct stat *sb;
619 struct ucred *active_cred;
620 struct ucred *file_cred;
621 struct thread *td;
622 {
623 struct vattr vattr;
624 register struct vattr *vap;
625 int error;
626 u_short mode;
627
628 #ifdef MAC
629 error = mac_check_vnode_stat(active_cred, file_cred, vp);
630 if (error)
631 return (error);
632 #endif
633
634 vap = &vattr;
635 error = VOP_GETATTR(vp, vap, active_cred, td);
636 if (error)
637 return (error);
638
639 /*
640 * Zero the spare stat fields
641 */
642 bzero(sb, sizeof *sb);
643
644 /*
645 * Copy from vattr table
646 */
647 if (vap->va_fsid != VNOVAL)
648 sb->st_dev = vap->va_fsid;
649 else
650 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
651 sb->st_ino = vap->va_fileid;
652 mode = vap->va_mode;
653 switch (vap->va_type) {
654 case VREG:
655 mode |= S_IFREG;
656 break;
657 case VDIR:
658 mode |= S_IFDIR;
659 break;
660 case VBLK:
661 mode |= S_IFBLK;
662 break;
663 case VCHR:
664 mode |= S_IFCHR;
665 break;
666 case VLNK:
667 mode |= S_IFLNK;
668 /* This is a cosmetic change, symlinks do not have a mode. */
669 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
670 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
671 else
672 sb->st_mode |= ACCESSPERMS; /* 0777 */
673 break;
674 case VSOCK:
675 mode |= S_IFSOCK;
676 break;
677 case VFIFO:
678 mode |= S_IFIFO;
679 break;
680 default:
681 return (EBADF);
682 };
683 sb->st_mode = mode;
684 sb->st_nlink = vap->va_nlink;
685 sb->st_uid = vap->va_uid;
686 sb->st_gid = vap->va_gid;
687 sb->st_rdev = vap->va_rdev;
688 if (vap->va_size > OFF_MAX)
689 return (EOVERFLOW);
690 sb->st_size = vap->va_size;
691 sb->st_atimespec = vap->va_atime;
692 sb->st_mtimespec = vap->va_mtime;
693 sb->st_ctimespec = vap->va_ctime;
694 sb->st_birthtimespec = vap->va_birthtime;
695
696 /*
697 * According to www.opengroup.org, the meaning of st_blksize is
698 * "a filesystem-specific preferred I/O block size for this
699 * object. In some filesystem types, this may vary from file
700 * to file"
701 * Default to PAGE_SIZE after much discussion.
702 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
703 */
704
705 sb->st_blksize = PAGE_SIZE;
706
707 sb->st_flags = vap->va_flags;
708 if (suser(td))
709 sb->st_gen = 0;
710 else
711 sb->st_gen = vap->va_gen;
712
713 #if (S_BLKSIZE == 512)
714 /* Optimize this case */
715 sb->st_blocks = vap->va_bytes >> 9;
716 #else
717 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
718 #endif
719 return (0);
720 }
721
722 /*
723 * File table vnode ioctl routine.
724 */
725 static int
726 vn_ioctl(fp, com, data, active_cred, td)
727 struct file *fp;
728 u_long com;
729 void *data;
730 struct ucred *active_cred;
731 struct thread *td;
732 {
733 struct vnode *vp = fp->f_vnode;
734 struct vattr vattr;
735 int vfslocked;
736 int error;
737
738 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
739 error = ENOTTY;
740 switch (vp->v_type) {
741 case VREG:
742 case VDIR:
743 if (com == FIONREAD) {
744 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
745 error = VOP_GETATTR(vp, &vattr, active_cred, td);
746 VOP_UNLOCK(vp, 0, td);
747 if (!error)
748 *(int *)data = vattr.va_size - fp->f_offset;
749 }
750 if (com == FIONBIO || com == FIOASYNC) /* XXX */
751 error = 0;
752 else
753 error = VOP_IOCTL(vp, com, data, fp->f_flag,
754 active_cred, td);
755 break;
756
757 default:
758 break;
759 }
760 VFS_UNLOCK_GIANT(vfslocked);
761 return (error);
762 }
763
/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int error;

	mtx_lock(&Giant);

	vp = fp->f_vnode;
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0, td);
	if (!error)
#endif
	/* With MAC the poll only happens if the MAC check succeeded. */
		error = VOP_POLL(vp, events, fp->f_cred, td);
	mtx_unlock(&Giant);
	return (error);
}
791
/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 *
 * Without LK_RETRY, a doomed (forcibly unmounted) vnode yields
 * ENOENT; with (flags & LK_TYPE_MASK) == 0 the call only polls
 * validity and takes no lock.
 */
int
vn_lock(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	do {
		if ((flags & LK_INTERLOCK) == 0)
			VI_LOCK(vp);
		if ((flags & LK_NOWAIT || (flags & LK_TYPE_MASK) == 0) &&
		    vp->v_iflag & VI_DOOMED) {
			VI_UNLOCK(vp);
			return (ENOENT);
		}
		/*
		 * Just polling to check validity.
		 */
		if ((flags & LK_TYPE_MASK) == 0) {
			VI_UNLOCK(vp);
			return (0);
		}
		/*
		 * lockmgr drops interlock before it will return for
		 * any reason. So force the code above to relock it.
		 */
		error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
		flags &= ~LK_INTERLOCK;
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags %d\n", flags));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0, td);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
840
841 /*
842 * File table vnode close routine.
843 */
844 static int
845 vn_closefile(fp, td)
846 struct file *fp;
847 struct thread *td;
848 {
849 struct vnode *vp;
850 struct flock lf;
851 int vfslocked;
852 int error;
853
854 vp = fp->f_vnode;
855
856 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
857 if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
858 lf.l_whence = SEEK_SET;
859 lf.l_start = 0;
860 lf.l_len = 0;
861 lf.l_type = F_UNLCK;
862 (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
863 }
864
865 fp->f_ops = &badfileops;
866
867 error = vn_close(vp, fp->f_flag, fp->f_cred, td);
868 VFS_UNLOCK_GIANT(vfslocked);
869 return (error);
870 }
871
/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point that
	 * to which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			/* Filesystem does not support suspension. */
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	MNT_ILOCK(mp);
	/*
	 * VOP_GETWRITEMOUNT() returned a referenced mount; when the
	 * caller supplied the mount directly, take our own reference.
	 */
	if (vp == NULL)
		MNT_REF(mp);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT) {
			error = EWOULDBLOCK;
			goto unlock;
		}
		error = msleep(&mp->mnt_flag, MNT_MTX(mp),
		    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
		if (error)
			goto unlock;
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	/*
	 * Drop the mount reference taken above; on the success path
	 * the incremented mnt_writeopcount is what the suspension
	 * code synchronizes against.
	 */
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
926
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;
	struct mount *mp;
	int flags;
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			/* Filesystem does not support suspension. */
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL)
		return (0);
	MNT_ILOCK(mp);
	/* VOP_GETWRITEMOUNT() already referenced mp when vp != NULL. */
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.  PDROP releases the mount
	 * interlock during the sleep; vfs_rel() then drops the mount
	 * reference taken above.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	return (error);
}
976
/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			/* Filesystem does not support suspension. */
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);
	MNT_ILOCK(mp);
	/* VOP_GETWRITEMOUNT() already referenced mp when vp != NULL. */
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		/* Not suspended: account the secondary write and go. */
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.  PDROP releases the
	 * interlock during the sleep; afterwards redo the whole check,
	 * since the suspension state may have changed while we slept.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}
1033
1034 /*
1035 * Filesystem write operation has completed. If we are suspending and this
1036 * operation is the last one, notify the suspender that the suspension is
1037 * now in effect.
1038 */
1039 void
1040 vn_finished_write(mp)
1041 struct mount *mp;
1042 {
1043 if (mp == NULL)
1044 return;
1045 MNT_ILOCK(mp);
1046 mp->mnt_writeopcount--;
1047 if (mp->mnt_writeopcount < 0)
1048 panic("vn_finished_write: neg cnt");
1049 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1050 mp->mnt_writeopcount <= 0)
1051 wakeup(&mp->mnt_writeopcount);
1052 MNT_IUNLOCK(mp);
1053 }
1054
1055
1056 /*
1057 * Filesystem secondary write operation has completed. If we are
1058 * suspending and this operation is the last one, notify the suspender
1059 * that the suspension is now in effect.
1060 */
1061 void
1062 vn_finished_secondary_write(mp)
1063 struct mount *mp;
1064 {
1065 if (mp == NULL)
1066 return;
1067 MNT_ILOCK(mp);
1068 mp->mnt_secondary_writes--;
1069 if (mp->mnt_secondary_writes < 0)
1070 panic("vn_finished_secondary_write: neg cnt");
1071 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1072 mp->mnt_secondary_writes <= 0)
1073 wakeup(&mp->mnt_secondary_writes);
1074 MNT_IUNLOCK(mp);
1075 }
1076
1077
1078
/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	struct thread *td = curthread;
	int error;

	MNT_ILOCK(mp);
	/* Someone else already requested suspension. */
	if (mp->mnt_kern_flag & MNTK_SUSPEND) {
		MNT_IUNLOCK(mp);
		return (0);
	}
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	/*
	 * Wait for in-progress write operations to drain;
	 * vn_finished_write() wakes us when mnt_writeopcount reaches
	 * zero.  PDROP releases the interlock when the sleep ends.
	 */
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	/* If the final sync fails, back out of the suspension. */
	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
		vfs_write_resume(mp);
	return (error);
}
1104
1105 /*
1106 * Request a filesystem to resume write operations.
1107 */
1108 void
1109 vfs_write_resume(mp)
1110 struct mount *mp;
1111 {
1112
1113 MNT_ILOCK(mp);
1114 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1115 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1116 MNTK_SUSPENDED);
1117 wakeup(&mp->mnt_writeopcount);
1118 wakeup(&mp->mnt_flag);
1119 }
1120 MNT_IUNLOCK(mp);
1121 }
1122
1123 /*
1124 * Implement kqueues for files by translating it to vnode operation.
1125 */
1126 static int
1127 vn_kqfilter(struct file *fp, struct knote *kn)
1128 {
1129 int error;
1130
1131 mtx_lock(&Giant);
1132 error = VOP_KQFILTER(fp->f_vnode, kn);
1133 mtx_unlock(&Giant);
1134
1135 return error;
1136 }
1137
1138 /*
1139 * Simplified in-kernel wrapper calls for extended attribute access.
1140 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1141 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1142 */
1143 int
1144 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1145 const char *attrname, int *buflen, char *buf, struct thread *td)
1146 {
1147 struct uio auio;
1148 struct iovec iov;
1149 int error;
1150
1151 iov.iov_len = *buflen;
1152 iov.iov_base = buf;
1153
1154 auio.uio_iov = &iov;
1155 auio.uio_iovcnt = 1;
1156 auio.uio_rw = UIO_READ;
1157 auio.uio_segflg = UIO_SYSSPACE;
1158 auio.uio_td = td;
1159 auio.uio_offset = 0;
1160 auio.uio_resid = *buflen;
1161
1162 if ((ioflg & IO_NODELOCKED) == 0)
1163 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1164
1165 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1166
1167 /* authorize attribute retrieval as kernel */
1168 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1169 td);
1170
1171 if ((ioflg & IO_NODELOCKED) == 0)
1172 VOP_UNLOCK(vp, 0, td);
1173
1174 if (error == 0) {
1175 *buflen = *buflen - auio.uio_resid;
1176 }
1177
1178 return (error);
1179 }
1180
1181 /*
1182 * XXX failure mode if partially written?
1183 */
1184 int
1185 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1186 const char *attrname, int buflen, char *buf, struct thread *td)
1187 {
1188 struct uio auio;
1189 struct iovec iov;
1190 struct mount *mp;
1191 int error;
1192
1193 iov.iov_len = buflen;
1194 iov.iov_base = buf;
1195
1196 auio.uio_iov = &iov;
1197 auio.uio_iovcnt = 1;
1198 auio.uio_rw = UIO_WRITE;
1199 auio.uio_segflg = UIO_SYSSPACE;
1200 auio.uio_td = td;
1201 auio.uio_offset = 0;
1202 auio.uio_resid = buflen;
1203
1204 if ((ioflg & IO_NODELOCKED) == 0) {
1205 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1206 return (error);
1207 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1208 }
1209
1210 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1211
1212 /* authorize attribute setting as kernel */
1213 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1214
1215 if ((ioflg & IO_NODELOCKED) == 0) {
1216 vn_finished_write(mp);
1217 VOP_UNLOCK(vp, 0, td);
1218 }
1219
1220 return (error);
1221 }
1222
1223 int
1224 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1225 const char *attrname, struct thread *td)
1226 {
1227 struct mount *mp;
1228 int error;
1229
1230 if ((ioflg & IO_NODELOCKED) == 0) {
1231 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1232 return (error);
1233 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1234 }
1235
1236 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1237
1238 /* authorize attribute removal as kernel */
1239 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1240 if (error == EOPNOTSUPP)
1241 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1242 NULL, td);
1243
1244 if ((ioflg & IO_NODELOCKED) == 0) {
1245 vn_finished_write(mp);
1246 VOP_UNLOCK(vp, 0, td);
1247 }
1248
1249 return (error);
1250 }
Cache object: c3127021bb336a3beedee7ad0d243011
|