FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c
/*-
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/8.0/sys/kern/vfs_vnops.c 196974 2009-09-08 14:43:42Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>

#include <security/mac/mac_framework.h>
static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
int
vn_open(ndp, flagp, cmode, fp)
	struct nameidata *ndp;
	int *flagp, cmode;
	struct file *fp;
{
	struct thread *td = ndp->ni_cnd.cn_thread;

	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}
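
/*
 * Example (editor's sketch, not part of the original source): a typical
 * in-kernel open/close bracket.  The thread pointer "td" and the path are
 * placeholders, and error handling is abbreviated.  On success the vnode
 * is returned locked with one reference held; per the note below, the
 * caller frees the path buffer itself.
 *
 *	struct nameidata nd;
 *	int flags, error;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/etc/motd", td);
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error == 0) {
 *		NDFREE(&nd, NDF_ONLY_PNBUF);
 *		VOP_UNLOCK(nd.ni_vp, 0);
 *		... use nd.ni_vp ...
 *		vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
 *	}
 */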

/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	accmode_t accmode;
	int vfslocked, mpsafe;

	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = NDHASGIANT(ndp);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE;
		if (!(fmode & FWRITE))
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = NDHASGIANT(ndp);
		vp = ndp->ni_vp;
	}
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if (fmode & O_APPEND)
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		goto bad;
#endif
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		goto bad;

	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	if (!mpsafe)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}

/*
 * Check for write permissions on the specified vnode.
 * Prototype text segments cannot be written.
 */
int
vn_writechk(vp)
	register struct vnode *vp;
{

	ASSERT_VOP_LOCKED(vp, "vn_writechk");
	/*
	 * If there's shared text associated with the vnode,
	 * we can't allow writing.
	 */
	if (vp->v_vflag & VV_TEXT)
		return (ETXTBSY);

	return (0);
}

/*
 * Vnode close call.  The vnode is expected to be unlocked on entry;
 * the caller's reference is consumed (via vput()) before returning.
 */
int
vn_close(vp, flags, file_cred, td)
	register struct vnode *vp;
	int flags;
	struct ucred *file_cred;
	struct thread *td;
{
	struct mount *mp;
	int error, lock_flags;

	if (!(flags & FWRITE) && vp->v_mount != NULL &&
	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
		lock_flags = LK_SHARED;
	else
		lock_flags = LK_EXCLUSIVE;

	VFS_ASSERT_GIANT(vp->v_mount);

	vn_start_write(vp, &mp, V_WAIT);
	vn_lock(vp, lock_flags | LK_RETRY);
	if (flags & FWRITE) {
		VNASSERT(vp->v_writecount > 0, vp,
		    ("vn_close: negative writecount"));
		vp->v_writecount--;
	}
	error = VOP_CLOSE(vp, flags, file_cred, td);
	vput(vp);
	vn_finished_write(mp);
	return (error);
}

/*
 * Heuristic to detect sequential operation.
 */
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{

	/*
	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
	 * that the first I/O is normally considered to be slightly
	 * sequential.  Seeking to offset 0 doesn't change sequentiality
	 * unless previous seeks have reduced f_seqcount to 0, in which
	 * case offset 0 is not special.
	 */
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		/*
		 * f_seqcount is in units of fixed-size blocks so that it
		 * depends mainly on the amount of sequential I/O and not
		 * much on the number of sequential I/O's.  The fixed size
		 * of 16384 is hard-coded here since it is (not quite) just
		 * a magic size that works well here.  This size is more
		 * closely related to the best I/O size for real disks than
		 * to any block size used by software.
		 */
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}

	/* Not sequential.  Quickly draw down sequentiality. */
	if (fp->f_seqcount > 1)
		fp->f_seqcount = 1;
	else
		fp->f_seqcount = 0;
	return (0);
}
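
/*
 * Worked example (editor's note, using the constants above): a sequential
 * 64 kB read adds howmany(65536, 16384) = 4 to f_seqcount, which is
 * clamped at IO_SEQMAX.  The returned value, f_seqcount << IO_SEQSHIFT,
 * is OR'ed into ioflag by vn_read()/vn_write() below so that a
 * filesystem's cluster read code can scale read-ahead in proportion to
 * how sequential the access pattern has been.
 */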

/*
 * Package up an I/O request on a vnode into a uio and do it.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error, lock_flags;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			if (MNT_SHARED_WRITES(mp) ||
			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
				lock_flags = LK_SHARED;
			} else {
				lock_flags = LK_EXCLUSIVE;
			}
			vn_lock(vp, lock_flags | LK_RETRY);
		} else
			vn_lock(vp, LK_SHARED | LK_RETRY);
	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	if (aresid)
		*aresid = auio.uio_resid;
	else if (auio.uio_resid && error == 0)
		error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE && vp->v_type != VCHR)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}
	return (error);
}
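
/*
 * Example (editor's sketch; "vp" and "td" are assumed to be a referenced,
 * unlocked vnode and the current thread): read the first 128 bytes of a
 * file into a kernel buffer, letting vn_rdwr() handle the locking and
 * uio packaging shown above.
 *
 *	char buf[128];
 *	int resid, error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
 *	    UIO_SYSSPACE, 0, td->td_ucred, NOCRED, &resid, td);
 *	if (error == 0)
 *		... sizeof(buf) - resid bytes are valid in buf ...
 */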

/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * call bwillwrite() before each chunk of a vn_rdwr() write.  We also
 * call uio_yield() to give other processes a chance to lock the vnode
 * (either other processes core'ing the same binary, or unrelated
 * processes scanning the directory).
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
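
/*
 * Worked example (editor's note): with MAXBSIZE = 65536 and a starting
 * offset of 70000, the first chunk is 65536 - (70000 % 65536) = 61072
 * bytes, which carries the transfer to offset 131072; every subsequent
 * chunk then starts on a MAXBSIZE boundary and is MAXBSIZE long except
 * possibly the last.
 */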

/*
 * File table vnode read routine.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	struct mtx *mtxp;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	mtxp = NULL;
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
			    "vnread offlock", 0);
		}
		fp->f_vnread_flags |= FOFFSET_LOCKED;
		mtx_unlock(mtxp);
		vn_lock(vp, LK_SHARED | LK_RETRY);
		uio->uio_offset = fp->f_offset;
	} else
		vn_lock(vp, LK_SHARED | LK_RETRY);

	ioflag |= sequential_heuristic(uio, fp);

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		fp->f_offset = uio->uio_offset;
		mtx_lock(mtxp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		mtx_unlock(mtxp);
	}
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode write routine.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag, lock_flags;
	int vfslocked;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;

	if ((MNT_SHARED_WRITES(mp) ||
	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
	    (flags & FOF_OFFSET) != 0) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}

	vn_lock(vp, lock_flags | LK_RETRY);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table truncate routine.
 */
static int
vn_truncate(fp, length, active_cred, td)
	struct file *fp;
	off_t length;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vattr vattr;
	struct mount *mp;
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error) {
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (vp->v_type == VDIR) {
		error = EISDIR;
		goto out;
	}
#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error)
		goto out;
#endif
	error = vn_writechk(vp);
	if (error == 0) {
		VATTR_NULL(&vattr);
		vattr.va_size = length;
		error = VOP_SETATTR(vp, &vattr, fp->f_cred);
	}
out:
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode stat routine.
 */
static int
vn_statfile(fp, sb, active_cred, td)
	struct file *fp;
	struct stat *sb;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
	VOP_UNLOCK(vp, 0);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Stat a vnode; implementation for the stat syscall.
 */
int
vn_stat(vp, sb, active_cred, file_cred, td)
	struct vnode *vp;
	register struct stat *sb;
	struct ucred *active_cred;
	struct ucred *file_cred;
	struct thread *td;
{
	struct vattr vattr;
	register struct vattr *vap;
	int error;
	u_short mode;

#ifdef MAC
	error = mac_vnode_check_stat(active_cred, file_cred, vp);
	if (error)
		return (error);
#endif

	vap = &vattr;

	/*
	 * Initialize defaults for new and unusual fields, so that file
	 * systems which don't support these fields don't need to know
	 * about them.
	 */
	vap->va_birthtime.tv_sec = -1;
	vap->va_birthtime.tv_nsec = 0;
	vap->va_fsid = VNOVAL;
	vap->va_rdev = NODEV;

	error = VOP_GETATTR(vp, vap, active_cred);
	if (error)
		return (error);

	/*
	 * Zero the spare stat fields.
	 */
	bzero(sb, sizeof *sb);

	/*
	 * Copy from vattr table.
	 */
	if (vap->va_fsid != VNOVAL)
		sb->st_dev = vap->va_fsid;
	else
		sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
	sb->st_ino = vap->va_fileid;
	mode = vap->va_mode;
	switch (vap->va_type) {
	case VREG:
		mode |= S_IFREG;
		break;
	case VDIR:
		mode |= S_IFDIR;
		break;
	case VBLK:
		mode |= S_IFBLK;
		break;
	case VCHR:
		mode |= S_IFCHR;
		break;
	case VLNK:
		mode |= S_IFLNK;
		break;
	case VSOCK:
		mode |= S_IFSOCK;
		break;
	case VFIFO:
		mode |= S_IFIFO;
		break;
	default:
		return (EBADF);
	}
	sb->st_mode = mode;
	sb->st_nlink = vap->va_nlink;
	sb->st_uid = vap->va_uid;
	sb->st_gid = vap->va_gid;
	sb->st_rdev = vap->va_rdev;
	if (vap->va_size > OFF_MAX)
		return (EOVERFLOW);
	sb->st_size = vap->va_size;
	sb->st_atimespec = vap->va_atime;
	sb->st_mtimespec = vap->va_mtime;
	sb->st_ctimespec = vap->va_ctime;
	sb->st_birthtimespec = vap->va_birthtime;

	/*
	 * According to www.opengroup.org, the meaning of st_blksize is
	 * "a filesystem-specific preferred I/O block size for this
	 * object.  In some filesystem types, this may vary from file
	 * to file".
	 * Default to PAGE_SIZE after much discussion.
	 * XXX: min(PAGE_SIZE, vp->v_bufobj.bo_bsize) may be more correct.
	 */

	sb->st_blksize = PAGE_SIZE;

	sb->st_flags = vap->va_flags;
	if (priv_check(td, PRIV_VFS_GENERATION))
		sb->st_gen = 0;
	else
		sb->st_gen = vap->va_gen;

	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
	return (0);
}
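
/*
 * Worked example (editor's note): st_blocks is reported in S_BLKSIZE
 * (512-byte) units, so a file occupying 4096 bytes of storage shows
 * st_blocks = 4096 / 512 = 8, independent of the st_blksize hint above.
 */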

/*
 * File table vnode ioctl routine.
 */
static int
vn_ioctl(fp, com, data, active_cred, td)
	struct file *fp;
	u_long com;
	void *data;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp = fp->f_vnode;
	struct vattr vattr;
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = ENOTTY;
	switch (vp->v_type) {
	case VREG:
	case VDIR:
		if (com == FIONREAD) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_GETATTR(vp, &vattr, active_cred);
			VOP_UNLOCK(vp, 0);
			if (!error)
				*(int *)data = vattr.va_size - fp->f_offset;
		} else if (com == FIONBIO || com == FIOASYNC)	/* XXX */
			error = 0;
		else
			error = VOP_IOCTL(vp, com, data, fp->f_flag,
			    active_cred, td);
		break;

	default:
		break;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * File table vnode poll routine.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif
		error = VOP_POLL(vp, events, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
		    flags, error));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
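
/*
 * Usage note (editor's sketch): callers normally go through the vn_lock()
 * macro, which supplies the file and line arguments for lock debugging.
 * With LK_RETRY the call cannot fail and may hand back a doomed vnode;
 * without it, a doomed vnode produces ENOENT.  The two common idioms:
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	cannot fail
 *
 *	if (vn_lock(vp, LK_SHARED) != 0)	ENOENT for a doomed vnode
 *		return (ENOENT);
 */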

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct vnode *vp;
	struct flock lf;
	int vfslocked;
	int error;

	vp = fp->f_vnode;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
		lf.l_start = 0;
		lf.l_len = 0;
		lf.l_type = F_UNLCK;
		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
	}

	fp->f_ops = &badfileops;

	error = vn_close(vp, fp->f_flag, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

/*
 * Preparing to start a filesystem write operation.  If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	/*
	 * Check on status of suspension.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
			if (error)
				goto unlock;
		}
	}
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	if (error != 0 || (flags & V_XSLEEP) != 0)
		MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
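
/*
 * Example (editor's sketch of the usual bracket, mirroring vn_truncate()
 * above; "vp" is an unlocked, referenced vnode):
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	... modify the file ...
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */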

/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}

/*
 * Filesystem write operation has completed.  If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}

/*
 * Filesystem secondary write operation has completed.  If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	MNT_REL(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}

/*
 * Request a filesystem to suspend write operations.
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount,
		    MNT_MTX(mp), (PUSER - 1) | PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
		vfs_write_resume(mp);
	return (error);
}
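
/*
 * Usage note (editor's sketch): snapshot-style code brackets its work
 * with the pair below; vfs_write_suspend() returns only after all
 * in-progress primary writes have drained and VFS_SYNC() has flushed
 * the filesystem.
 *
 *	if ((error = vfs_write_suspend(mp)) != 0)
 *		return (error);
 *	... operate on the quiesced filesystem ...
 *	vfs_write_resume(mp);
 */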

/*
 * Request a filesystem to resume write operations.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
		    MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		MNT_IUNLOCK(mp);
		VFS_SUSP_CLEAN(mp);
	} else
		MNT_IUNLOCK(mp);
}

/*
 * Implement kqueues for files by translating them into the
 * corresponding vnode operation.
 */
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
	int vfslocked;
	int error;

	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
	error = VOP_KQFILTER(fp->f_vnode, kn);
	VFS_UNLOCK_GIANT(vfslocked);

	return (error);
}

/*
 * Simplified in-kernel wrapper calls for extended attribute access.
 * Both calls pass in a NULL credential, authorizing as "kernel" access.
 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 */
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int *buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	int error;

	iov.iov_len = *buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = *buflen;

	if ((ioflg & IO_NODELOCKED) == 0)
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute retrieval as kernel */
	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
	    td);

	if ((ioflg & IO_NODELOCKED) == 0)
		VOP_UNLOCK(vp, 0);

	if (error == 0) {
		*buflen = *buflen - auio.uio_resid;
	}

	return (error);
}
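
/*
 * Example (editor's sketch; the attribute name is a placeholder, and
 * "vp"/"td" are assumed as above): fetch a system-namespace attribute
 * into a stack buffer.  On success *buflen is trimmed to the number of
 * bytes actually returned.
 *
 *	char buf[64];
 *	int buflen, error;
 *
 *	buflen = sizeof(buf);
 *	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
 *	    "example.attr", &buflen, buf, td);
 */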

/*
 * XXX failure mode if partially written?
 */
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, int buflen, char *buf, struct thread *td)
{
	struct uio auio;
	struct iovec iov;
	struct mount *mp;
	int error;

	iov.iov_len = buflen;
	iov.iov_base = buf;

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_rw = UIO_WRITE;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute setting as kernel */
	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
    const char *attrname, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((ioflg & IO_NODELOCKED) == 0) {
		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
			return (error);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	}

	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");

	/* authorize attribute removal as kernel */
	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
	if (error == EOPNOTSUPP)
		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
		    NULL, td);

	if ((ioflg & IO_NODELOCKED) == 0) {
		vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}

	return (error);
}

/*
 * Look up a vnode by inode number on the same mount as the locked
 * vnode "vp", temporarily dropping vp's lock (and busying the mount)
 * so that VFS_VGET() can be called safely.  vp is relocked with its
 * original lock type before return; if it was doomed in the meantime,
 * ENOENT is returned and any vnode obtained is released.
 */
int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
	struct mount *mp;
	int ltype, error;

	mp = vp->v_mount;
	ltype = VOP_ISLOCKED(vp);
	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
	    ("vn_vget_ino: vp not locked"));
	error = vfs_busy(mp, MBF_NOWAIT);
	if (error != 0) {
		vfs_ref(mp);
		VOP_UNLOCK(vp, 0);
		error = vfs_busy(mp, 0);
		vn_lock(vp, ltype | LK_RETRY);
		vfs_rel(mp);
		if (error != 0)
			return (ENOENT);
		if (vp->v_iflag & VI_DOOMED) {
			vfs_unbusy(mp);
			return (ENOENT);
		}
	}
	VOP_UNLOCK(vp, 0);
	error = VFS_VGET(mp, ino, lkflags, rvp);
	vfs_unbusy(mp);
	vn_lock(vp, ltype | LK_RETRY);
	if (vp->v_iflag & VI_DOOMED) {
		if (error == 0)
			vput(*rvp);
		error = ENOENT;
	}
	return (error);
}