FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c
1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
35 */
36
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD: releng/8.4/sys/kern/vfs_vnops.c 239788 2012-08-28 18:45:20Z jhb $");
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/fcntl.h>
43 #include <sys/file.h>
44 #include <sys/kdb.h>
45 #include <sys/stat.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/limits.h>
49 #include <sys/lock.h>
50 #include <sys/mount.h>
51 #include <sys/mutex.h>
52 #include <sys/namei.h>
53 #include <sys/vnode.h>
54 #include <sys/bio.h>
55 #include <sys/buf.h>
56 #include <sys/filio.h>
57 #include <sys/resourcevar.h>
58 #include <sys/sx.h>
59 #include <sys/ttycom.h>
60 #include <sys/conf.h>
61 #include <sys/syslog.h>
62 #include <sys/unistd.h>
63
64 #include <security/mac/mac_framework.h>
65
66 #include <vm/vm.h>
67 #include <vm/vm_object.h>
68
/*
 * Vnode-backed implementations of the struct fileops methods.  All are
 * file-scope; they are exported only through the "vnops" table below.
 */
static fo_rdwr_t	vn_read;
static fo_rdwr_t	vn_write;
static fo_truncate_t	vn_truncate;
static fo_ioctl_t	vn_ioctl;
static fo_poll_t	vn_poll;
static fo_kqfilter_t	vn_kqfilter;
static fo_stat_t	vn_statfile;
static fo_close_t	vn_closefile;

/*
 * File operations table installed on every struct file that refers to
 * a vnode.
 * NOTE(review): DFLAG_PASSABLE/DFLAG_SEEKABLE presumably mark these
 * descriptors as passable over local-domain sockets and as supporting
 * lseek(2) -- confirm against <sys/file.h>.
 */
struct fileops vnops = {
	.fo_read = vn_read,
	.fo_write = vn_write,
	.fo_truncate = vn_truncate,
	.fo_ioctl = vn_ioctl,
	.fo_poll = vn_poll,
	.fo_kqfilter = vn_kqfilter,
	.fo_stat = vn_statfile,
	.fo_close = vn_closefile,
	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
89
90 int
91 vn_open(ndp, flagp, cmode, fp)
92 struct nameidata *ndp;
93 int *flagp, cmode;
94 struct file *fp;
95 {
96 struct thread *td = ndp->ni_cnd.cn_thread;
97
98 return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
99 }
100
/*
 * Common code for vnode open operations.
 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
 *
 * ndp:   nameidata prepared by the caller; on success ndp->ni_vp is the
 *        opened vnode, returned locked (see the ASSERT_VOP_LOCKED below).
 * flagp: in/out open flags (FREAD/FWRITE/O_*); possibly-modified flags
 *        are written back on return.
 * cmode: file mode used when O_CREAT actually creates the file.
 * vn_open_flags: VN_OPEN_* modifiers (e.g. VN_OPEN_NOAUDIT).
 * cred:  credential used for the access and MAC checks.
 * fp:    file being opened; passed through to VOP_OPEN().
 *
 * Note that this does NOT free nameidata for the successful case,
 * due to the NDINIT being done elsewhere.
 */
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
    struct ucred *cred, struct file *fp)
{
	struct vnode *vp;
	struct mount *mp;
	struct thread *td = ndp->ni_cnd.cn_thread;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode, error;
	accmode_t accmode;
	int vfslocked, mpsafe;

	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
	vfslocked = 0;
	fmode = *flagp;
	if (fmode & O_CREAT) {
		ndp->ni_cnd.cn_nameiop = CREATE;
		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
		    MPSAFE;
		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
			ndp->ni_cnd.cn_flags |= FOLLOW;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		bwillwrite();
		if ((error = namei(ndp)) != 0)
			return (error);
		vfslocked = NDHASGIANT(ndp);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		if (ndp->ni_vp == NULL) {
			/* Target does not exist: create it. */
			VATTR_NULL(vap);
			vap->va_type = VREG;
			vap->va_mode = cmode;
			if (fmode & O_EXCL)
				vap->va_vaflags |= VA_EXCLUSIVE;
			/*
			 * If the filesystem is being suspended, drop
			 * everything, wait for the suspension to end, and
			 * redo the lookup from scratch.
			 */
			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
				NDFREE(ndp, NDF_ONLY_PNBUF);
				vput(ndp->ni_dvp);
				VFS_UNLOCK_GIANT(vfslocked);
				if ((error = vn_start_write(NULL, &mp,
				    V_XSLEEP | PCATCH)) != 0)
					return (error);
				goto restart;
			}
#ifdef MAC
			error = mac_vnode_check_create(cred, ndp->ni_dvp,
			    &ndp->ni_cnd, vap);
			if (error == 0)
#endif
				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
				    &ndp->ni_cnd, vap);
			vput(ndp->ni_dvp);
			vn_finished_write(mp);
			if (error) {
				VFS_UNLOCK_GIANT(vfslocked);
				NDFREE(ndp, NDF_ONLY_PNBUF);
				return (error);
			}
			/* A freshly created file needs no truncation. */
			fmode &= ~O_TRUNC;
			vp = ndp->ni_vp;
		} else {
			/* Target already exists. */
			if (ndp->ni_dvp == ndp->ni_vp)
				vrele(ndp->ni_dvp);
			else
				vput(ndp->ni_dvp);
			ndp->ni_dvp = NULL;
			vp = ndp->ni_vp;
			if (fmode & O_EXCL) {
				error = EEXIST;
				goto bad;
			}
			fmode &= ~O_CREAT;
		}
	} else {
		/* Plain open: look the name up, no creation. */
		ndp->ni_cnd.cn_nameiop = LOOKUP;
		ndp->ni_cnd.cn_flags = ISOPEN |
		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
		    LOCKLEAF | MPSAFE;
		if (!(fmode & FWRITE))
			ndp->ni_cnd.cn_flags |= LOCKSHARED;
		if (!(vn_open_flags & VN_OPEN_NOAUDIT))
			ndp->ni_cnd.cn_flags |= AUDITVNODE1;
		if ((error = namei(ndp)) != 0)
			return (error);
		if (!mpsafe)
			ndp->ni_cnd.cn_flags &= ~MPSAFE;
		vfslocked = NDHASGIANT(ndp);
		vp = ndp->ni_vp;
	}
	/* Symlinks themselves and sockets cannot be opened this way. */
	if (vp->v_type == VLNK) {
		error = EMLINK;
		goto bad;
	}
	if (vp->v_type == VSOCK) {
		error = EOPNOTSUPP;
		goto bad;
	}
	/* Translate the open mode into the access rights to check. */
	accmode = 0;
	if (fmode & (FWRITE | O_TRUNC)) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto bad;
		}
		accmode |= VWRITE;
	}
	if (fmode & FREAD)
		accmode |= VREAD;
	if (fmode & FEXEC)
		accmode |= VEXEC;
	if ((fmode & O_APPEND) && (fmode & FWRITE))
		accmode |= VAPPEND;
#ifdef MAC
	error = mac_vnode_check_open(cred, vp, accmode);
	if (error)
		goto bad;
#endif
	/*
	 * A file we just created needs no access check: the create
	 * itself was the permission check.
	 */
	if ((fmode & O_CREAT) == 0) {
		if (accmode & VWRITE) {
			error = vn_writechk(vp);
			if (error)
				goto bad;
		}
		if (accmode) {
			error = VOP_ACCESS(vp, accmode, cred, td);
			if (error)
				goto bad;
		}
	}
	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
		goto bad;

	/* Account for writers so VV_TEXT/ETXTBSY checks work. */
	if (fmode & FWRITE)
		vp->v_writecount++;
	*flagp = fmode;
	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
	if (!mpsafe)
		VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	/* Common error exit: release the path buffer and the vnode. */
	NDFREE(ndp, NDF_ONLY_PNBUF);
	vput(vp);
	VFS_UNLOCK_GIANT(vfslocked);
	*flagp = fmode;
	ndp->ni_vp = NULL;
	return (error);
}
256
257 /*
258 * Check for write permissions on the specified vnode.
259 * Prototype text segments cannot be written.
260 */
261 int
262 vn_writechk(vp)
263 register struct vnode *vp;
264 {
265
266 ASSERT_VOP_LOCKED(vp, "vn_writechk");
267 /*
268 * If there's shared text associated with
269 * the vnode, try to free it up once. If
270 * we fail, we can't allow writing.
271 */
272 if (vp->v_vflag & VV_TEXT)
273 return (ETXTBSY);
274
275 return (0);
276 }
277
278 /*
279 * Vnode close call
280 */
281 int
282 vn_close(vp, flags, file_cred, td)
283 register struct vnode *vp;
284 int flags;
285 struct ucred *file_cred;
286 struct thread *td;
287 {
288 struct mount *mp;
289 int error, lock_flags;
290
291 if (!(flags & FWRITE) && vp->v_mount != NULL &&
292 vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
293 lock_flags = LK_SHARED;
294 else
295 lock_flags = LK_EXCLUSIVE;
296
297 VFS_ASSERT_GIANT(vp->v_mount);
298
299 vn_start_write(vp, &mp, V_WAIT);
300 vn_lock(vp, lock_flags | LK_RETRY);
301 if (flags & FWRITE) {
302 VNASSERT(vp->v_writecount > 0, vp,
303 ("vn_close: negative writecount"));
304 vp->v_writecount--;
305 }
306 error = VOP_CLOSE(vp, flags, file_cred, td);
307 vput(vp);
308 vn_finished_write(mp);
309 return (error);
310 }
311
312 /*
313 * Heuristic to detect sequential operation.
314 */
315 static int
316 sequential_heuristic(struct uio *uio, struct file *fp)
317 {
318
319 if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
320 return (fp->f_seqcount << IO_SEQSHIFT);
321
322 /*
323 * Offset 0 is handled specially. open() sets f_seqcount to 1 so
324 * that the first I/O is normally considered to be slightly
325 * sequential. Seeking to offset 0 doesn't change sequentiality
326 * unless previous seeks have reduced f_seqcount to 0, in which
327 * case offset 0 is not special.
328 */
329 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
330 uio->uio_offset == fp->f_nextoff) {
331 /*
332 * f_seqcount is in units of fixed-size blocks so that it
333 * depends mainly on the amount of sequential I/O and not
334 * much on the number of sequential I/O's. The fixed size
335 * of 16384 is hard-coded here since it is (not quite) just
336 * a magic size that works well here. This size is more
337 * closely related to the best I/O size for real disks than
338 * to any block size used by software.
339 */
340 fp->f_seqcount += howmany(uio->uio_resid, 16384);
341 if (fp->f_seqcount > IO_SEQMAX)
342 fp->f_seqcount = IO_SEQMAX;
343 return (fp->f_seqcount << IO_SEQSHIFT);
344 }
345
346 /* Not sequential. Quickly draw-down sequentiality. */
347 if (fp->f_seqcount > 1)
348 fp->f_seqcount = 1;
349 else
350 fp->f_seqcount = 0;
351 return (0);
352 }
353
/*
 * Package up an I/O request on a vnode into a uio and do it.
 *
 * rw:          UIO_READ or UIO_WRITE.
 * base/len:    buffer to transfer.
 * offset:      file offset at which the transfer starts.
 * segflg:      address-space of `base' (kernel or user).
 * ioflg:       IO_* flags; IO_NODELOCKED means the caller already holds
 *              the vnode lock, IO_NOMACCHECK skips the MAC check.
 * active_cred: credential for MAC checks; file_cred, if non-NULL, is
 *              preferred for the actual VOP_READ/VOP_WRITE.
 * aresid:      if non-NULL receives the untransferred residual;
 *              otherwise a short transfer is reported as EIO.
 */
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
    aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	int len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	int *aresid;
	struct thread *td;
{
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct ucred *cred;
	int error, lock_flags;

	VFS_ASSERT_GIANT(vp->v_mount);

	if ((ioflg & IO_NODELOCKED) == 0) {
		mp = NULL;
		if (rw == UIO_WRITE) {
			/*
			 * Writes must wait out any filesystem suspension
			 * (except on character devices), then take the
			 * vnode lock shared when the mount allows shared
			 * writes, exclusive otherwise.
			 */
			if (vp->v_type != VCHR &&
			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
			    != 0)
				return (error);
			if (MNT_SHARED_WRITES(mp) ||
			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
				lock_flags = LK_SHARED;
			} else {
				lock_flags = LK_EXCLUSIVE;
			}
			vn_lock(vp, lock_flags | LK_RETRY);
		} else
			vn_lock(vp, LK_SHARED | LK_RETRY);

	}
	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
	/* Build a single-segment uio describing the transfer. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = base;
	aiov.iov_len = len;
	auio.uio_resid = len;
	auio.uio_offset = offset;
	auio.uio_segflg = segflg;
	auio.uio_rw = rw;
	auio.uio_td = td;
	error = 0;
#ifdef MAC
	if ((ioflg & IO_NOMACCHECK) == 0) {
		if (rw == UIO_READ)
			error = mac_vnode_check_read(active_cred, file_cred,
			    vp);
		else
			error = mac_vnode_check_write(active_cred, file_cred,
			    vp);
	}
#endif
	if (error == 0) {
		if (file_cred)
			cred = file_cred;
		else
			cred = active_cred;
		if (rw == UIO_READ)
			error = VOP_READ(vp, &auio, ioflg, cred);
		else
			error = VOP_WRITE(vp, &auio, ioflg, cred);
	}
	/* Report the residual, or fold a short transfer into EIO. */
	if (aresid)
		*aresid = auio.uio_resid;
	else
		if (auio.uio_resid && error == 0)
			error = EIO;
	if ((ioflg & IO_NODELOCKED) == 0) {
		if (rw == UIO_WRITE && vp->v_type != VCHR)
			vn_finished_write(mp);
		VOP_UNLOCK(vp, 0);
	}
	return (error);
}
441
/*
 * Package up an I/O request on a vnode into a uio and do it.  The I/O
 * request is split up into smaller chunks and we try to avoid saturating
 * the buffer cache while potentially holding a vnode locked, so we
 * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
 * to give other processes a chance to lock the vnode (either other processes
 * core'ing the same binary, or unrelated processes scanning the directory).
 *
 * Parameters are as for vn_rdwr(), except that len and *aresid are
 * size_t.  On error, *aresid covers both the unattempted remainder and
 * the residual of the failing chunk.
 */
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
    file_cred, aresid, td)
	enum uio_rw rw;
	struct vnode *vp;
	void *base;
	size_t len;
	off_t offset;
	enum uio_seg segflg;
	int ioflg;
	struct ucred *active_cred;
	struct ucred *file_cred;
	size_t *aresid;
	struct thread *td;
{
	int error = 0;
	int iaresid;	/* residual of the most recent chunk */

	VFS_ASSERT_GIANT(vp->v_mount);

	do {
		int chunk;

		/*
		 * Force `offset' to a multiple of MAXBSIZE except possibly
		 * for the first chunk, so that filesystems only need to
		 * write full blocks except possibly for the first and last
		 * chunks.
		 */
		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		/* Avoid saturating the buffer cache between chunks. */
		if (rw != UIO_READ && vp->v_type == VREG)
			bwillwrite();
		iaresid = 0;
		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
		    ioflg, active_cred, file_cred, &iaresid, td);
		len -= chunk;	/* aresid calc already includes length */
		if (error)
			break;
		offset += chunk;
		base = (char *)base + chunk;
		/* Let other threads at the vnode between chunks. */
		uio_yield();
	} while (len);
	if (aresid)
		*aresid = len + iaresid;
	return (error);
}
499
/*
 * File table vnode read routine.
 *
 * Implements fo_read for vnode-backed files: resolves the I/O offset
 * (serialized via the FOFFSET_LOCKED hand-rolled sleep lock when the
 * caller did not supply one with FOF_OFFSET), applies any posix_fadvise
 * region, and dispatches to VOP_READ() with the vnode locked shared.
 */
static int
vn_read(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	int error, ioflag;
	struct mtx *mtxp;
	int advice, vfslocked;
	off_t offset, start, end;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	mtxp = NULL;
	vp = fp->f_vnode;
	ioflag = 0;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	advice = POSIX_FADV_NORMAL;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	/*
	 * According to McKusick the vn lock was protecting f_offset here.
	 * It is now protected by the FOFFSET_LOCKED flag.
	 */
	if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if ((flags & FOF_OFFSET) == 0) {
			/*
			 * Take the FOFFSET sleep "lock" so concurrent
			 * readers do not interleave f_offset updates.
			 */
			while (fp->f_vnread_flags & FOFFSET_LOCKED) {
				fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
				msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
				    "vnread offlock", 0);
			}
			fp->f_vnread_flags |= FOFFSET_LOCKED;
			uio->uio_offset = fp->f_offset;
		}
		/* Pick up the advice covering this request, if any. */
		if (fp->f_advice != NULL &&
		    uio->uio_offset >= fp->f_advice->fa_start &&
		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
			advice = fp->f_advice->fa_advice;
		mtx_unlock(mtxp);
	}
	vn_lock(vp, LK_SHARED | LK_RETRY);

	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* Disable read-ahead for random I/O. */
		break;
	}
	offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0) {
		/* Publish the new offset and release the FOFFSET lock. */
		fp->f_offset = uio->uio_offset;
		mtx_lock(mtxp);
		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
			wakeup(&fp->f_vnread_flags);
		fp->f_vnread_flags = 0;
		mtx_unlock(mtxp);
	}
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    offset != uio->uio_offset) {
		/*
		 * Use POSIX_FADV_DONTNEED to flush clean pages and
		 * buffers for the backing file after a
		 * POSIX_FADV_NOREUSE read(2).  To optimize the common
		 * case of using POSIX_FADV_NOREUSE with sequential
		 * access, track the previous implicit DONTNEED
		 * request and grow this request to include the
		 * current read(2) in addition to the previous
		 * DONTNEED.  With purely sequential access this will
		 * cause the DONTNEED requests to continuously grow to
		 * cover all of the previously read regions of the
		 * file.  This allows filesystem blocks that are
		 * accessed by multiple calls to read(2) to be flushed
		 * once the last read(2) finishes.
		 */
		start = offset;
		end = uio->uio_offset - 1;
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
				start = fp->f_advice->fa_prevstart;
			else if (fp->f_advice->fa_prevstart != 0 &&
			    fp->f_advice->fa_prevstart == end + 1)
				end = fp->f_advice->fa_prevend;
			fp->f_advice->fa_prevstart = start;
			fp->f_advice->fa_prevend = end;
		}
		mtx_unlock(mtxp);
		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
615
/*
 * File table vnode write routine.
 *
 * Implements fo_write for vnode-backed files: builds the IO_* flags
 * from the descriptor and mount state, waits out filesystem suspension
 * (except for character devices), dispatches to VOP_WRITE() with an
 * appropriately shared or exclusive vnode lock, and applies any
 * POSIX_FADV_NOREUSE post-write flushing.
 */
static int
vn_write(fp, uio, active_cred, flags, td)
	struct file *fp;
	struct uio *uio;
	struct ucred *active_cred;
	struct thread *td;
	int flags;
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag, lock_flags;
	struct mtx *mtxp;
	int advice, vfslocked;
	off_t offset, start, end;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (vp->v_type == VREG)
		bwillwrite();
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		goto unlock;

	/*
	 * A shared vnode lock is only safe when the mount supports
	 * shared writes and the caller manages the offset itself
	 * (FOF_OFFSET); otherwise take it exclusive.
	 */
	if ((MNT_SHARED_WRITES(mp) ||
	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
	    (flags & FOF_OFFSET) != 0) {
		lock_flags = LK_SHARED;
	} else {
		lock_flags = LK_EXCLUSIVE;
	}

	vn_lock(vp, lock_flags | LK_RETRY);
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	advice = POSIX_FADV_NORMAL;
	mtxp = NULL;
	/* Pick up the posix_fadvise region covering this request. */
	if (fp->f_advice != NULL) {
		mtxp = mtx_pool_find(mtxpool_sleep, fp);
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    uio->uio_offset >= fp->f_advice->fa_start &&
		    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
			advice = fp->f_advice->fa_advice;
		mtx_unlock(mtxp);
	}
	switch (advice) {
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_NOREUSE:
		ioflag |= sequential_heuristic(uio, fp);
		break;
	case POSIX_FADV_RANDOM:
		/* XXX: Is this correct? */
		break;
	}
	offset = uio->uio_offset;

#ifdef MAC
	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
	if (error == 0)
#endif
		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	fp->f_nextoff = uio->uio_offset;
	VOP_UNLOCK(vp, 0);
	if (vp->v_type != VCHR)
		vn_finished_write(mp);
	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
	    offset != uio->uio_offset) {
		/*
		 * Use POSIX_FADV_DONTNEED to flush clean pages and
		 * buffers for the backing file after a
		 * POSIX_FADV_NOREUSE write(2).  To optimize the
		 * common case of using POSIX_FADV_NOREUSE with
		 * sequential access, track the previous implicit
		 * DONTNEED request and grow this request to include
		 * the current write(2) in addition to the previous
		 * DONTNEED.  With purely sequential access this will
		 * cause the DONTNEED requests to continuously grow to
		 * cover all of the previously written regions of the
		 * file.
		 *
		 * Note that the blocks just written are almost
		 * certainly still dirty, so this only works when
		 * VOP_ADVISE() calls from subsequent writes push out
		 * the data written by this write(2) once the backing
		 * buffers are clean.  However, as compared to forcing
		 * IO_DIRECT, this gives much saner behavior.  Write
		 * clustering is still allowed, and clean pages are
		 * merely moved to the cache page queue rather than
		 * outright thrown away.  This means a subsequent
		 * read(2) can still avoid hitting the disk if the
		 * pages have not been reclaimed.
		 *
		 * This does make POSIX_FADV_NOREUSE largely useless
		 * with non-sequential access.  However, sequential
		 * access is the more common use case and the flag is
		 * merely advisory.
		 */
		start = offset;
		end = uio->uio_offset - 1;
		mtx_lock(mtxp);
		if (fp->f_advice != NULL &&
		    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
			if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
				start = fp->f_advice->fa_prevstart;
			else if (fp->f_advice->fa_prevstart != 0 &&
			    fp->f_advice->fa_prevstart == end + 1)
				end = fp->f_advice->fa_prevend;
			fp->f_advice->fa_prevstart = start;
			fp->f_advice->fa_prevend = end;
		}
		mtx_unlock(mtxp);
		error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
	}

unlock:
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
753
754 /*
755 * File table truncate routine.
756 */
757 static int
758 vn_truncate(fp, length, active_cred, td)
759 struct file *fp;
760 off_t length;
761 struct ucred *active_cred;
762 struct thread *td;
763 {
764 struct vattr vattr;
765 struct mount *mp;
766 struct vnode *vp;
767 int vfslocked;
768 int error;
769
770 vp = fp->f_vnode;
771 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
772 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
773 if (error) {
774 VFS_UNLOCK_GIANT(vfslocked);
775 return (error);
776 }
777 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
778 if (vp->v_type == VDIR) {
779 error = EISDIR;
780 goto out;
781 }
782 #ifdef MAC
783 error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
784 if (error)
785 goto out;
786 #endif
787 error = vn_writechk(vp);
788 if (error == 0) {
789 VATTR_NULL(&vattr);
790 vattr.va_size = length;
791 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
792 }
793 out:
794 VOP_UNLOCK(vp, 0);
795 vn_finished_write(mp);
796 VFS_UNLOCK_GIANT(vfslocked);
797 return (error);
798 }
799
800 /*
801 * File table vnode stat routine.
802 */
803 static int
804 vn_statfile(fp, sb, active_cred, td)
805 struct file *fp;
806 struct stat *sb;
807 struct ucred *active_cred;
808 struct thread *td;
809 {
810 struct vnode *vp = fp->f_vnode;
811 int vfslocked;
812 int error;
813
814 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
815 vn_lock(vp, LK_SHARED | LK_RETRY);
816 error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
817 VOP_UNLOCK(vp, 0);
818 VFS_UNLOCK_GIANT(vfslocked);
819
820 return (error);
821 }
822
823 /*
824 * Stat a vnode; implementation for the stat syscall
825 */
826 int
827 vn_stat(vp, sb, active_cred, file_cred, td)
828 struct vnode *vp;
829 register struct stat *sb;
830 struct ucred *active_cred;
831 struct ucred *file_cred;
832 struct thread *td;
833 {
834 struct vattr vattr;
835 register struct vattr *vap;
836 int error;
837 u_short mode;
838
839 #ifdef MAC
840 error = mac_vnode_check_stat(active_cred, file_cred, vp);
841 if (error)
842 return (error);
843 #endif
844
845 vap = &vattr;
846
847 /*
848 * Initialize defaults for new and unusual fields, so that file
849 * systems which don't support these fields don't need to know
850 * about them.
851 */
852 vap->va_birthtime.tv_sec = -1;
853 vap->va_birthtime.tv_nsec = 0;
854 vap->va_fsid = VNOVAL;
855 vap->va_rdev = NODEV;
856
857 error = VOP_GETATTR(vp, vap, active_cred);
858 if (error)
859 return (error);
860
861 /*
862 * Zero the spare stat fields
863 */
864 bzero(sb, sizeof *sb);
865
866 /*
867 * Copy from vattr table
868 */
869 if (vap->va_fsid != VNOVAL)
870 sb->st_dev = vap->va_fsid;
871 else
872 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
873 sb->st_ino = vap->va_fileid;
874 mode = vap->va_mode;
875 switch (vap->va_type) {
876 case VREG:
877 mode |= S_IFREG;
878 break;
879 case VDIR:
880 mode |= S_IFDIR;
881 break;
882 case VBLK:
883 mode |= S_IFBLK;
884 break;
885 case VCHR:
886 mode |= S_IFCHR;
887 break;
888 case VLNK:
889 mode |= S_IFLNK;
890 break;
891 case VSOCK:
892 mode |= S_IFSOCK;
893 break;
894 case VFIFO:
895 mode |= S_IFIFO;
896 break;
897 default:
898 return (EBADF);
899 };
900 sb->st_mode = mode;
901 sb->st_nlink = vap->va_nlink;
902 sb->st_uid = vap->va_uid;
903 sb->st_gid = vap->va_gid;
904 sb->st_rdev = vap->va_rdev;
905 if (vap->va_size > OFF_MAX)
906 return (EOVERFLOW);
907 sb->st_size = vap->va_size;
908 sb->st_atimespec = vap->va_atime;
909 sb->st_mtimespec = vap->va_mtime;
910 sb->st_ctimespec = vap->va_ctime;
911 sb->st_birthtimespec = vap->va_birthtime;
912
913 /*
914 * According to www.opengroup.org, the meaning of st_blksize is
915 * "a filesystem-specific preferred I/O block size for this
916 * object. In some filesystem types, this may vary from file
917 * to file"
918 * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
919 */
920
921 sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
922
923 sb->st_flags = vap->va_flags;
924 if (priv_check(td, PRIV_VFS_GENERATION))
925 sb->st_gen = 0;
926 else
927 sb->st_gen = vap->va_gen;
928
929 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
930 return (0);
931 }
932
933 /*
934 * File table vnode ioctl routine.
935 */
936 static int
937 vn_ioctl(fp, com, data, active_cred, td)
938 struct file *fp;
939 u_long com;
940 void *data;
941 struct ucred *active_cred;
942 struct thread *td;
943 {
944 struct vnode *vp = fp->f_vnode;
945 struct vattr vattr;
946 int vfslocked;
947 int error;
948
949 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
950 error = ENOTTY;
951 switch (vp->v_type) {
952 case VREG:
953 case VDIR:
954 if (com == FIONREAD) {
955 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
956 error = VOP_GETATTR(vp, &vattr, active_cred);
957 VOP_UNLOCK(vp, 0);
958 if (!error)
959 *(int *)data = vattr.va_size - fp->f_offset;
960 }
961 if (com == FIONBIO || com == FIOASYNC) /* XXX */
962 error = 0;
963 else
964 error = VOP_IOCTL(vp, com, data, fp->f_flag,
965 active_cred, td);
966 break;
967
968 default:
969 break;
970 }
971 VFS_UNLOCK_GIANT(vfslocked);
972 return (error);
973 }
974
/*
 * File table vnode poll routine.
 *
 * Runs the MAC poll check (with the vnode locked) when MAC is compiled
 * in, then dispatches to VOP_POLL() using the file's own credential.
 */
static int
vn_poll(fp, events, active_cred, td)
	struct file *fp;
	int events;
	struct ucred *active_cred;
	struct thread *td;
{
	struct vnode *vp;
	int vfslocked;
	int error;

	vp = fp->f_vnode;
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
	VOP_UNLOCK(vp, 0);
	if (!error)
#endif
		/* Without MAC this is the unconditional statement. */
		error = VOP_POLL(vp, events, fp->f_cred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
1002
/*
 * Acquire the requested lock and then check for validity.  LK_RETRY
 * permits vn_lock to return doomed vnodes.
 *
 * file/line identify the call site for lock diagnostics (the vn_lock()
 * macro supplies them).  Returns 0 on success, ENOENT for a doomed
 * vnode without LK_RETRY, or the error from VOP_LOCK1().
 */
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
	int error;

	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
	    ("vn_lock called with no locktype."));
	do {
#ifdef DEBUG_VFS_LOCKS
		KASSERT(vp->v_holdcnt != 0,
		    ("vn_lock %p: zero hold count", vp));
#endif
		error = VOP_LOCK1(vp, flags, file, line);
		flags &= ~LK_INTERLOCK;	/* Interlock is always dropped. */
		KASSERT((flags & LK_RETRY) == 0 || error == 0,
		    ("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
		    flags, error));
		/*
		 * Callers specify LK_RETRY if they wish to get dead vnodes.
		 * If RETRY is not set, we return ENOENT instead.
		 */
		if (error == 0 && vp->v_iflag & VI_DOOMED &&
		    (flags & LK_RETRY) == 0) {
			VOP_UNLOCK(vp, 0);
			error = ENOENT;
			break;
		}
	} while (flags & LK_RETRY && error != 0);
	return (error);
}
1037
1038 /*
1039 * File table vnode close routine.
1040 */
1041 static int
1042 vn_closefile(fp, td)
1043 struct file *fp;
1044 struct thread *td;
1045 {
1046 struct vnode *vp;
1047 struct flock lf;
1048 int vfslocked;
1049 int error;
1050
1051 vp = fp->f_vnode;
1052
1053 vfslocked = VFS_LOCK_GIANT(vp->v_mount);
1054 if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
1055 lf.l_whence = SEEK_SET;
1056 lf.l_start = 0;
1057 lf.l_len = 0;
1058 lf.l_type = F_UNLCK;
1059 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1060 }
1061
1062 fp->f_ops = &badfileops;
1063
1064 error = vn_close(vp, fp->f_flag, fp->f_cred, td);
1065 VFS_UNLOCK_GIANT(vfslocked);
1066 return (error);
1067 }
1068
/*
 * Preparing to start a filesystem write operation.  If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed.  If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 *
 * vp:    vnode about to be written, or NULL when *mpp names the mount
 *        directly.
 * mpp:   out (or in when vp == NULL): the affected mount point.
 * flags: V_WAIT/V_NOWAIT/V_XSLEEP plus optional PCATCH.
 *
 * Returns 0 on success, EWOULDBLOCK for V_NOWAIT during suspension, or
 * an msleep()/VOP_GETWRITEMOUNT() error.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

	error = 0;
	/*
	 * If a vnode is provided, get and return the mount point to
	 * which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);

	/*
	 * Check on status of suspension.  The suspension owner itself
	 * (with TDP_IGNSUSP) is allowed through without waiting.
	 */
	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
	    mp->mnt_susp_owner != curthread) {
		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
			if (flags & V_NOWAIT) {
				error = EWOULDBLOCK;
				goto unlock;
			}
			error = msleep(&mp->mnt_flag, MNT_MTX(mp),
			    (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
			if (error)
				goto unlock;
		}
	}
	/* V_XSLEEP means "wait out suspension only", do not account. */
	if (flags & V_XSLEEP)
		goto unlock;
	mp->mnt_writeopcount++;
unlock:
	/* Drop the reference unless a write operation was recorded. */
	if (error != 0 || (flags & V_XSLEEP) != 0)
		MNT_REL(mp);
	MNT_IUNLOCK(mp);
	return (error);
}
1136
/*
 * Secondary suspension.  Used by operations such as vop_inactive
 * routines that are needed by the higher level functions.  These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero).  At that
 * time, these operations are halted until the suspension is over.
 *
 * Parameters and return values are as for vn_start_write().
 */
int
vn_start_secondary_write(vp, mpp, flags)
	struct vnode *vp;
	struct mount **mpp;
	int flags;
{
	struct mount *mp;
	int error;

retry:
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if ((mp = *mpp) == NULL)
		return (0);

	/*
	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
	 * a vfs_ref().
	 * As long as a vnode is not provided we need to acquire a
	 * refcount for the provided mountpoint too, in order to
	 * emulate a vfs_ref().
	 */
	MNT_ILOCK(mp);
	if (vp == NULL)
		MNT_REF(mp);
	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
		mp->mnt_secondary_writes++;
		mp->mnt_secondary_accwrites++;
		MNT_IUNLOCK(mp);
		return (0);
	}
	if (flags & V_NOWAIT) {
		MNT_REL(mp);
		MNT_IUNLOCK(mp);
		return (EWOULDBLOCK);
	}
	/*
	 * Wait for the suspension to finish.  PDROP releases the mount
	 * interlock before returning; then retry from the top since the
	 * write mount may have changed while we slept.
	 */
	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
	vfs_rel(mp);
	if (error == 0)
		goto retry;
	return (error);
}
1200
/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 *
 * Undoes the accounting of a successful vn_start_write(); mp may be NULL
 * (no-op), matching the NULL *mpp success case of vn_start_write().
 */
void
vn_finished_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	/* Drop the reference taken by vn_start_write()/VOP_GETWRITEMOUNT(). */
	MNT_REL(mp);
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	/* Last writer out wakes the suspender sleeping in vfs_write_suspend(). */
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
	MNT_IUNLOCK(mp);
}
1222
1223
/*
 * Filesystem secondary write operation has completed. If we are
 * suspending and this operation is the last one, notify the suspender
 * that the suspension is now in effect.
 *
 * Undoes the accounting of a successful vn_start_secondary_write();
 * mp may be NULL (no-op).
 */
void
vn_finished_secondary_write(mp)
	struct mount *mp;
{
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	/* Drop the reference taken by vn_start_secondary_write(). */
	MNT_REL(mp);
	mp->mnt_secondary_writes--;
	if (mp->mnt_secondary_writes < 0)
		panic("vn_finished_secondary_write: neg cnt");
	/* Wake anyone waiting for secondary writers to drain. */
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_secondary_writes <= 0)
		wakeup(&mp->mnt_secondary_writes);
	MNT_IUNLOCK(mp);
}
1245
1246
1247
/*
 * Request a filesystem to suspend write operations.
 *
 * Returns 0 with the caller recorded as mnt_susp_owner on success,
 * EALREADY if the caller already owns the suspension, or the error
 * from VFS_SYNC() (in which case the suspension has been undone via
 * vfs_write_resume()).
 */
int
vfs_write_suspend(mp)
	struct mount *mp;
{
	int error;

	MNT_ILOCK(mp);
	if (mp->mnt_susp_owner == curthread) {
		MNT_IUNLOCK(mp);
		return (EALREADY);
	}
	/* Wait out any suspension owned by another thread. */
	while (mp->mnt_kern_flag & MNTK_SUSPEND)
		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
	mp->mnt_kern_flag |= MNTK_SUSPEND;
	mp->mnt_susp_owner = curthread;
	/*
	 * Wait for outstanding primary writes to drain; the last one
	 * wakes us from vn_finished_write().  PDROP releases the
	 * interlock on wakeup.
	 */
	if (mp->mnt_writeopcount > 0)
		(void) msleep(&mp->mnt_writeopcount, 
		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
	else
		MNT_IUNLOCK(mp);
	/* Flush the filesystem; on failure undo the suspension. */
	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
		vfs_write_resume(mp);
	return (error);
}
1275
/*
 * Request a filesystem to resume write operations.
 *
 * Clears all suspension state, wakes both the writers blocked in
 * vn_start_write()/vn_start_secondary_write() (via &mp->mnt_flag) and
 * any thread waiting to become the suspender (via &mp->mnt_writeopcount),
 * then gives the filesystem a chance to clean up with VFS_SUSP_CLEAN().
 * A no-op if the mount is not suspended.
 */
void
vfs_write_resume(mp)
	struct mount *mp;
{

	MNT_ILOCK(mp);
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		/* Only the owning thread may lift the suspension. */
		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
		    MNTK_SUSPENDED);
		mp->mnt_susp_owner = NULL;
		wakeup(&mp->mnt_writeopcount);
		wakeup(&mp->mnt_flag);
		curthread->td_pflags &= ~TDP_IGNSUSP;
		/* VFS_SUSP_CLEAN() must be called without the interlock. */
		MNT_IUNLOCK(mp);
		VFS_SUSP_CLEAN(mp);
	} else
		MNT_IUNLOCK(mp);
}
1298
1299 /*
1300 * Implement kqueues for files by translating it to vnode operation.
1301 */
1302 static int
1303 vn_kqfilter(struct file *fp, struct knote *kn)
1304 {
1305 int vfslocked;
1306 int error;
1307
1308 vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
1309 error = VOP_KQFILTER(fp->f_vnode, kn);
1310 VFS_UNLOCK_GIANT(vfslocked);
1311
1312 return error;
1313 }
1314
1315 /*
1316 * Simplified in-kernel wrapper calls for extended attribute access.
1317 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1318 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1319 */
1320 int
1321 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1322 const char *attrname, int *buflen, char *buf, struct thread *td)
1323 {
1324 struct uio auio;
1325 struct iovec iov;
1326 int error;
1327
1328 iov.iov_len = *buflen;
1329 iov.iov_base = buf;
1330
1331 auio.uio_iov = &iov;
1332 auio.uio_iovcnt = 1;
1333 auio.uio_rw = UIO_READ;
1334 auio.uio_segflg = UIO_SYSSPACE;
1335 auio.uio_td = td;
1336 auio.uio_offset = 0;
1337 auio.uio_resid = *buflen;
1338
1339 if ((ioflg & IO_NODELOCKED) == 0)
1340 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1341
1342 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1343
1344 /* authorize attribute retrieval as kernel */
1345 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1346 td);
1347
1348 if ((ioflg & IO_NODELOCKED) == 0)
1349 VOP_UNLOCK(vp, 0);
1350
1351 if (error == 0) {
1352 *buflen = *buflen - auio.uio_resid;
1353 }
1354
1355 return (error);
1356 }
1357
1358 /*
1359 * XXX failure mode if partially written?
1360 */
1361 int
1362 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1363 const char *attrname, int buflen, char *buf, struct thread *td)
1364 {
1365 struct uio auio;
1366 struct iovec iov;
1367 struct mount *mp;
1368 int error;
1369
1370 iov.iov_len = buflen;
1371 iov.iov_base = buf;
1372
1373 auio.uio_iov = &iov;
1374 auio.uio_iovcnt = 1;
1375 auio.uio_rw = UIO_WRITE;
1376 auio.uio_segflg = UIO_SYSSPACE;
1377 auio.uio_td = td;
1378 auio.uio_offset = 0;
1379 auio.uio_resid = buflen;
1380
1381 if ((ioflg & IO_NODELOCKED) == 0) {
1382 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1383 return (error);
1384 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1385 }
1386
1387 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1388
1389 /* authorize attribute setting as kernel */
1390 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
1391
1392 if ((ioflg & IO_NODELOCKED) == 0) {
1393 vn_finished_write(mp);
1394 VOP_UNLOCK(vp, 0);
1395 }
1396
1397 return (error);
1398 }
1399
1400 int
1401 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1402 const char *attrname, struct thread *td)
1403 {
1404 struct mount *mp;
1405 int error;
1406
1407 if ((ioflg & IO_NODELOCKED) == 0) {
1408 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
1409 return (error);
1410 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1411 }
1412
1413 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1414
1415 /* authorize attribute removal as kernel */
1416 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
1417 if (error == EOPNOTSUPP)
1418 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1419 NULL, td);
1420
1421 if ((ioflg & IO_NODELOCKED) == 0) {
1422 vn_finished_write(mp);
1423 VOP_UNLOCK(vp, 0);
1424 }
1425
1426 return (error);
1427 }
1428
/*
 * Get the vnode for inode number "ino" on the mount that "vp" belongs
 * to, without deadlocking against a concurrent unmount.  "vp" must be
 * locked (shared or exclusive) on entry and is returned locked in the
 * same mode; on success *rvp is returned locked according to "lkflags".
 * Returns ENOENT if the mount is going away or "vp" was reclaimed
 * while temporarily unlocked, otherwise the VFS_VGET() error.
 */
int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
	struct mount *mp;
	int ltype, error;

	mp = vp->v_mount;
	ltype = VOP_ISLOCKED(vp);
	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
	    ("vn_vget_ino: vp not locked"));
	/*
	 * Try to busy the mount without sleeping first; sleeping in
	 * vfs_busy() while holding the vnode lock could deadlock with
	 * an unmount waiting for that lock.
	 */
	error = vfs_busy(mp, MBF_NOWAIT);
	if (error != 0) {
		/* Keep mp alive across the unlock, then busy for real. */
		vfs_ref(mp);
		VOP_UNLOCK(vp, 0);
		error = vfs_busy(mp, 0);
		vn_lock(vp, ltype | LK_RETRY);
		vfs_rel(mp);
		if (error != 0)
			return (ENOENT);
		/* vp may have been reclaimed while unlocked. */
		if (vp->v_iflag & VI_DOOMED) {
			vfs_unbusy(mp);
			return (ENOENT);
		}
	}
	/* Drop the vp lock while VFS_VGET() acquires the other vnode. */
	VOP_UNLOCK(vp, 0);
	error = VFS_VGET(mp, ino, lkflags, rvp);
	vfs_unbusy(mp);
	vn_lock(vp, ltype | LK_RETRY);
	/* Recheck reclamation after reacquiring the lock. */
	if (vp->v_iflag & VI_DOOMED) {
		if (error == 0)
			vput(*rvp);
		error = ENOENT;
	}
	return (error);
}
1464
/*
 * Enforce the per-process RLIMIT_FSIZE limit for a pending write.
 * If the write described by "uio" would extend the regular file "vp"
 * past the limit, post SIGXFSZ to the process and return EFBIG;
 * otherwise return 0.  Non-regular files and calls without a thread
 * context are never limited.
 */
int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, const struct thread *td)
{
	if (vp->v_type != VREG || td == NULL)
		return (0);

	PROC_LOCK(td->td_proc);
	/*
	 * NOTE(review): uio_offset + uio_resid could in principle
	 * overflow off_t for extreme offsets; presumably callers have
	 * already validated the offset range — confirm.
	 */
	if (uio->uio_offset + uio->uio_resid >
	    lim_cur(td->td_proc, RLIMIT_FSIZE)) {
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		return (EFBIG);
	}
	PROC_UNLOCK(td->td_proc);

	return (0);
}
1482
1483 void
1484 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
1485 {
1486 vm_object_t object;
1487
1488 if ((object = vp->v_object) == NULL)
1489 return;
1490 VM_OBJECT_LOCK(object);
1491 vm_object_page_remove(object, start, end, 0);
1492 VM_OBJECT_UNLOCK(object);
1493 }
1494
1495 int
1496 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
1497 {
1498 struct vattr va;
1499 daddr_t bn, bnp;
1500 uint64_t bsize;
1501 off_t noff;
1502 int error;
1503
1504 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
1505 ("Wrong command %lu", cmd));
1506
1507 if (vn_lock(vp, LK_SHARED) != 0)
1508 return (EBADF);
1509 if (vp->v_type != VREG) {
1510 error = ENOTTY;
1511 goto unlock;
1512 }
1513 error = VOP_GETATTR(vp, &va, cred);
1514 if (error != 0)
1515 goto unlock;
1516 noff = *off;
1517 if (noff >= va.va_size) {
1518 error = ENXIO;
1519 goto unlock;
1520 }
1521 bsize = vp->v_mount->mnt_stat.f_iosize;
1522 for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
1523 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
1524 if (error == EOPNOTSUPP) {
1525 error = ENOTTY;
1526 goto unlock;
1527 }
1528 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
1529 (bnp != -1 && cmd == FIOSEEKDATA)) {
1530 noff = bn * bsize;
1531 if (noff < *off)
1532 noff = *off;
1533 goto unlock;
1534 }
1535 }
1536 if (noff > va.va_size)
1537 noff = va.va_size;
1538 /* noff == va.va_size. There is an implicit hole at the end of file. */
1539 if (cmd == FIOSEEKDATA)
1540 error = ENXIO;
1541 unlock:
1542 VOP_UNLOCK(vp, 0);
1543 if (error == 0)
1544 *off = noff;
1545 return (error);
1546 }
Cache object: 101004e45169e1e63204a10ab1028888
|