FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c
1 /*-
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
11 * Copyright (c) 2013, 2014 The FreeBSD Foundation
12 *
13 * Portions of this software were developed by Konstantin Belousov
14 * under sponsorship from the FreeBSD Foundation.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted provided that the following conditions
18 * are met:
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD: releng/11.1/sys/kern/vfs_vnops.c 338606 2018-09-12 05:07:35Z gordon $");
45
46 #include "opt_hwpmc_hooks.h"
47
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/disk.h>
51 #include <sys/fail.h>
52 #include <sys/fcntl.h>
53 #include <sys/file.h>
54 #include <sys/kdb.h>
55 #include <sys/stat.h>
56 #include <sys/priv.h>
57 #include <sys/proc.h>
58 #include <sys/limits.h>
59 #include <sys/lock.h>
60 #include <sys/mman.h>
61 #include <sys/mount.h>
62 #include <sys/mutex.h>
63 #include <sys/namei.h>
64 #include <sys/vnode.h>
65 #include <sys/bio.h>
66 #include <sys/buf.h>
67 #include <sys/filio.h>
68 #include <sys/resourcevar.h>
69 #include <sys/rwlock.h>
70 #include <sys/sx.h>
71 #include <sys/sysctl.h>
72 #include <sys/ttycom.h>
73 #include <sys/conf.h>
74 #include <sys/syslog.h>
75 #include <sys/unistd.h>
76 #include <sys/user.h>
77
78 #include <security/audit/audit.h>
79 #include <security/mac/mac_framework.h>
80
81 #include <vm/vm.h>
82 #include <vm/vm_extern.h>
83 #include <vm/pmap.h>
84 #include <vm/vm_map.h>
85 #include <vm/vm_object.h>
86 #include <vm/vm_page.h>
87 #include <vm/vnode_pager.h>
88
89 #ifdef HWPMC_HOOKS
90 #include <sys/pmckern.h>
91 #endif
92
93 static fo_rdwr_t vn_read;
94 static fo_rdwr_t vn_write;
95 static fo_rdwr_t vn_io_fault;
96 static fo_truncate_t vn_truncate;
97 static fo_ioctl_t vn_ioctl;
98 static fo_poll_t vn_poll;
99 static fo_kqfilter_t vn_kqfilter;
100 static fo_stat_t vn_statfile;
101 static fo_close_t vn_closefile;
102 static fo_mmap_t vn_mmap;
103
104 struct fileops vnops = {
105 .fo_read = vn_io_fault,
106 .fo_write = vn_io_fault,
107 .fo_truncate = vn_truncate,
108 .fo_ioctl = vn_ioctl,
109 .fo_poll = vn_poll,
110 .fo_kqfilter = vn_kqfilter,
111 .fo_stat = vn_statfile,
112 .fo_close = vn_closefile,
113 .fo_chmod = vn_chmod,
114 .fo_chown = vn_chown,
115 .fo_sendfile = vn_sendfile,
116 .fo_seek = vn_seek,
117 .fo_fill_kinfo = vn_fill_kinfo,
118 .fo_mmap = vn_mmap,
119 .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
120 };
121
122 static const int io_hold_cnt = 16;
123 static int vn_io_fault_enable = 1;
124 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
125 &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
126 static int vn_io_fault_prefault = 0;
127 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
128 &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
129 static u_long vn_io_faults_cnt;
130 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
131 &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
132
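/*
 * The knobs above surface as sysctl(8) variables under the debug
 * tree; e.g., the lock-avoidance handling can be toggled and its
 * trigger count read from userspace:
 *
 *	sysctl debug.vn_io_fault_enable=0
 *	sysctl debug.vn_io_faults
 */
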
133 /*
134 * Returns true if vn_io_fault mode of handling the i/o request should
135 * be used.
136 */
137 static bool
138 do_vn_io_fault(struct vnode *vp, struct uio *uio)
139 {
140 struct mount *mp;
141
142 return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
143 (mp = vp->v_mount) != NULL &&
144 (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
145 }
146
147 /*
148 * Structure used to pass arguments to vn_io_fault1(), to do either
149 * file- or vnode-based I/O calls.
150 */
151 struct vn_io_fault_args {
152 enum {
153 VN_IO_FAULT_FOP,
154 VN_IO_FAULT_VOP
155 } kind;
156 struct ucred *cred;
157 int flags;
158 union {
159 struct fop_args_tag {
160 struct file *fp;
161 fo_rdwr_t *doio;
162 } fop_args;
163 struct vop_args_tag {
164 struct vnode *vp;
165 } vop_args;
166 } args;
167 };
168
169 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
170 struct vn_io_fault_args *args, struct thread *td);
171
172 int
173 vn_open(ndp, flagp, cmode, fp)
174 struct nameidata *ndp;
175 int *flagp, cmode;
176 struct file *fp;
177 {
178 struct thread *td = ndp->ni_cnd.cn_thread;
179
180 return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
181 }
182
183 /*
184 * Common code for vnode open operations via a name lookup.
185 * Lookup the vnode and invoke VOP_CREATE if needed.
186 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
187 *
188 * Note that this does NOT free nameidata for the successful case,
189 * due to the NDINIT being done elsewhere.
190 */
191 int
192 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
193 struct ucred *cred, struct file *fp)
194 {
195 struct vnode *vp;
196 struct mount *mp;
197 struct thread *td = ndp->ni_cnd.cn_thread;
198 struct vattr vat;
199 struct vattr *vap = &vat;
200 int fmode, error;
201
202 restart:
203 fmode = *flagp;
204 if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
205 O_EXCL | O_DIRECTORY))
206 return (EINVAL);
207 else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
208 ndp->ni_cnd.cn_nameiop = CREATE;
209 /*
210 * Set NOCACHE to avoid flushing the cache when
211 * rolling in many files at once.
212 */
213 ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
214 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
215 ndp->ni_cnd.cn_flags |= FOLLOW;
216 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
217 ndp->ni_cnd.cn_flags |= AUDITVNODE1;
218 if (vn_open_flags & VN_OPEN_NOCAPCHECK)
219 ndp->ni_cnd.cn_flags |= NOCAPCHECK;
220 bwillwrite();
221 if ((error = namei(ndp)) != 0)
222 return (error);
223 if (ndp->ni_vp == NULL) {
224 VATTR_NULL(vap);
225 vap->va_type = VREG;
226 vap->va_mode = cmode;
227 if (fmode & O_EXCL)
228 vap->va_vaflags |= VA_EXCLUSIVE;
229 if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
230 NDFREE(ndp, NDF_ONLY_PNBUF);
231 vput(ndp->ni_dvp);
232 if ((error = vn_start_write(NULL, &mp,
233 V_XSLEEP | PCATCH)) != 0)
234 return (error);
235 goto restart;
236 }
237 if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
238 ndp->ni_cnd.cn_flags |= MAKEENTRY;
239 #ifdef MAC
240 error = mac_vnode_check_create(cred, ndp->ni_dvp,
241 &ndp->ni_cnd, vap);
242 if (error == 0)
243 #endif
244 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
245 &ndp->ni_cnd, vap);
246 vput(ndp->ni_dvp);
247 vn_finished_write(mp);
248 if (error) {
249 NDFREE(ndp, NDF_ONLY_PNBUF);
250 return (error);
251 }
252 fmode &= ~O_TRUNC;
253 vp = ndp->ni_vp;
254 } else {
255 if (ndp->ni_dvp == ndp->ni_vp)
256 vrele(ndp->ni_dvp);
257 else
258 vput(ndp->ni_dvp);
259 ndp->ni_dvp = NULL;
260 vp = ndp->ni_vp;
261 if (fmode & O_EXCL) {
262 error = EEXIST;
263 goto bad;
264 }
265 fmode &= ~O_CREAT;
266 }
267 } else {
268 ndp->ni_cnd.cn_nameiop = LOOKUP;
269 ndp->ni_cnd.cn_flags = ISOPEN |
270 ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
271 if (!(fmode & FWRITE))
272 ndp->ni_cnd.cn_flags |= LOCKSHARED;
273 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
274 ndp->ni_cnd.cn_flags |= AUDITVNODE1;
275 if (vn_open_flags & VN_OPEN_NOCAPCHECK)
276 ndp->ni_cnd.cn_flags |= NOCAPCHECK;
277 if ((error = namei(ndp)) != 0)
278 return (error);
279 vp = ndp->ni_vp;
280 }
281 error = vn_open_vnode(vp, fmode, cred, td, fp);
282 if (error)
283 goto bad;
284 *flagp = fmode;
285 return (0);
286 bad:
287 NDFREE(ndp, NDF_ONLY_PNBUF);
288 vput(vp);
289 *flagp = fmode;
290 ndp->ni_vp = NULL;
291 return (error);
292 }
293
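/*
 * Example caller of the above: a kernel-internal open by path.  A
 * minimal sketch of the NDINIT()/vn_open() pairing (coredump and
 * kernel-linker code follow this shape); the path is illustrative and
 * error unwinding is elided.
 */
#if 0
	struct nameidata nd;
	int flags, error;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/etc/example.conf", td);
	flags = FREAD;
	error = vn_open(&nd, &flags, 0, NULL);
	if (error == 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		/* ... use nd.ni_vp, which is returned locked ... */
		VOP_UNLOCK(nd.ni_vp, 0);
		vn_close(nd.ni_vp, flags, td->td_ucred, td);
	}
#endif
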
294 /*
295 * Common code for vnode open operations once a vnode is located.
296 * Check permissions, and call the VOP_OPEN routine.
297 */
298 int
299 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
300 struct thread *td, struct file *fp)
301 {
302 accmode_t accmode;
303 struct flock lf;
304 int error, lock_flags, type;
305
306 if (vp->v_type == VLNK)
307 return (EMLINK);
308 if (vp->v_type == VSOCK)
309 return (EOPNOTSUPP);
310 if (vp->v_type != VDIR && fmode & O_DIRECTORY)
311 return (ENOTDIR);
312 accmode = 0;
313 if (fmode & (FWRITE | O_TRUNC)) {
314 if (vp->v_type == VDIR)
315 return (EISDIR);
316 accmode |= VWRITE;
317 }
318 if (fmode & FREAD)
319 accmode |= VREAD;
320 if (fmode & FEXEC)
321 accmode |= VEXEC;
322 if ((fmode & O_APPEND) && (fmode & FWRITE))
323 accmode |= VAPPEND;
324 #ifdef MAC
325 if (fmode & O_CREAT)
326 accmode |= VCREAT;
327 if (fmode & O_VERIFY)
328 accmode |= VVERIFY;
329 error = mac_vnode_check_open(cred, vp, accmode);
330 if (error)
331 return (error);
332
333 accmode &= ~(VCREAT | VVERIFY);
334 #endif
335 if ((fmode & O_CREAT) == 0) {
336 if (accmode & VWRITE) {
337 error = vn_writechk(vp);
338 if (error)
339 return (error);
340 }
341 if (accmode) {
342 error = VOP_ACCESS(vp, accmode, cred, td);
343 if (error)
344 return (error);
345 }
346 }
347 if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
348 vn_lock(vp, LK_UPGRADE | LK_RETRY);
349 if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
350 return (error);
351
352 while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
353 KASSERT(fp != NULL, ("open with flock requires fp"));
354 if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) {
355 error = EOPNOTSUPP;
356 break;
357 }
358 lock_flags = VOP_ISLOCKED(vp);
359 VOP_UNLOCK(vp, 0);
360 lf.l_whence = SEEK_SET;
361 lf.l_start = 0;
362 lf.l_len = 0;
363 if (fmode & O_EXLOCK)
364 lf.l_type = F_WRLCK;
365 else
366 lf.l_type = F_RDLCK;
367 type = F_FLOCK;
368 if ((fmode & FNONBLOCK) == 0)
369 type |= F_WAIT;
370 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
371 if (error == 0)
372 fp->f_flag |= FHASLOCK;
373 vn_lock(vp, lock_flags | LK_RETRY);
374 if (error != 0)
375 break;
376 if ((vp->v_iflag & VI_DOOMED) != 0) {
377 error = ENOENT;
378 break;
379 }
380
381 /*
382 * Another thread might have used this vnode as an
383 * executable while the vnode lock was dropped.
384 * Ensure the vnode is still able to be opened for
385 * writing after the lock has been obtained.
386 */
387 if ((accmode & VWRITE) != 0)
388 error = vn_writechk(vp);
389 break;
390 }
391
392 if (error != 0) {
393 fp->f_flag |= FOPENFAILED;
394 fp->f_vnode = vp;
395 if (fp->f_ops == &badfileops) {
396 fp->f_type = DTYPE_VNODE;
397 fp->f_ops = &vnops;
398 }
399 vref(vp);
400 } else if ((fmode & FWRITE) != 0) {
401 VOP_ADD_WRITECOUNT(vp, 1);
402 CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
403 __func__, vp, vp->v_writecount);
404 }
405 ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
406 return (error);
407 }
408
409 /*
410 * Check for write permissions on the specified vnode.
411 * Prototype text segments cannot be written.
412 */
413 int
414 vn_writechk(vp)
415 register struct vnode *vp;
416 {
417
418 ASSERT_VOP_LOCKED(vp, "vn_writechk");
419 /*
420 * If there's shared text associated with
421 * the vnode, try to free it up once. If
422 * we fail, we can't allow writing.
423 */
424 if (VOP_IS_TEXT(vp))
425 return (ETXTBSY);
426
427 return (0);
428 }
429
430 /*
431 * Vnode close call
432 */
433 static int
434 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
435 struct thread *td, bool keep_ref)
436 {
437 struct mount *mp;
438 int error, lock_flags;
439
440 if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
441 MNT_EXTENDED_SHARED(vp->v_mount))
442 lock_flags = LK_SHARED;
443 else
444 lock_flags = LK_EXCLUSIVE;
445
446 vn_start_write(vp, &mp, V_WAIT);
447 vn_lock(vp, lock_flags | LK_RETRY);
448 AUDIT_ARG_VNODE1(vp);
449 if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
450 VNASSERT(vp->v_writecount > 0, vp,
451 ("vn_close: negative writecount"));
452 VOP_ADD_WRITECOUNT(vp, -1);
453 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
454 __func__, vp, vp->v_writecount);
455 }
456 error = VOP_CLOSE(vp, flags, file_cred, td);
457 if (keep_ref)
458 VOP_UNLOCK(vp, 0);
459 else
460 vput(vp);
461 vn_finished_write(mp);
462 return (error);
463 }
464
465 int
466 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
467 struct thread *td)
468 {
469
470 return (vn_close1(vp, flags, file_cred, td, false));
471 }
472
473 /*
474 * Heuristic to detect sequential operation.
475 */
476 static int
477 sequential_heuristic(struct uio *uio, struct file *fp)
478 {
479
480 ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
481 if (fp->f_flag & FRDAHEAD)
482 return (fp->f_seqcount << IO_SEQSHIFT);
483
484 /*
485 * Offset 0 is handled specially. open() sets f_seqcount to 1 so
486 * that the first I/O is normally considered to be slightly
487 * sequential. Seeking to offset 0 doesn't change sequentiality
488 * unless previous seeks have reduced f_seqcount to 0, in which
489 * case offset 0 is not special.
490 */
491 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
492 uio->uio_offset == fp->f_nextoff) {
493 /*
494 * f_seqcount is in units of fixed-size blocks so that it
495 * depends mainly on the amount of sequential I/O and not
496 * much on the number of sequential I/O's. The fixed size
497 * of 16384 is hard-coded here since it is (not quite) just
498 * a magic size that works well here. This size is more
499 * closely related to the best I/O size for real disks than
500 * to any block size used by software.
501 */
502 fp->f_seqcount += howmany(uio->uio_resid, 16384);
503 if (fp->f_seqcount > IO_SEQMAX)
504 fp->f_seqcount = IO_SEQMAX;
505 return (fp->f_seqcount << IO_SEQSHIFT);
506 }
507
508 /* Not sequential. Quickly draw-down sequentiality. */
509 if (fp->f_seqcount > 1)
510 fp->f_seqcount = 1;
511 else
512 fp->f_seqcount = 0;
513 return (0);
514 }
515
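/*
 * Worked example of the heuristic above: a fully sequential 64KB read
 * advances f_seqcount by howmany(65536, 16384) == 4, and the hint
 * returned is the accumulated f_seqcount << IO_SEQSHIFT, which stops
 * growing once f_seqcount saturates at IO_SEQMAX.
 */
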
516 /*
517 * Package up an I/O request on a vnode into a uio and do it.
518 */
519 int
520 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
521 enum uio_seg segflg, int ioflg, struct ucred *active_cred,
522 struct ucred *file_cred, ssize_t *aresid, struct thread *td)
523 {
524 struct uio auio;
525 struct iovec aiov;
526 struct mount *mp;
527 struct ucred *cred;
528 void *rl_cookie;
529 struct vn_io_fault_args args;
530 int error, lock_flags;
531
532 if (offset < 0 && vp->v_type != VCHR)
533 return (EINVAL);
534 auio.uio_iov = &aiov;
535 auio.uio_iovcnt = 1;
536 aiov.iov_base = base;
537 aiov.iov_len = len;
538 auio.uio_resid = len;
539 auio.uio_offset = offset;
540 auio.uio_segflg = segflg;
541 auio.uio_rw = rw;
542 auio.uio_td = td;
543 error = 0;
544
545 if ((ioflg & IO_NODELOCKED) == 0) {
546 if ((ioflg & IO_RANGELOCKED) == 0) {
547 if (rw == UIO_READ) {
548 rl_cookie = vn_rangelock_rlock(vp, offset,
549 offset + len);
550 } else {
551 rl_cookie = vn_rangelock_wlock(vp, offset,
552 offset + len);
553 }
554 } else
555 rl_cookie = NULL;
556 mp = NULL;
557 if (rw == UIO_WRITE) {
558 if (vp->v_type != VCHR &&
559 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
560 != 0)
561 goto out;
562 if (MNT_SHARED_WRITES(mp) ||
563 ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
564 lock_flags = LK_SHARED;
565 else
566 lock_flags = LK_EXCLUSIVE;
567 } else
568 lock_flags = LK_SHARED;
569 vn_lock(vp, lock_flags | LK_RETRY);
570 } else
571 rl_cookie = NULL;
572
573 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
574 #ifdef MAC
575 if ((ioflg & IO_NOMACCHECK) == 0) {
576 if (rw == UIO_READ)
577 error = mac_vnode_check_read(active_cred, file_cred,
578 vp);
579 else
580 error = mac_vnode_check_write(active_cred, file_cred,
581 vp);
582 }
583 #endif
584 if (error == 0) {
585 if (file_cred != NULL)
586 cred = file_cred;
587 else
588 cred = active_cred;
589 if (do_vn_io_fault(vp, &auio)) {
590 args.kind = VN_IO_FAULT_VOP;
591 args.cred = cred;
592 args.flags = ioflg;
593 args.args.vop_args.vp = vp;
594 error = vn_io_fault1(vp, &auio, &args, td);
595 } else if (rw == UIO_READ) {
596 error = VOP_READ(vp, &auio, ioflg, cred);
597 } else /* if (rw == UIO_WRITE) */ {
598 error = VOP_WRITE(vp, &auio, ioflg, cred);
599 }
600 }
601 if (aresid)
602 *aresid = auio.uio_resid;
603 else
604 if (auio.uio_resid && error == 0)
605 error = EIO;
606 if ((ioflg & IO_NODELOCKED) == 0) {
607 VOP_UNLOCK(vp, 0);
608 if (mp != NULL)
609 vn_finished_write(mp);
610 }
611 out:
612 if (rl_cookie != NULL)
613 vn_rangelock_unlock(vp, rl_cookie);
614 return (error);
615 }
616
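/*
 * Example caller of vn_rdwr(): read the first 512 bytes of a vnode
 * into a kernel buffer.  A minimal sketch with an illustrative helper
 * name; the caller is assumed to hold a reference on vp but not its
 * lock, since vn_rdwr() locks the vnode itself when IO_NODELOCKED is
 * not passed.
 */
#if 0
static int
example_read_header(struct vnode *vp, char *buf, struct thread *td)
{
	ssize_t resid;

	return (vn_rdwr(UIO_READ, vp, buf, 512, (off_t)0, UIO_SYSSPACE,
	    0, td->td_ucred, NOCRED, &resid, td));
}
#endif
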
617 /*
618 * Package up an I/O request on a vnode into a uio and do it. The I/O
619 * request is split up into smaller chunks and we try to avoid saturating
620 * the buffer cache while potentially holding a vnode locked, so we
621 * check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
622 * to give other processes a chance to lock the vnode (either other processes
623 * core'ing the same binary, or unrelated processes scanning the directory).
624 */
625 int
626 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
627 file_cred, aresid, td)
628 enum uio_rw rw;
629 struct vnode *vp;
630 void *base;
631 size_t len;
632 off_t offset;
633 enum uio_seg segflg;
634 int ioflg;
635 struct ucred *active_cred;
636 struct ucred *file_cred;
637 size_t *aresid;
638 struct thread *td;
639 {
640 int error = 0;
641 ssize_t iaresid;
642
643 do {
644 int chunk;
645
646 /*
647 * Force `offset' to a multiple of MAXBSIZE except possibly
648 * for the first chunk, so that filesystems only need to
649 * write full blocks except possibly for the first and last
650 * chunks.
651 */
652 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
653
654 if (chunk > len)
655 chunk = len;
656 if (rw != UIO_READ && vp->v_type == VREG)
657 bwillwrite();
658 iaresid = 0;
659 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
660 ioflg, active_cred, file_cred, &iaresid, td);
661 len -= chunk; /* aresid calc already includes length */
662 if (error)
663 break;
664 offset += chunk;
665 base = (char *)base + chunk;
666 kern_yield(PRI_USER);
667 } while (len);
668 if (aresid)
669 *aresid = len + iaresid;
670 return (error);
671 }
672
673 off_t
674 foffset_lock(struct file *fp, int flags)
675 {
676 struct mtx *mtxp;
677 off_t res;
678
679 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
680
681 #if OFF_MAX <= LONG_MAX
682 /*
683 * Caller only wants the current f_offset value. Assume that
684 	 * reads of the long and shorter integer types are atomic.
685 */
686 if ((flags & FOF_NOLOCK) != 0)
687 return (fp->f_offset);
688 #endif
689
690 /*
691 * According to McKusick the vn lock was protecting f_offset here.
692 * It is now protected by the FOFFSET_LOCKED flag.
693 */
694 mtxp = mtx_pool_find(mtxpool_sleep, fp);
695 mtx_lock(mtxp);
696 if ((flags & FOF_NOLOCK) == 0) {
697 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
698 fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
699 			msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
700 "vofflock", 0);
701 }
702 fp->f_vnread_flags |= FOFFSET_LOCKED;
703 }
704 res = fp->f_offset;
705 mtx_unlock(mtxp);
706 return (res);
707 }
708
709 void
710 foffset_unlock(struct file *fp, off_t val, int flags)
711 {
712 struct mtx *mtxp;
713
714 KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
715
716 #if OFF_MAX <= LONG_MAX
717 if ((flags & FOF_NOLOCK) != 0) {
718 if ((flags & FOF_NOUPDATE) == 0)
719 fp->f_offset = val;
720 if ((flags & FOF_NEXTOFF) != 0)
721 fp->f_nextoff = val;
722 return;
723 }
724 #endif
725
726 mtxp = mtx_pool_find(mtxpool_sleep, fp);
727 mtx_lock(mtxp);
728 if ((flags & FOF_NOUPDATE) == 0)
729 fp->f_offset = val;
730 if ((flags & FOF_NEXTOFF) != 0)
731 fp->f_nextoff = val;
732 if ((flags & FOF_NOLOCK) == 0) {
733 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
734 ("Lost FOFFSET_LOCKED"));
735 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
736 wakeup(&fp->f_vnread_flags);
737 fp->f_vnread_flags = 0;
738 }
739 mtx_unlock(mtxp);
740 }
741
742 void
743 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
744 {
745
746 if ((flags & FOF_OFFSET) == 0)
747 uio->uio_offset = foffset_lock(fp, flags);
748 }
749
750 void
751 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
752 {
753
754 if ((flags & FOF_OFFSET) == 0)
755 foffset_unlock(fp, uio->uio_offset, flags);
756 }
757
758 static int
759 get_advice(struct file *fp, struct uio *uio)
760 {
761 struct mtx *mtxp;
762 int ret;
763
764 ret = POSIX_FADV_NORMAL;
765 if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
766 return (ret);
767
768 mtxp = mtx_pool_find(mtxpool_sleep, fp);
769 mtx_lock(mtxp);
770 if (fp->f_advice != NULL &&
771 uio->uio_offset >= fp->f_advice->fa_start &&
772 uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
773 ret = fp->f_advice->fa_advice;
774 mtx_unlock(mtxp);
775 return (ret);
776 }
777
778 /*
779 * File table vnode read routine.
780 */
781 static int
782 vn_read(fp, uio, active_cred, flags, td)
783 struct file *fp;
784 struct uio *uio;
785 struct ucred *active_cred;
786 int flags;
787 struct thread *td;
788 {
789 struct vnode *vp;
790 off_t orig_offset;
791 int error, ioflag;
792 int advice;
793
794 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
795 uio->uio_td, td));
796 KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
797 vp = fp->f_vnode;
798 ioflag = 0;
799 if (fp->f_flag & FNONBLOCK)
800 ioflag |= IO_NDELAY;
801 if (fp->f_flag & O_DIRECT)
802 ioflag |= IO_DIRECT;
803 advice = get_advice(fp, uio);
804 vn_lock(vp, LK_SHARED | LK_RETRY);
805
806 switch (advice) {
807 case POSIX_FADV_NORMAL:
808 case POSIX_FADV_SEQUENTIAL:
809 case POSIX_FADV_NOREUSE:
810 ioflag |= sequential_heuristic(uio, fp);
811 break;
812 case POSIX_FADV_RANDOM:
813 /* Disable read-ahead for random I/O. */
814 break;
815 }
816 orig_offset = uio->uio_offset;
817
818 #ifdef MAC
819 error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
820 if (error == 0)
821 #endif
822 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
823 fp->f_nextoff = uio->uio_offset;
824 VOP_UNLOCK(vp, 0);
825 if (error == 0 && advice == POSIX_FADV_NOREUSE &&
826 orig_offset != uio->uio_offset)
827 /*
828 * Use POSIX_FADV_DONTNEED to flush pages and buffers
829 * for the backing file after a POSIX_FADV_NOREUSE
830 * read(2).
831 */
832 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
833 POSIX_FADV_DONTNEED);
834 return (error);
835 }
836
837 /*
838 * File table vnode write routine.
839 */
840 static int
841 vn_write(fp, uio, active_cred, flags, td)
842 struct file *fp;
843 struct uio *uio;
844 struct ucred *active_cred;
845 int flags;
846 struct thread *td;
847 {
848 struct vnode *vp;
849 struct mount *mp;
850 off_t orig_offset;
851 int error, ioflag, lock_flags;
852 int advice;
853
854 KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
855 uio->uio_td, td));
856 KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
857 vp = fp->f_vnode;
858 if (vp->v_type == VREG)
859 bwillwrite();
860 ioflag = IO_UNIT;
861 if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
862 ioflag |= IO_APPEND;
863 if (fp->f_flag & FNONBLOCK)
864 ioflag |= IO_NDELAY;
865 if (fp->f_flag & O_DIRECT)
866 ioflag |= IO_DIRECT;
867 if ((fp->f_flag & O_FSYNC) ||
868 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
869 ioflag |= IO_SYNC;
870 mp = NULL;
871 if (vp->v_type != VCHR &&
872 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
873 goto unlock;
874
875 advice = get_advice(fp, uio);
876
877 if (MNT_SHARED_WRITES(mp) ||
878 (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
879 lock_flags = LK_SHARED;
880 } else {
881 lock_flags = LK_EXCLUSIVE;
882 }
883
884 vn_lock(vp, lock_flags | LK_RETRY);
885 switch (advice) {
886 case POSIX_FADV_NORMAL:
887 case POSIX_FADV_SEQUENTIAL:
888 case POSIX_FADV_NOREUSE:
889 ioflag |= sequential_heuristic(uio, fp);
890 break;
891 case POSIX_FADV_RANDOM:
892 /* XXX: Is this correct? */
893 break;
894 }
895 orig_offset = uio->uio_offset;
896
897 #ifdef MAC
898 error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
899 if (error == 0)
900 #endif
901 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
902 fp->f_nextoff = uio->uio_offset;
903 VOP_UNLOCK(vp, 0);
904 if (vp->v_type != VCHR)
905 vn_finished_write(mp);
906 if (error == 0 && advice == POSIX_FADV_NOREUSE &&
907 orig_offset != uio->uio_offset)
908 /*
909 * Use POSIX_FADV_DONTNEED to flush pages and buffers
910 * for the backing file after a POSIX_FADV_NOREUSE
911 * write(2).
912 */
913 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
914 POSIX_FADV_DONTNEED);
915 unlock:
916 return (error);
917 }
918
919 /*
920 * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
921 * prevent the following deadlock:
922 *
923  * Assume that thread A reads from the vnode vp1 into userspace
924 * buffer buf1 backed by the pages of vnode vp2. If a page in buf1 is
925  * currently not resident, then the system ends up with the call chain
926 * vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
927 * vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
928 * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
929 * If, at the same time, thread B reads from vnode vp2 into buffer buf2
930 * backed by the pages of vnode vp1, and some page in buf2 is not
931 * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
932 *
933 * To prevent the lock order reversal and deadlock, vn_io_fault() does
934 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
935 * Instead, it first tries to do the whole range i/o with pagefaults
936 * disabled. If all pages in the i/o buffer are resident and mapped,
937 * VOP will succeed (ignoring the genuine filesystem errors).
938 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
939 * i/o in chunks, with all pages in the chunk prefaulted and held
940 * using vm_fault_quick_hold_pages().
941 *
942 * Filesystems using this deadlock avoidance scheme should use the
943 * array of the held pages from uio, saved in the curthread->td_ma,
944 * instead of doing uiomove(). A helper function
945 * vn_io_fault_uiomove() converts uiomove request into
946 * uiomove_fromphys() over td_ma array.
947 *
948 * Since vnode locks do not cover the whole i/o anymore, rangelocks
949 * make the current i/o request atomic with respect to other i/os and
950 * truncations.
951 */
952
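/*
 * A minimal sketch of how a filesystem opts into this scheme; the
 * "examplefs" names are illustrative, not part of this file.  The
 * mount routine sets MNTK_NO_IOPF, and data copies inside its
 * VOP_READ and VOP_WRITE go through vn_io_fault_uiomove(), which
 * consumes the held-page array from curthread->td_ma while page
 * faults are disabled.
 */
#if 0
static int
examplefs_mount(struct mount *mp)
{

	/* ... usual mount work ... */
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_NO_IOPF;
	MNT_IUNLOCK(mp);
	return (0);
}

static int
examplefs_read(struct vop_read_args *ap)
{
	char *data;
	int error, xfersize;

	/* ... locate data and xfersize for the current offset ... */
	error = vn_io_fault_uiomove(data, xfersize, ap->a_uio);
	return (error);
}
#endif
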
953 /*
954 * Decode vn_io_fault_args and perform the corresponding i/o.
955 */
956 static int
957 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
958 struct thread *td)
959 {
960
961 switch (args->kind) {
962 case VN_IO_FAULT_FOP:
963 return ((args->args.fop_args.doio)(args->args.fop_args.fp,
964 uio, args->cred, args->flags, td));
965 case VN_IO_FAULT_VOP:
966 if (uio->uio_rw == UIO_READ) {
967 return (VOP_READ(args->args.vop_args.vp, uio,
968 args->flags, args->cred));
969 } else if (uio->uio_rw == UIO_WRITE) {
970 return (VOP_WRITE(args->args.vop_args.vp, uio,
971 args->flags, args->cred));
972 }
973 break;
974 }
975 panic("vn_io_fault_doio: unknown kind of io %d %d", args->kind,
976 uio->uio_rw);
977 }
978
979 static int
980 vn_io_fault_touch(char *base, const struct uio *uio)
981 {
982 int r;
983
984 r = fubyte(base);
985 if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
986 return (EFAULT);
987 return (0);
988 }
989
990 static int
991 vn_io_fault_prefault_user(const struct uio *uio)
992 {
993 char *base;
994 const struct iovec *iov;
995 size_t len;
996 ssize_t resid;
997 int error, i;
998
999 KASSERT(uio->uio_segflg == UIO_USERSPACE,
1000 ("vn_io_fault_prefault userspace"));
1001
1002 error = i = 0;
1003 iov = uio->uio_iov;
1004 resid = uio->uio_resid;
1005 base = iov->iov_base;
1006 len = iov->iov_len;
1007 while (resid > 0) {
1008 error = vn_io_fault_touch(base, uio);
1009 if (error != 0)
1010 break;
1011 if (len < PAGE_SIZE) {
1012 if (len != 0) {
1013 error = vn_io_fault_touch(base + len - 1, uio);
1014 if (error != 0)
1015 break;
1016 resid -= len;
1017 }
1018 if (++i >= uio->uio_iovcnt)
1019 break;
1020 iov = uio->uio_iov + i;
1021 base = iov->iov_base;
1022 len = iov->iov_len;
1023 } else {
1024 len -= PAGE_SIZE;
1025 base += PAGE_SIZE;
1026 resid -= PAGE_SIZE;
1027 }
1028 }
1029 return (error);
1030 }
1031
1032 /*
1033 * Common code for vn_io_fault(), agnostic to the kind of i/o request.
1034 * Uses vn_io_fault_doio() to make the call to an actual i/o function.
1035 * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
1036 * into args and call vn_io_fault1() to handle faults during the user
1037 * mode buffer accesses.
1038 */
1039 static int
1040 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
1041 struct thread *td)
1042 {
1043 vm_page_t ma[io_hold_cnt + 2];
1044 struct uio *uio_clone, short_uio;
1045 struct iovec short_iovec[1];
1046 vm_page_t *prev_td_ma;
1047 vm_prot_t prot;
1048 vm_offset_t addr, end;
1049 size_t len, resid;
1050 ssize_t adv;
1051 int error, cnt, save, saveheld, prev_td_ma_cnt;
1052
1053 if (vn_io_fault_prefault) {
1054 error = vn_io_fault_prefault_user(uio);
1055 if (error != 0)
1056 return (error); /* Or ignore ? */
1057 }
1058
1059 prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
1060
1061 /*
1062 	 * UFS follows the IO_UNIT directive and rolls back both
1063 	 * uio_offset and uio_resid if an error is encountered during the
1064 	 * operation. But, since the iovec may already be advanced,
1065 	 * uio is still in an inconsistent state.
1066 *
1067 * Cache a copy of the original uio, which is advanced to the redo
1068 * point using UIO_NOCOPY below.
1069 */
1070 uio_clone = cloneuio(uio);
1071 resid = uio->uio_resid;
1072
1073 short_uio.uio_segflg = UIO_USERSPACE;
1074 short_uio.uio_rw = uio->uio_rw;
1075 short_uio.uio_td = uio->uio_td;
1076
1077 save = vm_fault_disable_pagefaults();
1078 error = vn_io_fault_doio(args, uio, td);
1079 if (error != EFAULT)
1080 goto out;
1081
1082 atomic_add_long(&vn_io_faults_cnt, 1);
1083 uio_clone->uio_segflg = UIO_NOCOPY;
1084 uiomove(NULL, resid - uio->uio_resid, uio_clone);
1085 uio_clone->uio_segflg = uio->uio_segflg;
1086
1087 saveheld = curthread_pflags_set(TDP_UIOHELD);
1088 prev_td_ma = td->td_ma;
1089 prev_td_ma_cnt = td->td_ma_cnt;
1090
1091 while (uio_clone->uio_resid != 0) {
1092 len = uio_clone->uio_iov->iov_len;
1093 if (len == 0) {
1094 KASSERT(uio_clone->uio_iovcnt >= 1,
1095 ("iovcnt underflow"));
1096 uio_clone->uio_iov++;
1097 uio_clone->uio_iovcnt--;
1098 continue;
1099 }
1100 if (len > io_hold_cnt * PAGE_SIZE)
1101 len = io_hold_cnt * PAGE_SIZE;
1102 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
1103 end = round_page(addr + len);
1104 if (end < addr) {
1105 error = EFAULT;
1106 break;
1107 }
1108 cnt = atop(end - trunc_page(addr));
1109 /*
1110 * A perfectly misaligned address and length could cause
1111 		 * both the start and the end of the chunk to use a partial
1112 		 * page. +2 accounts for such a situation.
1113 */
1114 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
1115 addr, len, prot, ma, io_hold_cnt + 2);
1116 if (cnt == -1) {
1117 error = EFAULT;
1118 break;
1119 }
1120 short_uio.uio_iov = &short_iovec[0];
1121 short_iovec[0].iov_base = (void *)addr;
1122 short_uio.uio_iovcnt = 1;
1123 short_uio.uio_resid = short_iovec[0].iov_len = len;
1124 short_uio.uio_offset = uio_clone->uio_offset;
1125 td->td_ma = ma;
1126 td->td_ma_cnt = cnt;
1127
1128 error = vn_io_fault_doio(args, &short_uio, td);
1129 vm_page_unhold_pages(ma, cnt);
1130 adv = len - short_uio.uio_resid;
1131
1132 uio_clone->uio_iov->iov_base =
1133 (char *)uio_clone->uio_iov->iov_base + adv;
1134 uio_clone->uio_iov->iov_len -= adv;
1135 uio_clone->uio_resid -= adv;
1136 uio_clone->uio_offset += adv;
1137
1138 uio->uio_resid -= adv;
1139 uio->uio_offset += adv;
1140
1141 if (error != 0 || adv == 0)
1142 break;
1143 }
1144 td->td_ma = prev_td_ma;
1145 td->td_ma_cnt = prev_td_ma_cnt;
1146 curthread_pflags_restore(saveheld);
1147 out:
1148 vm_fault_enable_pagefaults(save);
1149 free(uio_clone, M_IOV);
1150 return (error);
1151 }
1152
1153 static int
1154 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
1155 int flags, struct thread *td)
1156 {
1157 fo_rdwr_t *doio;
1158 struct vnode *vp;
1159 void *rl_cookie;
1160 struct vn_io_fault_args args;
1161 int error;
1162
1163 doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
1164 vp = fp->f_vnode;
1165 foffset_lock_uio(fp, uio, flags);
1166 if (do_vn_io_fault(vp, uio)) {
1167 args.kind = VN_IO_FAULT_FOP;
1168 args.args.fop_args.fp = fp;
1169 args.args.fop_args.doio = doio;
1170 args.cred = active_cred;
1171 args.flags = flags | FOF_OFFSET;
1172 if (uio->uio_rw == UIO_READ) {
1173 rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
1174 uio->uio_offset + uio->uio_resid);
1175 } else if ((fp->f_flag & O_APPEND) != 0 ||
1176 (flags & FOF_OFFSET) == 0) {
1177 /* For appenders, punt and lock the whole range. */
1178 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1179 } else {
1180 rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
1181 uio->uio_offset + uio->uio_resid);
1182 }
1183 error = vn_io_fault1(vp, uio, &args, td);
1184 vn_rangelock_unlock(vp, rl_cookie);
1185 } else {
1186 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
1187 }
1188 foffset_unlock_uio(fp, uio, flags);
1189 return (error);
1190 }
1191
1192 /*
1193 * Helper function to perform the requested uiomove operation using
1194  * the held pages for the uio->uio_iov[0].iov_base buffer instead of
1195 * copyin/copyout. Access to the pages with uiomove_fromphys()
1196 * instead of iov_base prevents page faults that could occur due to
1197 * pmap_collect() invalidating the mapping created by
1198 * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
1199 * object cleanup revoking the write access from page mappings.
1200 *
1201  * Filesystems that specify MNTK_NO_IOPF shall use vn_io_fault_uiomove()
1202 * instead of plain uiomove().
1203 */
1204 int
1205 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
1206 {
1207 struct uio transp_uio;
1208 struct iovec transp_iov[1];
1209 struct thread *td;
1210 size_t adv;
1211 int error, pgadv;
1212
1213 td = curthread;
1214 if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1215 uio->uio_segflg != UIO_USERSPACE)
1216 return (uiomove(data, xfersize, uio));
1217
1218 KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1219 transp_iov[0].iov_base = data;
1220 transp_uio.uio_iov = &transp_iov[0];
1221 transp_uio.uio_iovcnt = 1;
1222 if (xfersize > uio->uio_resid)
1223 xfersize = uio->uio_resid;
1224 transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
1225 transp_uio.uio_offset = 0;
1226 transp_uio.uio_segflg = UIO_SYSSPACE;
1227 /*
1228 * Since transp_iov points to data, and td_ma page array
1229 * corresponds to original uio->uio_iov, we need to invert the
1230 * direction of the i/o operation as passed to
1231 * uiomove_fromphys().
1232 */
1233 switch (uio->uio_rw) {
1234 case UIO_WRITE:
1235 transp_uio.uio_rw = UIO_READ;
1236 break;
1237 case UIO_READ:
1238 transp_uio.uio_rw = UIO_WRITE;
1239 break;
1240 }
1241 transp_uio.uio_td = uio->uio_td;
1242 error = uiomove_fromphys(td->td_ma,
1243 ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
1244 xfersize, &transp_uio);
1245 adv = xfersize - transp_uio.uio_resid;
1246 pgadv =
1247 (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
1248 (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
1249 td->td_ma += pgadv;
1250 KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1251 pgadv));
1252 td->td_ma_cnt -= pgadv;
1253 uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
1254 uio->uio_iov->iov_len -= adv;
1255 uio->uio_resid -= adv;
1256 uio->uio_offset += adv;
1257 return (error);
1258 }
1259
1260 int
1261 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
1262 struct uio *uio)
1263 {
1264 struct thread *td;
1265 vm_offset_t iov_base;
1266 int cnt, pgadv;
1267
1268 td = curthread;
1269 if ((td->td_pflags & TDP_UIOHELD) == 0 ||
1270 uio->uio_segflg != UIO_USERSPACE)
1271 return (uiomove_fromphys(ma, offset, xfersize, uio));
1272
1273 KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
1274 cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
1275 iov_base = (vm_offset_t)uio->uio_iov->iov_base;
1276 switch (uio->uio_rw) {
1277 case UIO_WRITE:
1278 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
1279 offset, cnt);
1280 break;
1281 case UIO_READ:
1282 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
1283 cnt);
1284 break;
1285 }
1286 pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
1287 td->td_ma += pgadv;
1288 KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
1289 pgadv));
1290 td->td_ma_cnt -= pgadv;
1291 uio->uio_iov->iov_base = (char *)(iov_base + cnt);
1292 uio->uio_iov->iov_len -= cnt;
1293 uio->uio_resid -= cnt;
1294 uio->uio_offset += cnt;
1295 return (0);
1296 }
1297
1298
1299 /*
1300 * File table truncate routine.
1301 */
1302 static int
1303 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1304 struct thread *td)
1305 {
1306 struct vattr vattr;
1307 struct mount *mp;
1308 struct vnode *vp;
1309 void *rl_cookie;
1310 int error;
1311
1312 vp = fp->f_vnode;
1313
1314 /*
1315 * Lock the whole range for truncation. Otherwise split i/o
1316 * might happen partly before and partly after the truncation.
1317 */
1318 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
1319 error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
1320 if (error)
1321 goto out1;
1322 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1323 if (vp->v_type == VDIR) {
1324 error = EISDIR;
1325 goto out;
1326 }
1327 #ifdef MAC
1328 error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
1329 if (error)
1330 goto out;
1331 #endif
1332 error = vn_writechk(vp);
1333 if (error == 0) {
1334 VATTR_NULL(&vattr);
1335 vattr.va_size = length;
1336 if ((fp->f_flag & O_FSYNC) != 0)
1337 vattr.va_vaflags |= VA_SYNC;
1338 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
1339 }
1340 out:
1341 VOP_UNLOCK(vp, 0);
1342 vn_finished_write(mp);
1343 out1:
1344 vn_rangelock_unlock(vp, rl_cookie);
1345 return (error);
1346 }
1347
1348 /*
1349 * File table vnode stat routine.
1350 */
1351 static int
1352 vn_statfile(fp, sb, active_cred, td)
1353 struct file *fp;
1354 struct stat *sb;
1355 struct ucred *active_cred;
1356 struct thread *td;
1357 {
1358 struct vnode *vp = fp->f_vnode;
1359 int error;
1360
1361 vn_lock(vp, LK_SHARED | LK_RETRY);
1362 error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
1363 VOP_UNLOCK(vp, 0);
1364
1365 return (error);
1366 }
1367
1368 /*
1369 * Stat a vnode; implementation for the stat syscall
1370 */
1371 int
1372 vn_stat(vp, sb, active_cred, file_cred, td)
1373 struct vnode *vp;
1374 register struct stat *sb;
1375 struct ucred *active_cred;
1376 struct ucred *file_cred;
1377 struct thread *td;
1378 {
1379 struct vattr vattr;
1380 register struct vattr *vap;
1381 int error;
1382 u_short mode;
1383
1384 AUDIT_ARG_VNODE1(vp);
1385 #ifdef MAC
1386 error = mac_vnode_check_stat(active_cred, file_cred, vp);
1387 if (error)
1388 return (error);
1389 #endif
1390
1391 vap = &vattr;
1392
1393 /*
1394 * Initialize defaults for new and unusual fields, so that file
1395 * systems which don't support these fields don't need to know
1396 * about them.
1397 */
1398 vap->va_birthtime.tv_sec = -1;
1399 vap->va_birthtime.tv_nsec = 0;
1400 vap->va_fsid = VNOVAL;
1401 vap->va_rdev = NODEV;
1402
1403 error = VOP_GETATTR(vp, vap, active_cred);
1404 if (error)
1405 return (error);
1406
1407 /*
1408 * Zero the spare stat fields
1409 */
1410 bzero(sb, sizeof *sb);
1411
1412 /*
1413 * Copy from vattr table
1414 */
1415 if (vap->va_fsid != VNOVAL)
1416 sb->st_dev = vap->va_fsid;
1417 else
1418 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
1419 sb->st_ino = vap->va_fileid;
1420 mode = vap->va_mode;
1421 switch (vap->va_type) {
1422 case VREG:
1423 mode |= S_IFREG;
1424 break;
1425 case VDIR:
1426 mode |= S_IFDIR;
1427 break;
1428 case VBLK:
1429 mode |= S_IFBLK;
1430 break;
1431 case VCHR:
1432 mode |= S_IFCHR;
1433 break;
1434 case VLNK:
1435 mode |= S_IFLNK;
1436 break;
1437 case VSOCK:
1438 mode |= S_IFSOCK;
1439 break;
1440 case VFIFO:
1441 mode |= S_IFIFO;
1442 break;
1443 default:
1444 return (EBADF);
1445 }
1446 sb->st_mode = mode;
1447 sb->st_nlink = vap->va_nlink;
1448 sb->st_uid = vap->va_uid;
1449 sb->st_gid = vap->va_gid;
1450 sb->st_rdev = vap->va_rdev;
1451 if (vap->va_size > OFF_MAX)
1452 return (EOVERFLOW);
1453 sb->st_size = vap->va_size;
1454 sb->st_atim = vap->va_atime;
1455 sb->st_mtim = vap->va_mtime;
1456 sb->st_ctim = vap->va_ctime;
1457 sb->st_birthtim = vap->va_birthtime;
1458
1459 /*
1460 * According to www.opengroup.org, the meaning of st_blksize is
1461 * "a filesystem-specific preferred I/O block size for this
1462 * object. In some filesystem types, this may vary from file
1463 * to file"
1464 	 * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
1465 */
1466
1467 sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
1468
1469 sb->st_flags = vap->va_flags;
1470 if (priv_check(td, PRIV_VFS_GENERATION))
1471 sb->st_gen = 0;
1472 else
1473 sb->st_gen = vap->va_gen;
1474
1475 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
1476 return (0);
1477 }
1478
1479 /*
1480 * File table vnode ioctl routine.
1481 */
1482 static int
1483 vn_ioctl(fp, com, data, active_cred, td)
1484 struct file *fp;
1485 u_long com;
1486 void *data;
1487 struct ucred *active_cred;
1488 struct thread *td;
1489 {
1490 struct vattr vattr;
1491 struct vnode *vp;
1492 int error;
1493
1494 vp = fp->f_vnode;
1495 switch (vp->v_type) {
1496 case VDIR:
1497 case VREG:
1498 switch (com) {
1499 case FIONREAD:
1500 vn_lock(vp, LK_SHARED | LK_RETRY);
1501 error = VOP_GETATTR(vp, &vattr, active_cred);
1502 VOP_UNLOCK(vp, 0);
1503 if (error == 0)
1504 *(int *)data = vattr.va_size - fp->f_offset;
1505 return (error);
1506 case FIONBIO:
1507 case FIOASYNC:
1508 return (0);
1509 default:
1510 return (VOP_IOCTL(vp, com, data, fp->f_flag,
1511 active_cred, td));
1512 }
1513 default:
1514 return (ENOTTY);
1515 }
1516 }
1517
1518 /*
1519 * File table vnode poll routine.
1520 */
1521 static int
1522 vn_poll(fp, events, active_cred, td)
1523 struct file *fp;
1524 int events;
1525 struct ucred *active_cred;
1526 struct thread *td;
1527 {
1528 struct vnode *vp;
1529 int error;
1530
1531 vp = fp->f_vnode;
1532 #ifdef MAC
1533 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1534 AUDIT_ARG_VNODE1(vp);
1535 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
1536 VOP_UNLOCK(vp, 0);
1537 if (!error)
1538 #endif
1539
1540 error = VOP_POLL(vp, events, fp->f_cred, td);
1541 return (error);
1542 }
1543
1544 /*
1545 * Acquire the requested lock and then check for validity. LK_RETRY
1546 * permits vn_lock to return doomed vnodes.
1547 */
1548 int
1549 _vn_lock(struct vnode *vp, int flags, char *file, int line)
1550 {
1551 int error;
1552
1553 VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
1554 ("vn_lock: no locktype"));
1555 VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count"));
1556 retry:
1557 error = VOP_LOCK1(vp, flags, file, line);
1558 flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
1559 KASSERT((flags & LK_RETRY) == 0 || error == 0,
1560 ("vn_lock: error %d incompatible with flags %#x", error, flags));
1561
1562 if ((flags & LK_RETRY) == 0) {
1563 if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
1564 VOP_UNLOCK(vp, 0);
1565 error = ENOENT;
1566 }
1567 } else if (error != 0)
1568 goto retry;
1569 return (error);
1570 }
1571
1572 /*
1573 * File table vnode close routine.
1574 */
1575 static int
1576 vn_closefile(struct file *fp, struct thread *td)
1577 {
1578 struct vnode *vp;
1579 struct flock lf;
1580 int error;
1581 bool ref;
1582
1583 vp = fp->f_vnode;
1584 fp->f_ops = &badfileops;
1585 	ref = (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
1586
1587 error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
1588
1589 if (__predict_false(ref)) {
1590 lf.l_whence = SEEK_SET;
1591 lf.l_start = 0;
1592 lf.l_len = 0;
1593 lf.l_type = F_UNLCK;
1594 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
1595 vrele(vp);
1596 }
1597 return (error);
1598 }
1599
1600 static bool
1601 vn_suspendable(struct mount *mp)
1602 {
1603
1604 return (mp->mnt_op->vfs_susp_clean != NULL);
1605 }
1606
1607 /*
1608 * Preparing to start a filesystem write operation. If the operation is
1609 * permitted, then we bump the count of operations in progress and
1610 * proceed. If a suspend request is in progress, we wait until the
1611 * suspension is over, and then proceed.
1612 */
1613 static int
1614 vn_start_write_locked(struct mount *mp, int flags)
1615 {
1616 int error, mflags;
1617
1618 mtx_assert(MNT_MTX(mp), MA_OWNED);
1619 error = 0;
1620
1621 /*
1622 * Check on status of suspension.
1623 */
1624 if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
1625 mp->mnt_susp_owner != curthread) {
1626 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
1627 (flags & PCATCH) : 0) | (PUSER - 1);
1628 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1629 if (flags & V_NOWAIT) {
1630 error = EWOULDBLOCK;
1631 goto unlock;
1632 }
1633 error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
1634 "suspfs", 0);
1635 if (error)
1636 goto unlock;
1637 }
1638 }
1639 if (flags & V_XSLEEP)
1640 goto unlock;
1641 mp->mnt_writeopcount++;
1642 unlock:
1643 if (error != 0 || (flags & V_XSLEEP) != 0)
1644 MNT_REL(mp);
1645 MNT_IUNLOCK(mp);
1646 return (error);
1647 }
1648
1649 int
1650 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
1651 {
1652 struct mount *mp;
1653 int error;
1654
1655 KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1656 ("V_MNTREF requires mp"));
1657
1658 error = 0;
1659 /*
1660 	 * If a vnode is provided, get and return the mount point to
1661 	 * which it will write.
1662 */
1663 if (vp != NULL) {
1664 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1665 *mpp = NULL;
1666 if (error != EOPNOTSUPP)
1667 return (error);
1668 return (0);
1669 }
1670 }
1671 if ((mp = *mpp) == NULL)
1672 return (0);
1673
1674 if (!vn_suspendable(mp)) {
1675 if (vp != NULL || (flags & V_MNTREF) != 0)
1676 vfs_rel(mp);
1677 return (0);
1678 }
1679
1680 /*
1681 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1682 * a vfs_ref().
1683 * As long as a vnode is not provided we need to acquire a
1684 * refcount for the provided mountpoint too, in order to
1685 * emulate a vfs_ref().
1686 */
1687 MNT_ILOCK(mp);
1688 if (vp == NULL && (flags & V_MNTREF) == 0)
1689 MNT_REF(mp);
1690
1691 return (vn_start_write_locked(mp, flags));
1692 }
1693
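/*
 * Example of the canonical bracket around a filesystem write; a
 * minimal sketch of the pattern vn_truncate() and vn_rdwr() above
 * follow:
 *
 *	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = VOP_SETATTR(vp, &vattr, cred);
 *	VOP_UNLOCK(vp, 0);
 *	vn_finished_write(mp);
 */
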
1694 /*
1695 * Secondary suspension. Used by operations such as vop_inactive
1696 * routines that are needed by the higher level functions. These
1697 * are allowed to proceed until all the higher level functions have
1698 * completed (indicated by mnt_writeopcount dropping to zero). At that
1699 * time, these operations are halted until the suspension is over.
1700 */
1701 int
1702 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
1703 {
1704 struct mount *mp;
1705 int error;
1706
1707 KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
1708 ("V_MNTREF requires mp"));
1709
1710 retry:
1711 if (vp != NULL) {
1712 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
1713 *mpp = NULL;
1714 if (error != EOPNOTSUPP)
1715 return (error);
1716 return (0);
1717 }
1718 }
1719 /*
1720 * If we are not suspended or have not yet reached suspended
1721 * mode, then let the operation proceed.
1722 */
1723 if ((mp = *mpp) == NULL)
1724 return (0);
1725
1726 if (!vn_suspendable(mp)) {
1727 if (vp != NULL || (flags & V_MNTREF) != 0)
1728 vfs_rel(mp);
1729 return (0);
1730 }
1731
1732 /*
1733 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
1734 * a vfs_ref().
1735 * As long as a vnode is not provided we need to acquire a
1736 * refcount for the provided mountpoint too, in order to
1737 * emulate a vfs_ref().
1738 */
1739 MNT_ILOCK(mp);
1740 if (vp == NULL && (flags & V_MNTREF) == 0)
1741 MNT_REF(mp);
1742 if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
1743 mp->mnt_secondary_writes++;
1744 mp->mnt_secondary_accwrites++;
1745 MNT_IUNLOCK(mp);
1746 return (0);
1747 }
1748 if (flags & V_NOWAIT) {
1749 MNT_REL(mp);
1750 MNT_IUNLOCK(mp);
1751 return (EWOULDBLOCK);
1752 }
1753 /*
1754 * Wait for the suspension to finish.
1755 */
1756 error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
1757 ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
1758 "suspfs", 0);
1759 vfs_rel(mp);
1760 if (error == 0)
1761 goto retry;
1762 return (error);
1763 }
1764
1765 /*
1766 * Filesystem write operation has completed. If we are suspending and this
1767 * operation is the last one, notify the suspender that the suspension is
1768 * now in effect.
1769 */
1770 void
1771 vn_finished_write(mp)
1772 struct mount *mp;
1773 {
1774 if (mp == NULL || !vn_suspendable(mp))
1775 return;
1776 MNT_ILOCK(mp);
1777 MNT_REL(mp);
1778 mp->mnt_writeopcount--;
1779 if (mp->mnt_writeopcount < 0)
1780 panic("vn_finished_write: neg cnt");
1781 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1782 mp->mnt_writeopcount <= 0)
1783 wakeup(&mp->mnt_writeopcount);
1784 MNT_IUNLOCK(mp);
1785 }
1786
1787
1788 /*
1789 * Filesystem secondary write operation has completed. If we are
1790 * suspending and this operation is the last one, notify the suspender
1791 * that the suspension is now in effect.
1792 */
1793 void
1794 vn_finished_secondary_write(mp)
1795 struct mount *mp;
1796 {
1797 if (mp == NULL || !vn_suspendable(mp))
1798 return;
1799 MNT_ILOCK(mp);
1800 MNT_REL(mp);
1801 mp->mnt_secondary_writes--;
1802 if (mp->mnt_secondary_writes < 0)
1803 panic("vn_finished_secondary_write: neg cnt");
1804 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
1805 mp->mnt_secondary_writes <= 0)
1806 wakeup(&mp->mnt_secondary_writes);
1807 MNT_IUNLOCK(mp);
1808 }
1809
1810
1811
1812 /*
1813 * Request a filesystem to suspend write operations.
1814 */
1815 int
1816 vfs_write_suspend(struct mount *mp, int flags)
1817 {
1818 int error;
1819
1820 MPASS(vn_suspendable(mp));
1821
1822 MNT_ILOCK(mp);
1823 if (mp->mnt_susp_owner == curthread) {
1824 MNT_IUNLOCK(mp);
1825 return (EALREADY);
1826 }
1827 while (mp->mnt_kern_flag & MNTK_SUSPEND)
1828 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
1829
1830 /*
1831 * Unmount holds a write reference on the mount point. If we
1832 * own busy reference and drain for writers, we deadlock with
1833 * the reference draining in the unmount path. Callers of
1834 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
1835 * vfs_busy() reference is owned and caller is not in the
1836 * unmount context.
1837 */
1838 if ((flags & VS_SKIP_UNMOUNT) != 0 &&
1839 (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
1840 MNT_IUNLOCK(mp);
1841 return (EBUSY);
1842 }
1843
1844 mp->mnt_kern_flag |= MNTK_SUSPEND;
1845 mp->mnt_susp_owner = curthread;
1846 if (mp->mnt_writeopcount > 0)
1847 (void) msleep(&mp->mnt_writeopcount,
1848 MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
1849 else
1850 MNT_IUNLOCK(mp);
1851 if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
1852 vfs_write_resume(mp, 0);
1853 return (error);
1854 }
1855
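/*
 * Example caller pairing for suspend/resume, roughly how a
 * snapshot-style consumer holding a vfs_busy() reference would use
 * it; a minimal sketch:
 *
 *	error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
 *	if (error == 0) {
 *		(filesystem is quiesced; do the work)
 *		vfs_write_resume(mp, 0);
 *	}
 */
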
1856 /*
1857 * Request a filesystem to resume write operations.
1858 */
1859 void
1860 vfs_write_resume(struct mount *mp, int flags)
1861 {
1862
1863 MPASS(vn_suspendable(mp));
1864
1865 MNT_ILOCK(mp);
1866 if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
1867 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
1868 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
1869 MNTK_SUSPENDED);
1870 mp->mnt_susp_owner = NULL;
1871 wakeup(&mp->mnt_writeopcount);
1872 wakeup(&mp->mnt_flag);
1873 curthread->td_pflags &= ~TDP_IGNSUSP;
1874 if ((flags & VR_START_WRITE) != 0) {
1875 MNT_REF(mp);
1876 mp->mnt_writeopcount++;
1877 }
1878 MNT_IUNLOCK(mp);
1879 if ((flags & VR_NO_SUSPCLR) == 0)
1880 VFS_SUSP_CLEAN(mp);
1881 } else if ((flags & VR_START_WRITE) != 0) {
1882 MNT_REF(mp);
1883 vn_start_write_locked(mp, 0);
1884 } else {
1885 MNT_IUNLOCK(mp);
1886 }
1887 }
1888
1889 /*
1890 * Helper loop around vfs_write_suspend() for filesystem unmount VFS
1891 * methods.
1892 */
1893 int
1894 vfs_write_suspend_umnt(struct mount *mp)
1895 {
1896 int error;
1897
1898 MPASS(vn_suspendable(mp));
1899 KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
1900 ("vfs_write_suspend_umnt: recursed"));
1901
1902 /* dounmount() already called vn_start_write(). */
1903 for (;;) {
1904 vn_finished_write(mp);
1905 error = vfs_write_suspend(mp, 0);
1906 if (error != 0) {
1907 vn_start_write(NULL, &mp, V_WAIT);
1908 return (error);
1909 }
1910 MNT_ILOCK(mp);
1911 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1912 break;
1913 MNT_IUNLOCK(mp);
1914 vn_start_write(NULL, &mp, V_WAIT);
1915 }
1916 mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
1917 wakeup(&mp->mnt_flag);
1918 MNT_IUNLOCK(mp);
1919 curthread->td_pflags |= TDP_IGNSUSP;
1920 return (0);
1921 }
1922
1923 /*
1924  * Implement kqueues for files by translating them to vnode operations.
1925 */
1926 static int
1927 vn_kqfilter(struct file *fp, struct knote *kn)
1928 {
1929
1930 return (VOP_KQFILTER(fp->f_vnode, kn));
1931 }
1932
1933 /*
1934 * Simplified in-kernel wrapper calls for extended attribute access.
1935 * Both calls pass in a NULL credential, authorizing as "kernel" access.
1936 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1937 */
1938 int
1939 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1940 const char *attrname, int *buflen, char *buf, struct thread *td)
1941 {
1942 struct uio auio;
1943 struct iovec iov;
1944 int error;
1945
1946 iov.iov_len = *buflen;
1947 iov.iov_base = buf;
1948
1949 auio.uio_iov = &iov;
1950 auio.uio_iovcnt = 1;
1951 auio.uio_rw = UIO_READ;
1952 auio.uio_segflg = UIO_SYSSPACE;
1953 auio.uio_td = td;
1954 auio.uio_offset = 0;
1955 auio.uio_resid = *buflen;
1956
1957 if ((ioflg & IO_NODELOCKED) == 0)
1958 vn_lock(vp, LK_SHARED | LK_RETRY);
1959
1960 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
1961
1962 /* authorize attribute retrieval as kernel */
1963 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
1964 td);
1965
1966 if ((ioflg & IO_NODELOCKED) == 0)
1967 VOP_UNLOCK(vp, 0);
1968
1969 if (error == 0) {
1970 *buflen = *buflen - auio.uio_resid;
1971 }
1972
1973 return (error);
1974 }
1975
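/*
 * Example caller of vn_extattr_get(): fetch a small system-namespace
 * attribute into a stack buffer.  A minimal sketch; the attribute
 * name is illustrative, and an ioflg of 0 lets the wrapper take the
 * vnode lock itself.
 */
#if 0
static int
example_get_label(struct vnode *vp, struct thread *td)
{
	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example.label", &buflen, buf, td);
	return (error);
}
#endif
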
1976 /*
1977 * XXX failure mode if partially written?
1978 */
1979 int
1980 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1981 const char *attrname, int buflen, char *buf, struct thread *td)
1982 {
1983 struct uio auio;
1984 struct iovec iov;
1985 struct mount *mp;
1986 int error;
1987
1988 iov.iov_len = buflen;
1989 iov.iov_base = buf;
1990
1991 auio.uio_iov = &iov;
1992 auio.uio_iovcnt = 1;
1993 auio.uio_rw = UIO_WRITE;
1994 auio.uio_segflg = UIO_SYSSPACE;
1995 auio.uio_td = td;
1996 auio.uio_offset = 0;
1997 auio.uio_resid = buflen;
1998
1999 if ((ioflg & IO_NODELOCKED) == 0) {
2000 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2001 return (error);
2002 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2003 }
2004
2005 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2006
2007 /* authorize attribute setting as kernel */
2008 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
2009
2010 if ((ioflg & IO_NODELOCKED) == 0) {
2011 vn_finished_write(mp);
2012 VOP_UNLOCK(vp, 0);
2013 }
2014
2015 return (error);
2016 }
2017
2018 int
2019 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
2020 const char *attrname, struct thread *td)
2021 {
2022 struct mount *mp;
2023 int error;
2024
2025 if ((ioflg & IO_NODELOCKED) == 0) {
2026 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
2027 return (error);
2028 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2029 }
2030
2031 ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
2032
2033 /* authorize attribute removal as kernel */
2034 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
2035 if (error == EOPNOTSUPP)
2036 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
2037 NULL, td);
2038
2039 if ((ioflg & IO_NODELOCKED) == 0) {
2040 vn_finished_write(mp);
2041 VOP_UNLOCK(vp, 0);
2042 }
2043
2044 return (error);
2045 }
2046
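/*
 * A minimal sketch of a kernel round trip through the three wrappers
 * above; the "example" attribute name and payload are illustrative
 * only.  Passing ioflg as 0 lets the wrappers lock the vnode and
 * bracket the writes with vn_start_write()/vn_finished_write().
 * EXTATTR_NAMESPACE_SYSTEM comes from <sys/extattr.h>.
 */
static int
example_extattr_roundtrip(struct vnode *vp, struct thread *td)
{
	char buf[16] = "payload";
	int buflen, error;

	error = vn_extattr_set(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example", sizeof(buf), buf, td);
	if (error != 0)
		return (error);

	/* On success *buflen is updated to the number of bytes read. */
	buflen = sizeof(buf);
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example", &buflen, buf, td);
	if (error != 0)
		return (error);

	return (vn_extattr_rm(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example", td));
}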
2047 static int
2048 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
2049 struct vnode **rvp)
2050 {
2051
2052 return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
2053 }
2054
2055 int
2056 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
2057 {
2058
2059 return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
2060 lkflags, rvp));
2061 }
2062
2063 int
2064 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
2065 int lkflags, struct vnode **rvp)
2066 {
2067 struct mount *mp;
2068 int ltype, error;
2069
2070 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_gen");
2071 mp = vp->v_mount;
2072 ltype = VOP_ISLOCKED(vp);
2073 KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
2074 ("vn_vget_ino: vp not locked"));
2075 error = vfs_busy(mp, MBF_NOWAIT);
2076 if (error != 0) {
2077 vfs_ref(mp);
2078 VOP_UNLOCK(vp, 0);
2079 error = vfs_busy(mp, 0);
2080 vn_lock(vp, ltype | LK_RETRY);
2081 vfs_rel(mp);
2082 if (error != 0)
2083 return (ENOENT);
2084 if (vp->v_iflag & VI_DOOMED) {
2085 vfs_unbusy(mp);
2086 return (ENOENT);
2087 }
2088 }
2089 VOP_UNLOCK(vp, 0);
2090 error = alloc(mp, alloc_arg, lkflags, rvp);
2091 vfs_unbusy(mp);
2092 if (*rvp != vp)
2093 vn_lock(vp, ltype | LK_RETRY);
2094 if (vp->v_iflag & VI_DOOMED) {
2095 if (error == 0) {
2096 if (*rvp == vp)
2097 vunref(vp);
2098 else
2099 vput(*rvp);
2100 }
2101 error = ENOENT;
2102 }
2103 return (error);
2104 }
2105
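/*
 * A minimal sketch of the classic use of vn_vget_ino(): looking up
 * ".." by inode number while the child directory "dvp" is locked,
 * without deadlocking on the parent-before-child lock order.  The
 * wrapper function and parent_ino parameter are hypothetical.
 */
static int
example_lookup_dotdot(struct vnode *dvp, ino_t parent_ino,
    struct vnode **vpp)
{

	/*
	 * vn_vget_ino() busies the mount, drops the dvp lock around
	 * VFS_VGET(), relocks dvp at its original lock type, and
	 * returns ENOENT if dvp was doomed in the meantime.
	 */
	return (vn_vget_ino(dvp, parent_ino, LK_EXCLUSIVE, vpp));
}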
2106 int
2107 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
2108 struct thread *td)
2109 {
2110
2111 if (vp->v_type != VREG || td == NULL)
2112 return (0);
2113 if ((uoff_t)uio->uio_offset + uio->uio_resid >
2114 lim_cur(td, RLIMIT_FSIZE)) {
2115 PROC_LOCK(td->td_proc);
2116 kern_psignal(td->td_proc, SIGXFSZ);
2117 PROC_UNLOCK(td->td_proc);
2118 return (EFBIG);
2119 }
2120 return (0);
2121 }
2122
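/*
 * A minimal sketch of how a filesystem's VOP_WRITE path would apply
 * the RLIMIT_FSIZE check above before growing a file; only the check
 * itself is real, the surrounding function is illustrative.
 */
static int
examplefs_write_check(struct vnode *vp, struct uio *uio, struct thread *td)
{
	int error;

	/*
	 * Posts SIGXFSZ to the process and returns EFBIG if the
	 * write would extend a regular file past the soft limit.
	 */
	error = vn_rlimit_fsize(vp, uio, td);
	if (error != 0)
		return (error);
	/* ... perform the actual write ... */
	return (0);
}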
2123 int
2124 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
2125 struct thread *td)
2126 {
2127 struct vnode *vp;
2128
2129 vp = fp->f_vnode;
2130 #ifdef AUDIT
2131 vn_lock(vp, LK_SHARED | LK_RETRY);
2132 AUDIT_ARG_VNODE1(vp);
2133 VOP_UNLOCK(vp, 0);
2134 #endif
2135 return (setfmode(td, active_cred, vp, mode));
2136 }
2137
2138 int
2139 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
2140 struct thread *td)
2141 {
2142 struct vnode *vp;
2143
2144 vp = fp->f_vnode;
2145 #ifdef AUDIT
2146 vn_lock(vp, LK_SHARED | LK_RETRY);
2147 AUDIT_ARG_VNODE1(vp);
2148 VOP_UNLOCK(vp, 0);
2149 #endif
2150 return (setfown(td, active_cred, vp, uid, gid));
2151 }
2152
2153 void
2154 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
2155 {
2156 vm_object_t object;
2157
2158 if ((object = vp->v_object) == NULL)
2159 return;
2160 VM_OBJECT_WLOCK(object);
2161 vm_object_page_remove(object, start, end, 0);
2162 VM_OBJECT_WUNLOCK(object);
2163 }
2164
2165 int
2166 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
2167 {
2168 struct vattr va;
2169 daddr_t bn, bnp;
2170 uint64_t bsize;
2171 off_t noff;
2172 int error;
2173
2174 KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
2175 ("Wrong command %lu", cmd));
2176
2177 if (vn_lock(vp, LK_SHARED) != 0)
2178 return (EBADF);
2179 if (vp->v_type != VREG) {
2180 error = ENOTTY;
2181 goto unlock;
2182 }
2183 error = VOP_GETATTR(vp, &va, cred);
2184 if (error != 0)
2185 goto unlock;
2186 noff = *off;
2187 if (noff >= va.va_size) {
2188 error = ENXIO;
2189 goto unlock;
2190 }
2191 bsize = vp->v_mount->mnt_stat.f_iosize;
2192 for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
2193 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
2194 if (error == EOPNOTSUPP) {
2195 error = ENOTTY;
2196 goto unlock;
2197 }
2198 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
2199 (bnp != -1 && cmd == FIOSEEKDATA)) {
2200 noff = bn * bsize;
2201 if (noff < *off)
2202 noff = *off;
2203 goto unlock;
2204 }
2205 }
2206 if (noff > va.va_size)
2207 noff = va.va_size;
2208 /* noff == va.va_size. There is an implicit hole at the end of file. */
2209 if (cmd == FIOSEEKDATA)
2210 error = ENXIO;
2211 unlock:
2212 VOP_UNLOCK(vp, 0);
2213 if (error == 0)
2214 *off = noff;
2215 return (error);
2216 }
2217
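/*
 * A minimal sketch of how a filesystem's VOP_IOCTL method can
 * delegate FIOSEEKDATA/FIOSEEKHOLE to the VOP_BMAP-based scan above;
 * this mirrors the pattern UFS uses, but is written out here only as
 * an illustration.
 */
static int
examplefs_ioctl(struct vop_ioctl_args *ap)
{

	switch (ap->a_command) {
	case FIOSEEKDATA:
	case FIOSEEKHOLE:
		return (vn_bmap_seekhole(ap->a_vp, (u_long)ap->a_command,
		    (off_t *)ap->a_data, ap->a_cred));
	default:
		return (ENOTTY);
	}
}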
2218 int
2219 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
2220 {
2221 struct ucred *cred;
2222 struct vnode *vp;
2223 struct vattr vattr;
2224 off_t foffset, size;
2225 int error, noneg;
2226
2227 cred = td->td_ucred;
2228 vp = fp->f_vnode;
2229 foffset = foffset_lock(fp, 0);
2230 noneg = (vp->v_type != VCHR);
2231 error = 0;
2232 switch (whence) {
2233 case L_INCR:
2234 if (noneg &&
2235 (foffset < 0 ||
2236 (offset > 0 && foffset > OFF_MAX - offset))) {
2237 error = EOVERFLOW;
2238 break;
2239 }
2240 offset += foffset;
2241 break;
2242 case L_XTND:
2243 vn_lock(vp, LK_SHARED | LK_RETRY);
2244 error = VOP_GETATTR(vp, &vattr, cred);
2245 VOP_UNLOCK(vp, 0);
2246 if (error)
2247 break;
2248
2249 /*
2250 * If the file references a disk device, then fetch
2251 * the media size and use that to determine the ending
2252 * offset.
2253 */
2254 if (vattr.va_size == 0 && vp->v_type == VCHR &&
2255 fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
2256 vattr.va_size = size;
2257 if (noneg &&
2258 (vattr.va_size > OFF_MAX ||
2259 (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
2260 error = EOVERFLOW;
2261 break;
2262 }
2263 offset += vattr.va_size;
2264 break;
2265 case L_SET:
2266 break;
2267 case SEEK_DATA:
2268 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
2269 break;
2270 case SEEK_HOLE:
2271 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
2272 break;
2273 default:
2274 error = EINVAL;
2275 }
2276 if (error == 0 && noneg && offset < 0)
2277 error = EINVAL;
2278 if (error != 0)
2279 goto drop;
2280 VFS_KNOTE_UNLOCKED(vp, 0);
2281 td->td_uretoff.tdu_off = offset;
2282 drop:
2283 foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
2284 return (error);
2285 }
2286
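/*
 * A minimal userland sketch exercising the SEEK_DATA/SEEK_HOLE cases
 * of vn_seek() above; error handling is abbreviated and the function
 * name is illustrative.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void
print_data_runs(int fd)
{
	off_t data, hole;

	for (data = lseek(fd, 0, SEEK_DATA); data != -1;
	    data = lseek(fd, hole, SEEK_DATA)) {
		/*
		 * SEEK_HOLE cannot fail with ENXIO here: there is
		 * always an implicit hole at end of file.
		 */
		hole = lseek(fd, data, SEEK_HOLE);
		printf("data [%jd, %jd)\n", (intmax_t)data, (intmax_t)hole);
	}
}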
2287 int
2288 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
2289 struct thread *td)
2290 {
2291 int error;
2292
2293 /*
2294 * Grant permission if the caller is the owner of the file, or
2295 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission
2296 	 * on the file. If the time pointer is null, then write
2297 * permission on the file is also sufficient.
2298 *
2299 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
2300 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
2301 * will be allowed to set the times [..] to the current
2302 * server time.
2303 */
2304 error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
2305 if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
2306 error = VOP_ACCESS(vp, VWRITE, cred, td);
2307 return (error);
2308 }
2309
2310 int
2311 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
2312 {
2313 struct vnode *vp;
2314 int error;
2315
2316 if (fp->f_type == DTYPE_FIFO)
2317 kif->kf_type = KF_TYPE_FIFO;
2318 else
2319 kif->kf_type = KF_TYPE_VNODE;
2320 vp = fp->f_vnode;
2321 vref(vp);
2322 FILEDESC_SUNLOCK(fdp);
2323 error = vn_fill_kinfo_vnode(vp, kif);
2324 vrele(vp);
2325 FILEDESC_SLOCK(fdp);
2326 return (error);
2327 }
2328
2329 static inline void
2330 vn_fill_junk(struct kinfo_file *kif)
2331 {
2332 size_t len, olen;
2333
2334 /*
2335 	 * Simulate vn_fullpath() returning changing values for a
2336 	 * given vp during e.g. a coredump.
2337 */
2338 len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
2339 olen = strlen(kif->kf_path);
2340 if (len < olen)
2341 strcpy(&kif->kf_path[len - 1], "$");
2342 else
2343 for (; olen < len; olen++)
2344 strcpy(&kif->kf_path[olen], "A");
2345 }
2346
2347 int
2348 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
2349 {
2350 struct vattr va;
2351 char *fullpath, *freepath;
2352 int error;
2353
2354 kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
2355 freepath = NULL;
2356 fullpath = "-";
2357 error = vn_fullpath(curthread, vp, &fullpath, &freepath);
2358 if (error == 0) {
2359 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
2360 }
2361 if (freepath != NULL)
2362 free(freepath, M_TEMP);
2363
2364 KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
2365 vn_fill_junk(kif);
2366 );
2367
2368 /*
2369 * Retrieve vnode attributes.
2370 */
2371 va.va_fsid = VNOVAL;
2372 va.va_rdev = NODEV;
2373 vn_lock(vp, LK_SHARED | LK_RETRY);
2374 error = VOP_GETATTR(vp, &va, curthread->td_ucred);
2375 VOP_UNLOCK(vp, 0);
2376 if (error != 0)
2377 return (error);
2378 if (va.va_fsid != VNOVAL)
2379 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
2380 else
2381 kif->kf_un.kf_file.kf_file_fsid =
2382 vp->v_mount->mnt_stat.f_fsid.val[0];
2383 kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
2384 kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
2385 kif->kf_un.kf_file.kf_file_size = va.va_size;
2386 kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
2387 return (0);
2388 }
2389
2390 int
2391 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
2392 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
2393 struct thread *td)
2394 {
2395 #ifdef HWPMC_HOOKS
2396 struct pmckern_map_in pkm;
2397 #endif
2398 struct mount *mp;
2399 struct vnode *vp;
2400 vm_object_t object;
2401 vm_prot_t maxprot;
2402 boolean_t writecounted;
2403 int error;
2404
2405 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
2406 defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
2407 /*
2408 * POSIX shared-memory objects are defined to have
2409 * kernel persistence, and are not defined to support
2410 * read(2)/write(2) -- or even open(2). Thus, we can
2411 	 * use MAP_NOSYNC to trade on-disk coherence for speed.
2412 * The shm_open(3) library routine turns on the FPOSIXSHM
2413 * flag to request this behavior.
2414 */
2415 if ((fp->f_flag & FPOSIXSHM) != 0)
2416 flags |= MAP_NOSYNC;
2417 #endif
2418 vp = fp->f_vnode;
2419
2420 /*
2421 * Ensure that file and memory protections are
2422 * compatible. Note that we only worry about
2423 * writability if mapping is shared; in this case,
2424 * current and max prot are dictated by the open file.
2425 * XXX use the vnode instead? Problem is: what
2426 * credentials do we use for determination? What if
2427 * proc does a setuid?
2428 */
2429 mp = vp->v_mount;
2430 if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
2431 maxprot = VM_PROT_NONE;
2432 if ((prot & VM_PROT_EXECUTE) != 0)
2433 return (EACCES);
2434 } else
2435 maxprot = VM_PROT_EXECUTE;
2436 if ((fp->f_flag & FREAD) != 0)
2437 maxprot |= VM_PROT_READ;
2438 else if ((prot & VM_PROT_READ) != 0)
2439 return (EACCES);
2440
2441 /*
2442 * If we are sharing potential changes via MAP_SHARED and we
2443 * are trying to get write permission although we opened it
2444 * without asking for it, bail out.
2445 */
2446 if ((flags & MAP_SHARED) != 0) {
2447 if ((fp->f_flag & FWRITE) != 0)
2448 maxprot |= VM_PROT_WRITE;
2449 else if ((prot & VM_PROT_WRITE) != 0)
2450 return (EACCES);
2451 } else {
2452 maxprot |= VM_PROT_WRITE;
2453 cap_maxprot |= VM_PROT_WRITE;
2454 }
2455 maxprot &= cap_maxprot;
2456
2457 /*
2458 * For regular files and shared memory, POSIX requires that
2459 * the value of foff be a legitimate offset within the data
2460 * object. In particular, negative offsets are invalid.
2461 * Blocking negative offsets and overflows here avoids
2462 * possible wraparound or user-level access into reserved
2463 * ranges of the data object later. In contrast, POSIX does
2464 * not dictate how offsets are used by device drivers, so in
2465 * the case of a device mapping a negative offset is passed
2466 * on.
2467 */
2468 if (
2469 #ifdef _LP64
2470 size > OFF_MAX ||
2471 #endif
2472 foff < 0 || foff > OFF_MAX - size)
2473 return (EINVAL);
2474
2475 writecounted = FALSE;
2476 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
2477 &foff, &object, &writecounted);
2478 if (error != 0)
2479 return (error);
2480 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
2481 foff, writecounted, td);
2482 if (error != 0) {
2483 /*
2484 * If this mapping was accounted for in the vnode's
2485 * writecount, then undo that now.
2486 */
2487 if (writecounted)
2488 vnode_pager_release_writecount(object, 0, size);
2489 vm_object_deallocate(object);
2490 }
2491 #ifdef HWPMC_HOOKS
2492 /* Inform hwpmc(4) if an executable is being mapped. */
2493 if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
2494 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
2495 pkm.pm_file = vp;
2496 pkm.pm_address = (uintptr_t) *addr;
2497 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
2498 }
2499 }
2500 #endif
2501 return (error);
2502 }
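/*
 * A minimal userland sketch of the protection rules enforced above:
 * with a descriptor opened O_RDONLY, a MAP_SHARED + PROT_WRITE
 * mapping fails with EACCES, while MAP_PRIVATE succeeds because
 * private writes never reach the file.  "path" is assumed to name a
 * regular file at least one page long.
 */
#include <sys/mman.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static void
demo_mmap_prot(const char *path)
{
	void *p;
	int fd;

	fd = open(path, O_RDONLY);
	assert(fd != -1);

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	assert(p == MAP_FAILED && errno == EACCES);

	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	assert(p != MAP_FAILED);
	munmap(p, 4096);
	close(fd);
}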