/*-
 * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
 *
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_directio.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

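/*
 * Evaluate to true when ptr is suitably aligned for an object of type s.
 */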
#define	ALIGNED_TO(ptr, s)	\
	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_getpages_async_t	ffs_getpages_async;
static vop_lock1_t	ffs_lock;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs (UFS1). */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs (UFS2, with extended attributes). */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list.  Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp, *nbp;
	ufs_lbn_t lbn;
	int error, passes;
	bool still_dirty, wait;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = false;	/* Always do an async pass first. */
	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/*
		 * Flush indirects in order, if requested.
		 *
		 * Note that if only datasync is requested, we can
		 * skip indirect blocks when softupdates are not
		 * active.  Otherwise we must flush them with data,
		 * since dependencies prevent data block writes.
		 */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
		    (lbn_level(bp->b_lblkno) >= passes ||
		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				BO_LOCK(bo);
				bp->b_vflags &= ~BV_SCANNED;
				goto next_locked;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
next_locked:
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		if ((flags & DATA_ONLY) == 0) {
			still_dirty = true;
		} else {
			/*
			 * For data-only sync, dirty indirect buffers
			 * are ignored.
			 */
			still_dirty = false;
			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
				if (bp->b_lblkno > -UFS_NDADDR) {
					still_dirty = true;
					break;
				}
			}
		}

		if (still_dirty) {
			/* Write the inode after sync passes to flush deps. */
			if (wait && DOINGSOFTDEP(vp) &&
			    (flags & NO_INO_UPDT) == 0) {
				BO_UNLOCK(bo);
				ffs_update(vp, 1);
				BO_LOCK(bo);
			}
			/* Switch between sync/async. */
			wait = !wait;
			if (wait || ++passes < UFS_NIADDR + 2)
				goto loop;
		}
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & DATA_ONLY) == 0) {
		if ((flags & NO_INO_UPDT) == 0)
			error = ffs_update(vp, 1);
		if (DOINGSUJ(vp))
			softdep_journal_fsync(VTOI(vp));
	} else if ((ip->i_flag & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
		error = ffs_update(vp, 1);
	}
	return (error);
}

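/*
 * VOP_FDATASYNC(9) implementation: like fsync, but only the file data
 * and the metadata needed to retrieve that data must be flushed.
 */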
static int
ffs_fdatasync(struct vop_fdatasync_args *ap)
{

	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
}

static int
ffs_lock(
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap)
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

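/*
 * Copy zeroed bytes into uio for a read of a hole (an unallocated
 * block), drawing from the kernel's preallocated zero_region instead
 * of allocating and clearing a buffer.
 */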
static int
ffs_read_hole(struct uio *uio, long xfersize, long *size)
{
	ssize_t saved_resid, tlen;
	int error;

	while (xfersize > 0) {
		tlen = min(xfersize, ZERO_REGION_SIZE);
		saved_resid = uio->uio_resid;
		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
		    tlen, uio);
		if (error != 0)
			return (error);
		tlen = saved_resid - uio->uio_resid;
		xfersize -= tlen;
		*size -= tlen;
	}
	return (0);
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap)
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int bflag, error, ioflag, seqcount;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

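	/*
	 * Ask for unmapped buffers where possible.  GB_NOSPARSE makes
	 * bread_gb() return EJUSTRETURN for a hole instead of
	 * instantiating the block; the hole is then zero-filled by
	 * ffs_read_hole() below.  The pager's UIO_NOCOPY reads omit
	 * GB_NOSPARSE.
	 */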
	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one filesystem block minus the amount of data
		 * preceding our start point.
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, bflag, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
		}
		if (error == EJUSTRETURN) {
			error = ffs_read_hole(uio, xfersize, &size);
			if (error == 0)
				continue;
		}
		if (error != 0) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap)
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ITOFS(ip);
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

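	/*
	 * Clamp the sequential-access hint and encode it into the flags
	 * passed down to UFS_BALLOC(), together with BA_UNMAPPED.
	 */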
	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
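			/*
			 * Undo the optimistic extension of the pager size
			 * made before the allocation attempt.
			 */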
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
			ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if (buf_mapped(bp)) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page deficiency, write
		 * the buffer asynchronously.  Otherwise try to cluster,
		 * and if that doesn't do it then either do an async write
		 * (if O_DIRECT), or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one filesystem block minus the amount of data
		 * preceding our start point.
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

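		/*
		 * The extended attribute area is addressed with negative
		 * logical block numbers: ext block N is read as -1 - N
		 * (see ffsext_strategy() below).
		 */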
		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the
			 * extended attribute data.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;
		vfs_bio_brelse(bp, ioflag);
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL)
		vfs_bio_brelse(bp, ioflag);
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid >
	    UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if (ioflag & IO_SYNC)
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);

		if (uio->uio_offset + xfersize > dp->di_extsize) {
			dp->di_extsize = uio->uio_offset + xfersize;
			ip->i_flag |= IN_SIZEMOD | IN_CHANGE;
		}

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);

		vfs_bio_set_flags(bp, ioflag);

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page deficiency, write
		 * the buffer asynchronously.  Otherwise try to cluster,
		 * and if that doesn't do it then either do an async write
		 * (if O_DIRECT), or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA's content, and optionally pointers to the entry and
 * to its content.  Returns -1 if the attribute is not found.
 */
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    struct extattr **eapp, u_char **eac)
{
	struct extattr *eap, *eaend;
	size_t nlen;

	nlen = strlen(name);
	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
	eap = (struct extattr *)ptr;
	eaend = (struct extattr *)(ptr + length);
	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
		KASSERT(EXTATTR_NEXT(eap) <= eaend,
		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
		    || memcmp(eap->ea_name, name, nlen) != 0)
			continue;
		if (eapp != NULL)
			*eapp = eap;
		if (eac != NULL)
			*eac = EXTATTR_CONTENT(eap);
		return (EXTATTR_CONTENT_SIZE(eap));
	}
	return (-1);
}

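/*
 * Read the extended attribute area of vp into a freshly allocated
 * buffer and validate the entries it contains; on success the caller
 * owns the buffer returned through *p and must free(9) it (M_TEMP).
 */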
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td)
{
	const struct extattr *eap, *eaend, *eapnext;
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	/* Validate disk xattrfile contents. */
	for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
	    eap = eapnext) {
		eapnext = EXTATTR_NEXT(eap);
		/* Bogusly short entry or bogusly long entry. */
		if (eap->ea_length < sizeof(*eap) || eapnext > eaend) {
			free(eae, M_TEMP);
			return (EINTEGRITY);
		}
	}
	*p = eae;
	return (0);
}

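/*
 * Serialize access to the in-memory copy of the extended attribute
 * area: take the per-inode IN_EA_LOCKED flag, sleeping on i_ea_refs
 * while another thread holds it.
 */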
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

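/*
 * Drop the extended attribute area lock and wake up any waiters.
 */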
static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

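/*
 * Start an extended attribute transaction: read the EA area into the
 * in-memory cache, or just take another reference if it is already
 * cached.
 */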
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred,
    struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap;
	uint32_t ul;
	int olen, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	/* CEM: delete could be done in-place instead */
	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	ul = eap->ea_length;
	i = (u_char *)EXTATTR_NEXT(eap) - eae;
	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
	easize -= ul;

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to list extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct extattr *eap, *eaend;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

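	/*
	 * The returned list is a sequence of (one-byte length, name)
	 * pairs; each pair is produced by the single uiomove() below,
	 * since ea_name directly follows ea_namelength in the entry.
	 */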
	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;

	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
	eap = (struct extattr *)ip->i_ea_area;
	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
		KASSERT(EXTATTR_NEXT(eap) <= eaend,
		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
		if (eap->ea_namespace != ap->a_attrnamespace)
			continue;

		ealen = eap->ea_namelength;
		if (ap->a_size != NULL)
			*ap->a_size += ealen + 1;
		else if (ap->a_uio != NULL)
			error = uiomove(&eap->ea_namelength, ealen + 1,
			    ap->a_uio);
	}

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	struct extattr *eap;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae;
	void *tmp;

	ip = VTOI(ap->a_vp);
	fs = ITOFS(ip);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

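	/*
	 * An EA entry consists of a 32-bit length, three one-byte fields
	 * (namespace, content pad length, name length), the name padded
	 * out to an 8-byte boundary, and the content padded likewise.
	 */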
	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = roundup2(ealength, 8) - ealength;
	eapad2 = roundup2(ealen, 8) - ealen;
	ealength += eapad1 + ealen + eapad2;

	/*
	 * CEM: rewrites of the same size or smaller could be done in-place
	 * instead.  (We don't acquire any fine-grained locks in here either,
	 * so we could also do bigger writes in-place.)
	 */
	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &eap, NULL);
	if (olen == -1) {
		/* new, append at end */
		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
		    ("unaligned"));
		eap = (struct extattr *)(eae + easize);
		easize += ealength;
	} else {
		ul = eap->ea_length;
		i = (u_char *)EXTATTR_NEXT(eap) - eae;
		if (ul != ealength) {
			bcopy(EXTATTR_NEXT(eap), (u_char *)eap + ealength,
			    easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, UFS_NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	eap->ea_length = ealength;
	eap->ea_namespace = ap->a_attrnamespace;
	eap->ea_contentpadlen = eapad2;
	eap->ea_namelength = strlen(ap->a_name);
	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	bzero((u_char *)EXTATTR_CONTENT(eap) + ealen, eapad2);

	tmp = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(tmp, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}

SYSCTL_DECL(_vfs_ffs);
static int use_buf_pager = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
    "Always use buffer pager instead of bmap");

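/*
 * Buffer-pager callback: translate a byte offset into the logical
 * block number that contains it.
 */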
static daddr_t
ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
{

	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
}

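/*
 * Buffer-pager callback: report the size of the given logical block,
 * which may be a fragment at the end of the file.
 */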
static int
ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
{

	*sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
	return (0);
}

static int
ffs_getpages(struct vop_getpages_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);

	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
}

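/*
 * Asynchronous variant of ffs_getpages().  The generic pager path
 * invokes the iodone callback itself on success, while the
 * synchronous buffer-pager fallback requires us to call it here.
 */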
static int
ffs_getpages_async(struct vop_getpages_async_args *ap)
{
	struct vnode *vp;
	struct ufsmount *um;
	bool do_iodone;
	int error;

	vp = ap->a_vp;
	um = VFSTOUFS(vp->v_mount);
	do_iodone = true;

	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
		error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
		if (error == 0)
			do_iodone = false;
	} else {
		error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
		    ffs_gbp_getblksz);
	}
	if (do_iodone && ap->a_iodone != NULL)
		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);

	return (error);
}