/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 * $FreeBSD: releng/5.0/sys/ufs/ffs/ffs_vnops.c 105422 2002-10-18 22:52:41Z dillon $
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

static int	ffs_fsync(struct vop_fsync_args *);
static int	ffs_getpages(struct vop_getpages_args *);
static int	ffs_read(struct vop_read_args *);
static int	ffs_write(struct vop_write_args *);
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static int	ffsext_strategy(struct vop_strategy_args *);
static int	ffs_closeextattr(struct vop_closeextattr_args *);
static int	ffs_getextattr(struct vop_getextattr_args *);
static int	ffs_openextattr(struct vop_openextattr_args *);
static int	ffs_setextattr(struct vop_setextattr_args *);


/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperate },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_getpages_desc,		(vop_t *) ffs_getpages },
	{ &vop_read_desc,		(vop_t *) ffs_read },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_write_desc,		(vop_t *) ffs_write },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
	{ &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatespec },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
	{ &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
	{ &vop_default_desc,		(vop_t *) ufs_vnoperatefifo },
	{ &vop_fsync_desc,		(vop_t *) ffs_fsync },
	{ &vop_reallocblks_desc,	(vop_t *) ffs_reallocblks },
	{ &vop_strategy_desc,		(vop_t *) ffsext_strategy },
	{ &vop_closeextattr_desc,	(vop_t *) ffs_closeextattr },
	{ &vop_getextattr_desc,		(vop_t *) ffs_getextattr },
	{ &vop_openextattr_desc,	(vop_t *) ffs_openextattr },
	{ &vop_setextattr_desc,		(vop_t *) ffs_setextattr },
	{ NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
	{ &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);
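
/*
 * Each vnodeopv_desc above pairs an operations-vector pointer with its
 * entry table; VNODEOP_SET() registers the table so that the VFS fills
 * in the vector at initialization.  An operation listed here (e.g.
 * vop_read) resolves to the FFS implementation, while anything missing
 * falls through to the ufs_vnoperate*() defaults.  A minimal sketch of
 * the dispatch, assuming the standard vnode_if glue (VDESC/VOFFSET/VCALL):
 *
 *	struct vop_read_args va;
 *
 *	va.a_desc = VDESC(vop_read);
 *	va.a_vp = vp; va.a_uio = uio; va.a_ioflag = 0; va.a_cred = cred;
 *	error = VCALL(vp, VOFFSET(vop_read), &va);  (dispatches to ffs_read)
 */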

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);
	struct buf *bp;
	struct buf *nbp;
	int s, error, wait, passes, skipmeta;
	ufs_lbn_t lbn;

	wait = (ap->a_waitfor == MNT_WAIT);
	if (vn_isdisk(vp, NULL)) {
		lbn = INT_MAX;
		if (vp->v_rdev->si_mountpoint != NULL &&
		    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
			softdep_fsync_mountdev(vp);
	} else {
		lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	}

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (wait)
		skipmeta = 1;
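	/*
	 * For a synchronous flush (skipmeta == 1) the first pass writes
	 * only data blocks; metadata buffers (negative logical block
	 * numbers) are picked up on the second pass, after the data they
	 * reference.  Up to NIADDR + 1 further passes retry buffers that
	 * are redirtied while we sleep.
	 */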
	s = splbio();
	VI_LOCK(vp);
loop:
	TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
		bp->b_flags &= ~B_SCANNED;
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, this pass is the first time through on a
		 * synchronous flush request and the buffer being considered
		 * is metadata, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_flags & B_SCANNED) != 0)
			continue;
		bp->b_flags |= B_SCANNED;
		if ((skipmeta == 1 && bp->b_lblkno < 0))
			continue;
		if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
		    (bp->b_flags & B_DEFERRED) == 0 &&
		    buf_countdeps(bp, 0)) {
			bp->b_flags |= B_DEFERRED;
			continue;
		}
		VI_UNLOCK(vp);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			VI_LOCK(vp);
			continue;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		if (vp != bp->b_vp)
			panic("ffs_fsync: vp != bp->b_vp");
		/*
		 * If this is a synchronous flush request, or it is not a
		 * file or device, start the write on this buffer immediately.
		 */
		if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

			/*
			 * On our final pass through, do all I/O synchronously
			 * so that we can find out if our flush is failing
			 * because of write errors.
			 */
			if (passes > 0 || !wait) {
				if ((bp->b_flags & B_CLUSTEROK) && !wait) {
					BUF_UNLOCK(bp);
					(void) vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					splx(s);
					(void) bawrite(bp);
					s = splbio();
				}
			} else {
				bremfree(bp);
				splx(s);
				if ((error = bwrite(bp)) != 0)
					return (error);
				s = splbio();
			}
		} else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
			/*
			 * If the buffer is for data that has been truncated
			 * off the file, then throw it away.
			 */
			bremfree(bp);
			bp->b_flags |= B_INVAL | B_NOCACHE;
			splx(s);
			brelse(bp);
			s = splbio();
		} else {
			BUF_UNLOCK(bp);
			vfs_bio_awrite(bp);
		}
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		VI_LOCK(vp);
		nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
	}
	/*
	 * If we were asked to do this synchronously, then go back for
	 * another pass, this time doing the metadata.
	 */
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if (wait) {
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
			    PRIBIO + 4, "ffsfsn", 0);
		}
		VI_UNLOCK(vp);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		splx(s);
		if ((error = softdep_sync_metadata(ap)) != 0)
			return (error);
		s = splbio();

		VI_LOCK(vp);
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean.  Thus we give block devices a
			 * good effort, then just give up.  For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes -= 1;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (!vn_isdisk(vp, NULL))
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}
	VI_UNLOCK(vp);
	splx(s);
	return (UFS_UPDATE(vp, wait));
}


/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;
	int seqcount;
	int ioflag;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif

	GIANT_REQUIRED;

	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	fs = ip->i_fs;
	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
		return (EFBIG);

	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	object = vp->v_object;

	bytesinfile = ip->i_size - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	if (object) {
		vm_object_reference(object);
	}

#ifdef ENABLE_VFS_IOOPT
	/*
	 * If IO optimisation is turned on,
	 * and we are NOT a VM based IO request,
	 * (i.e. not headed for the buffer cache)
	 * but there IS a vm object associated with it.
	 */
	if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
		int nread, toread;

		toread = uio->uio_resid;
		if (toread > bytesinfile)
			toread = bytesinfile;
		if (toread >= PAGE_SIZE) {
			/*
			 * Then if it's at least a page in size, try
			 * get the data from the object using vm tricks
			 */
			error = uioread(toread, uio, object, &nread);
			if ((uio->uio_resid == 0) || (error != 0)) {
				/*
				 * If we finished or there was an error
				 * then finish up (the reference previously
				 * obtained on object must be released).
				 */
				if ((error == 0 ||
				    uio->uio_resid != orig_resid) &&
				    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
					ip->i_flag |= IN_ACCESS;

				if (object) {
					vm_object_vndeallocate(object);
				}
				return error;
			}
		}
	}
#endif

	/*
	 * Ok so we couldn't do it all in one vm trick...
	 * so cycle around trying smaller bites..
	 */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
#ifdef ENABLE_VFS_IOOPT
		if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
			/*
			 * Obviously we didn't finish above, but we
			 * didn't get an error either.  Try the same trick
			 * again, but this time we are looping.
			 */
			int nread, toread;
			toread = uio->uio_resid;
			if (toread > bytesinfile)
				toread = bytesinfile;

			/*
			 * Once again, if there isn't enough for a
			 * whole page, don't try optimising.
			 */
			if (toread >= PAGE_SIZE) {
				error = uioread(toread, uio, object, &nread);
				if ((uio->uio_resid == 0) || (error != 0)) {
					/*
					 * If we finished or there was an
					 * error then finish up (the reference
					 * previously obtained on object must
					 * be released).
					 */
					if ((error == 0 ||
					    uio->uio_resid != orig_resid) &&
					    (vp->v_mount->mnt_flag &
					    MNT_NOATIME) == 0)
						ip->i_flag |= IN_ACCESS;
					if (object) {
						vm_object_vndeallocate(object);
					}
					return error;
				}
				/*
				 * To get here we didn't finish or err.
				 * If we did get some data,
				 * loop to try another bite.
				 */
				if (nread > 0) {
					continue;
				}
			}
		}
#endif

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;
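
		/*
		 * Worked example (assuming fs_bsize == 16384): a read at
		 * offset 20000 maps to lbn 1 with blkoffset 3616, so
		 * xfersize starts at 16384 - 3616 = 12768 and is then
		 * clamped by the caller's residual count and by the bytes
		 * remaining in the file.
		 */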

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

#ifdef ENABLE_VFS_IOOPT
		if (vfs_ioopt && object &&
		    (bp->b_flags & B_VMIO) &&
		    ((blkoffset & PAGE_MASK) == 0) &&
		    ((xfersize & PAGE_MASK) == 0)) {
			/*
			 * If VFS IO optimisation is turned on,
			 * and it's an exact page multiple,
			 * and a normal VM based op,
			 * then use uiomoveco().
			 */
			error =
			    uiomoveco((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio, object, 0);
		} else
#endif
		{
			/*
			 * otherwise use the general form
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
			    (int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object) {
		vm_object_vndeallocate(object);
	}
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	struct thread *td;
	ufs_lbn_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	GIANT_REQUIRED;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object) {
		vm_object_reference(object);
	}

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("ffswrite: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object) {
				vm_object_vndeallocate(object);
			}
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffswrite: dir write");
		break;
	default:
		panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
		    (int)uio->uio_offset,
		    (int)uio->uio_resid
		);
	}

	fs = ip->i_fs;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		PROC_LOCK(td->td_proc);
		psignal(td->td_proc, SIGXFSZ);
		PROC_UNLOCK(td->td_proc);
		if (object) {
			vm_object_vndeallocate(object);
		}
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;
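
	/*
	 * The sequential-access hint travels to UFS_BALLOC() in the upper
	 * bits of "flags": seqcount is clamped to BA_SEQMAX before being
	 * shifted by BA_SEQSHIFT, which keeps it clear of the BA_* and
	 * IO_* flag bits in the low part of the word.
	 */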

#ifdef ENABLE_VFS_IOOPT
	if (object && (object->flags & OBJ_OPT)) {
		vm_freeze_copyopts(object,
		    OFF_TO_IDX(uio->uio_offset),
		    OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
	}
#endif
	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP(ip, i_size) = ip->i_size;
			extended = 1;
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred &&
	    suser_cred(ap->a_cred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		DIP(ip, i_mode) = ip->i_mode;
	}
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC),
			    ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);

	if (object) {
		vm_object_vndeallocate(object);
	}

	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	off_t foff, physoffset;
	int i, size, bsize;
	struct vnode *dp, *vp;
	vm_object_t obj;
	vm_pindex_t pindex, firstindex;
	vm_page_t mreq;
	int bbackwards, bforwards;
	int pbackwards, pforwards;
	int firstpage;
	ufs2_daddr_t reqblkno, reqlblkno;
	int poff;
	int pcount;
	int rtval;
	int pagesperblock;

	GIANT_REQUIRED;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];
	firstindex = ap->a_m[0]->pindex;

	/*
	 * If ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		vm_page_unlock_queues();
		return VM_PAGER_OK;
	}

	vp = ap->a_vp;
	obj = vp->v_object;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pindex = mreq->pindex;
	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

	if (bsize < PAGE_SIZE)
		return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
		    ap->a_count,
		    ap->a_reqpage);

	/*
	 * foff is the file offset of the required page
	 * reqlblkno is the logical block that contains the page
	 * poff is the index of the page into the logical block
	 */
	reqlblkno = foff / bsize;
	poff = (foff % bsize) / PAGE_SIZE;
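
	/*
	 * Example (assuming bsize == 16384 and PAGE_SIZE == 4096): a
	 * request at file offset 36864 yields reqlblkno 2 and poff 1,
	 * i.e. the second page within logical block 2.
	 */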

	dp = VTOI(vp)->i_devvp;
	if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
	    || (reqblkno == -1)) {
		vm_page_lock_queues();
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage)
				vm_page_free(ap->a_m[i]);
		}
		vm_page_unlock_queues();
		if (reqblkno == -1) {
			if ((mreq->flags & PG_ZERO) == 0)
				pmap_zero_page(mreq);
			vm_page_undirty(mreq);
			mreq->valid = VM_PAGE_BITS_ALL;
			return VM_PAGER_OK;
		} else {
			return VM_PAGER_ERROR;
		}
	}

	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
	pagesperblock = bsize / PAGE_SIZE;
	/*
	 * find the first page that is contiguous...
	 * note that pbackwards is the number of pages that are contiguous
	 * backwards.
	 */
	firstpage = 0;
	if (ap->a_count) {
		pbackwards = poff + bbackwards * pagesperblock;
		if (ap->a_reqpage > pbackwards) {
			firstpage = ap->a_reqpage - pbackwards;
			vm_page_lock_queues();
			for (i = 0; i < firstpage; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
		}

		/*
		 * pforwards is the number of pages that are contiguous
		 * after the current page.
		 */
		pforwards = (pagesperblock - (poff + 1)) +
		    bforwards * pagesperblock;
		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
			vm_page_lock_queues();
			for (i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
				vm_page_free(ap->a_m[i]);
			vm_page_unlock_queues();
			pcount = ap->a_reqpage + pforwards + 1;
		}

		/*
		 * number of pages for I/O corrected for the non-contig pages at
		 * the beginning of the array.
		 */
		pcount -= firstpage;
	}

	/*
	 * calculate the size of the transfer
	 */

	size = pcount * PAGE_SIZE;

	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
	    obj->un_pager.vnp.vnp_size)
		size = obj->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(ap->a_m[firstpage]->pindex);

	physoffset -= foff;
	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	    (ap->a_reqpage - firstpage), physoffset);

	return (rtval);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	mode_t mode;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	mode = ip->i_mode;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");

#endif
	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	bytesinfile = dp->di_extsize - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return 0;
	}

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type ( fragment or full block,
		 * depending ).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	int blkoffset, error, flags, resid, size, xfersize;

	GIANT_REQUIRED;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ext_write: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;

	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    xfersize + blkoffset == fs->fs_bsize ||
		    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ucred &&
	    suser_cred(ucred, PRISON_ROOT)) {
		ip->i_mode &= ~(ISUID | ISGID);
		dp->di_mode = ip->i_mode;
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);
	return (error);
}


/*
 * Helper to locate a named extended attribute.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
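
/*
 * Layout of a single EA record, as reconstructed from the parser below
 * (byte counts; the pads align the content and the next record on 8-byte
 * boundaries):
 *
 *	uint32_t length;     total record length, pads included
 *	uint8_t  namespace;  EXTATTR_NAMESPACE_* of the attribute
 *	uint8_t  pad2len;    eapad2: zero bytes trailing the content
 *	uint8_t  namelen;    length of the name that follows
 *	char     name[];     attribute name, zero-padded (eapad1) so the
 *	                     content starts on an 8-byte boundary
 *	u_char   content[];  value: length - header - eapad1 - eapad2 bytes
 */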
static int
ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct fs *fs;
	struct ufs2_dinode *dp;
	struct uio luio;
	struct iovec liovec;
	int easize, error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct fs *fs;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;

	if (ip->i_ea_area != NULL)
		return (EBUSY);
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error)
		return (error);
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	return (0);
}
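
/*
 * The EA "transaction" model: ffs_open_ea() snapshots the whole EA area
 * into malloc(9)ed memory (ip->i_ea_area), the get/set operations below
 * work on that copy, and ffs_close_ea() either writes the copy back
 * (commit) or simply discards it (abort).
 */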

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);
	fs = ip->i_fs;
	if (ip->i_ea_area == NULL)
		return (EINVAL);
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred, td);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	free(ip->i_ea_area, M_TEMP);
	ip->i_ea_area = NULL;
	ip->i_ea_len = 0;
	ip->i_ea_error = 0;
	return (error);
}

/*
 * Vnode extattr strategy routine for special devices and fifos.
 *
 * We need to check for a read or write of the extended attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
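	/*
	 * Extended attribute blocks are addressed with negative logical
	 * block numbers (-1 - lbn, as set up in ffs_extread() above), so
	 * a UFS2 buffer in the range [-NXADDR, -1] belongs to the EA area
	 * and takes the regular vnode path instead of the device or fifo
	 * path.
	 */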
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	if (vp->v_type == VFIFO)
		return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
	return (ufs_vnoperatespec((struct vop_generic_args *)ap));
}

/*
 * Vnode extattr transaction start.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}


/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;
	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));
	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}


/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	u_char *eae, *p, *pe, *pn;
	struct ufs2_dinode *dp;
	unsigned easize;
	uint32_t ul;
	int error, ealen, stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IREAD);
	if (error)
		return (error);

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}
	dp = ip->i_din2;
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;
	if (strlen(ap->a_name) > 0) {
		ealen = ffs_findextattr(eae, easize,
		    ap->a_attrnamespace, ap->a_name, NULL, &p);
		if (ealen >= 0) {
			error = 0;
			if (ap->a_size != NULL)
				*ap->a_size = ealen;
			else if (ap->a_uio != NULL)
				error = uiomove(p, ealen, ap->a_uio);
		} else {
			error = ENOATTR;
		}
	} else {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = 0;
		pe = eae + easize;
		for (p = eae; error == 0 && p < pe; p = pn) {
			bcopy(p, &ul, sizeof(ul));
			pn = p + ul;
			if (pn > pe)
				break;
			p += sizeof(ul);
			if (*p++ != ap->a_attrnamespace)
				continue;
			p++;	/* pad2 */
			ealen = *p;
			if (ap->a_size != NULL) {
				*ap->a_size += ealen + 1;
			} else if (ap->a_uio != NULL) {
				error = uiomove(p, ealen + 1, ap->a_uio);
			}
		}
	}
	if (stand_alone)
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;
	struct ufs2_dinode *dp;
	struct ucred *cred;
	int stand_alone;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (fs->fs_magic == FS_UFS1_MAGIC)
		return (ufs_vnoperate((struct vop_generic_args *)ap));

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, IWRITE);
	if (error) {
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	if (ap->a_cred != NOCRED)
		cred = ap->a_cred;
	else
		cred = ap->a_vp->v_mount->mnt_cred;

	dp = ip->i_din2;

	if (ip->i_ea_area == NULL) {
		error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
		if (error)
			return (error);
		stand_alone = 1;
	} else {
		stand_alone = 0;
	}

	/* Calculate the length of the EA entry */
	if (ap->a_uio == NULL) {
		/* delete */
		ealength = eapad1 = ealen = eapad2 = eacont = 0;
	} else {
		ealen = ap->a_uio->uio_resid;
		ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		eacont = ealength + eapad1;
		eapad2 = 8 - (ealen % 8);
		if (eapad2 == 8)
			eapad2 = 0;
		ealength += eapad1 + ealen + eapad2;
	}
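
	/*
	 * Worked example: for name "foo" the fixed header is 4 + 3 + 3 =
	 * 10 bytes, so eapad1 = 6 pads the content start to 16; an
	 * 11-byte value then gets eapad2 = 5, for a total record length
	 * of 16 + 11 + 5 = 32 bytes.
	 */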

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1 && ealength == 0) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		if (stand_alone)
			ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		else if (ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	if (ealength != 0) {
		bcopy(&ealength, p, sizeof(ealength));
		p += sizeof(ealength);
		*p++ = ap->a_attrnamespace;
		*p++ = eapad2;
		*p++ = strlen(ap->a_name);
		strcpy(p, ap->a_name);
		p += strlen(ap->a_name);
		bzero(p, eapad1);
		p += eapad1;
		error = uiomove(p, ealen, ap->a_uio);
		if (error) {
			free(eae, M_TEMP);
			if (stand_alone)
				ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
			else if (ip->i_ea_error == 0)
				ip->i_ea_error = error;
			return (error);
		}
		p += ealen;
		bzero(p, eapad2);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	if (stand_alone)
		error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}