1 /*-
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 4. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
33 */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.168 2008/10/10 21:23:50 attilio Exp $");
37
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/bio.h>
41 #include <sys/buf.h>
42 #include <sys/kernel.h>
43 #include <sys/mount.h>
44 #include <sys/proc.h>
45 #include <sys/resourcevar.h>
46 #include <sys/signalvar.h>
47 #include <sys/vmmeter.h>
48 #include <sys/vnode.h>
49
50 #include <vm/vm.h>
51 #include <vm/vm_extern.h>
52 #include <vm/vm_page.h>
53 #include <vm/vm_object.h>
54 #include <vm/vm_pager.h>
55 #include <vm/vnode_pager.h>
56
57 #include <rpc/rpcclnt.h>
58
59 #include <nfs/rpcv2.h>
60 #include <nfs/nfsproto.h>
61 #include <nfsclient/nfs.h>
62 #include <nfsclient/nfsmount.h>
63 #include <nfsclient/nfsnode.h>
64
65 #include <nfs4client/nfs4.h>
66
67 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
68 struct thread *td);
69 static int nfs_directio_write(struct vnode *vp, struct uio *uiop,
70 struct ucred *cred, int ioflag);
71
72 extern int nfs_directio_enable;
73 extern int nfs_directio_allow_mmap;
74
75 /*
76 * Vnode op for VM getpages.
77 */
78 int
79 nfs_getpages(struct vop_getpages_args *ap)
80 {
81 int i, error, nextoff, size, toff, count, npages;
82 struct uio uio;
83 struct iovec iov;
84 vm_offset_t kva;
85 struct buf *bp;
86 struct vnode *vp;
87 struct thread *td;
88 struct ucred *cred;
89 struct nfsmount *nmp;
90 vm_object_t object;
91 vm_page_t *pages;
92 struct nfsnode *np;
93
94 vp = ap->a_vp;
95 np = VTONFS(vp);
96 td = curthread; /* XXX */
97 cred = curthread->td_ucred; /* XXX */
98 nmp = VFSTONFS(vp->v_mount);
99 pages = ap->a_m;
100 count = ap->a_count;
101
102 if ((object = vp->v_object) == NULL) {
103 nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
104 return VM_PAGER_ERROR;
105 }
106
107 if (nfs_directio_enable && !nfs_directio_allow_mmap) {
108 mtx_lock(&np->n_mtx);
109 if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
110 mtx_unlock(&np->n_mtx);
111 nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
112 return VM_PAGER_ERROR;
113 } else
114 mtx_unlock(&np->n_mtx);
115 }
116
117 mtx_lock(&nmp->nm_mtx);
118 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
119 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
120 mtx_unlock(&nmp->nm_mtx);
121 /* We'll never get here for v4, because we always have fsinfo */
122 (void)nfs_fsinfo(nmp, vp, cred, td);
123 } else
124 mtx_unlock(&nmp->nm_mtx);
125
126 npages = btoc(count);
127
128 /*
129 * If the requested page is partially valid, just return it and
130 * allow the pager to zero-out the blanks. Partially valid pages
131 * can only occur at the file EOF.
132 */
133
134 {
135 vm_page_t m = pages[ap->a_reqpage];
136
137 VM_OBJECT_LOCK(object);
138 vm_page_lock_queues();
139 if (m->valid != 0) {
140 /* handled by vm_fault now */
141 /* vm_page_zero_invalid(m, TRUE); */
142 for (i = 0; i < npages; ++i) {
143 if (i != ap->a_reqpage)
144 vm_page_free(pages[i]);
145 }
146 vm_page_unlock_queues();
147 VM_OBJECT_UNLOCK(object);
148 return(0);
149 }
150 vm_page_unlock_queues();
151 VM_OBJECT_UNLOCK(object);
152 }
153
154 /*
155 * We use only the kva address for the buffer, but this is extremely
156 * convienient and fast.
157 */
158 bp = getpbuf(&nfs_pbuf_freecnt);
159
160 kva = (vm_offset_t) bp->b_data;
161 pmap_qenter(kva, pages, npages);
162 PCPU_INC(cnt.v_vnodein);
163 PCPU_ADD(cnt.v_vnodepgsin, npages);
164
165 iov.iov_base = (caddr_t) kva;
166 iov.iov_len = count;
167 uio.uio_iov = &iov;
168 uio.uio_iovcnt = 1;
169 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
170 uio.uio_resid = count;
171 uio.uio_segflg = UIO_SYSSPACE;
172 uio.uio_rw = UIO_READ;
173 uio.uio_td = td;
174
175 error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
176 pmap_qremove(kva, npages);
177
178 relpbuf(bp, &nfs_pbuf_freecnt);
179
180 if (error && (uio.uio_resid == count)) {
181 nfs_printf("nfs_getpages: error %d\n", error);
182 VM_OBJECT_LOCK(object);
183 vm_page_lock_queues();
184 for (i = 0; i < npages; ++i) {
185 if (i != ap->a_reqpage)
186 vm_page_free(pages[i]);
187 }
188 vm_page_unlock_queues();
189 VM_OBJECT_UNLOCK(object);
190 return VM_PAGER_ERROR;
191 }
192
193 /*
194 * Calculate the number of bytes read and validate only that number
195 * of bytes. Note that due to pending writes, size may be 0. This
196 * does not mean that the remaining data is invalid!
197 */
198
199 size = count - uio.uio_resid;
200 VM_OBJECT_LOCK(object);
201 vm_page_lock_queues();
202 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
203 vm_page_t m;
204 nextoff = toff + PAGE_SIZE;
205 m = pages[i];
206
207 if (nextoff <= size) {
208 /*
209 * Read operation filled an entire page
210 */
211 m->valid = VM_PAGE_BITS_ALL;
212 vm_page_undirty(m);
213 } else if (size > toff) {
214 /*
215 * Read operation filled a partial page.
216 */
217 m->valid = 0;
218 vm_page_set_validclean(m, 0, size - toff);
219 /* handled by vm_fault now */
220 /* vm_page_zero_invalid(m, TRUE); */
221 } else {
222 /*
223 * Read operation was short. If no error occured
224 * we may have hit a zero-fill section. We simply
225 * leave valid set to 0.
226 */
227 ;
228 }
229 if (i != ap->a_reqpage) {
230 /*
231 * Whether or not to leave the page activated is up in
232 * the air, but we should put the page on a page queue
233 * somewhere (it already is in the object). Result:
234 * It appears that emperical results show that
235 * deactivating pages is best.
236 */
237
238 /*
239 * Just in case someone was asking for this page we
240 * now tell them that it is ok to use.
241 */
242 if (!error) {
243 if (m->oflags & VPO_WANTED)
244 vm_page_activate(m);
245 else
246 vm_page_deactivate(m);
247 vm_page_wakeup(m);
248 } else {
249 vm_page_free(m);
250 }
251 }
252 }
253 vm_page_unlock_queues();
254 VM_OBJECT_UNLOCK(object);
255 return 0;
256 }
257
258 /*
259 * Vnode op for VM putpages.
260 */
261 int
262 nfs_putpages(struct vop_putpages_args *ap)
263 {
264 struct uio uio;
265 struct iovec iov;
266 vm_offset_t kva;
267 struct buf *bp;
268 int iomode, must_commit, i, error, npages, count;
269 off_t offset;
270 int *rtvals;
271 struct vnode *vp;
272 struct thread *td;
273 struct ucred *cred;
274 struct nfsmount *nmp;
275 struct nfsnode *np;
276 vm_page_t *pages;
277
278 vp = ap->a_vp;
279 np = VTONFS(vp);
280 td = curthread; /* XXX */
281 cred = curthread->td_ucred; /* XXX */
282 nmp = VFSTONFS(vp->v_mount);
283 pages = ap->a_m;
284 count = ap->a_count;
285 rtvals = ap->a_rtvals;
286 npages = btoc(count);
287 offset = IDX_TO_OFF(pages[0]->pindex);
288
289 mtx_lock(&nmp->nm_mtx);
290 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
291 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
292 mtx_unlock(&nmp->nm_mtx);
293 (void)nfs_fsinfo(nmp, vp, cred, td);
294 } else
295 mtx_unlock(&nmp->nm_mtx);
296
297 mtx_lock(&np->n_mtx);
298 if (nfs_directio_enable && !nfs_directio_allow_mmap &&
299 (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
300 mtx_unlock(&np->n_mtx);
301 nfs_printf("nfs_putpages: called on noncache-able vnode??\n");
302 mtx_lock(&np->n_mtx);
303 }
304
305 for (i = 0; i < npages; i++)
306 rtvals[i] = VM_PAGER_AGAIN;
307
308 /*
309 * When putting pages, do not extend file past EOF.
310 */
311 if (offset + count > np->n_size) {
312 count = np->n_size - offset;
313 if (count < 0)
314 count = 0;
315 }
316 mtx_unlock(&np->n_mtx);
317
318 /*
319 * We use only the kva address for the buffer, but this is extremely
320 * convienient and fast.
321 */
322 bp = getpbuf(&nfs_pbuf_freecnt);
323
324 kva = (vm_offset_t) bp->b_data;
325 pmap_qenter(kva, pages, npages);
326 PCPU_INC(cnt.v_vnodeout);
327 PCPU_ADD(cnt.v_vnodepgsout, count);
328
329 iov.iov_base = (caddr_t) kva;
330 iov.iov_len = count;
331 uio.uio_iov = &iov;
332 uio.uio_iovcnt = 1;
333 uio.uio_offset = offset;
334 uio.uio_resid = count;
335 uio.uio_segflg = UIO_SYSSPACE;
336 uio.uio_rw = UIO_WRITE;
337 uio.uio_td = td;
338
339 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
340 iomode = NFSV3WRITE_UNSTABLE;
341 else
342 iomode = NFSV3WRITE_FILESYNC;
343
344 error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);
345
346 pmap_qremove(kva, npages);
347 relpbuf(bp, &nfs_pbuf_freecnt);
348
349 if (!error) {
350 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
351 for (i = 0; i < nwritten; i++) {
352 rtvals[i] = VM_PAGER_OK;
353 vm_page_undirty(pages[i]);
354 }
355 if (must_commit) {
356 nfs_clearcommit(vp->v_mount);
357 }
358 }
359 return rtvals[0];
360 }
361
362 /*
363 * For nfs, cache consistency can only be maintained approximately.
364 * Although RFC1094 does not specify the criteria, the following is
365 * believed to be compatible with the reference port.
366 * For nfs:
367 * If the file's modify time on the server has changed since the
368 * last read rpc or you have written to the file,
369 * you may have lost data cache consistency with the
370 * server, so flush all of the file's data out of the cache.
371 * Then force a getattr rpc to ensure that you have up to date
372 * attributes.
373 * NB: This implies that cache data can be read when up to
374 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
375 * attributes this could be forced by setting n_attrstamp to 0 before
376 * the VOP_GETATTR() call.
377 */
378 static inline int
379 nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
380 {
381 int error = 0;
382 struct vattr vattr;
383 struct nfsnode *np = VTONFS(vp);
384 int old_lock;
385 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
386
387 /*
388 * Grab the exclusive lock before checking whether the cache is
389 * consistent.
390 * XXX - We can make this cheaper later (by acquiring cheaper locks).
391 * But for now, this suffices.
392 */
393 old_lock = nfs_upgrade_vnlock(vp);
394 mtx_lock(&np->n_mtx);
395 if (np->n_flag & NMODIFIED) {
396 mtx_unlock(&np->n_mtx);
397 if (vp->v_type != VREG) {
398 if (vp->v_type != VDIR)
399 panic("nfs: bioread, not dir");
400 (nmp->nm_rpcops->nr_invaldir)(vp);
401 error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
402 if (error)
403 goto out;
404 }
405 np->n_attrstamp = 0;
406 error = VOP_GETATTR(vp, &vattr, cred);
407 if (error)
408 goto out;
409 mtx_lock(&np->n_mtx);
410 np->n_mtime = vattr.va_mtime;
411 mtx_unlock(&np->n_mtx);
412 } else {
413 mtx_unlock(&np->n_mtx);
414 error = VOP_GETATTR(vp, &vattr, cred);
415 if (error)
416 return (error);
417 mtx_lock(&np->n_mtx);
418 if ((np->n_flag & NSIZECHANGED)
419 || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
420 mtx_unlock(&np->n_mtx);
421 if (vp->v_type == VDIR)
422 (nmp->nm_rpcops->nr_invaldir)(vp);
423 error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
424 if (error)
425 goto out;
426 mtx_lock(&np->n_mtx);
427 np->n_mtime = vattr.va_mtime;
428 np->n_flag &= ~NSIZECHANGED;
429 }
430 mtx_unlock(&np->n_mtx);
431 }
432 out:
433 nfs_downgrade_vnlock(vp, old_lock);
434 return error;
435 }
436
437 /*
438 * Vnode op for read using bio
439 */
440 int
441 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
442 {
443 struct nfsnode *np = VTONFS(vp);
444 int biosize, i;
445 struct buf *bp, *rabp;
446 struct thread *td;
447 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
448 daddr_t lbn, rabn;
449 int bcount;
450 int seqcount;
451 int nra, error = 0, n = 0, on = 0;
452
453 #ifdef DIAGNOSTIC
454 if (uio->uio_rw != UIO_READ)
455 panic("nfs_read mode");
456 #endif
457 if (uio->uio_resid == 0)
458 return (0);
459 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
460 return (EINVAL);
461 td = uio->uio_td;
462
463 mtx_lock(&nmp->nm_mtx);
464 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
465 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
466 mtx_unlock(&nmp->nm_mtx);
467 (void)nfs_fsinfo(nmp, vp, cred, td);
468 } else
469 mtx_unlock(&nmp->nm_mtx);
470
471 if (vp->v_type != VDIR &&
472 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
473 return (EFBIG);
474
475 if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
476 /* No caching/ no readaheads. Just read data into the user buffer */
477 return nfs_readrpc(vp, uio, cred);
478
479 biosize = vp->v_mount->mnt_stat.f_iosize;
480 seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
481
482 error = nfs_bioread_check_cons(vp, td, cred);
483 if (error)
484 return error;
485
486 do {
487 u_quad_t nsize;
488
489 mtx_lock(&np->n_mtx);
490 nsize = np->n_size;
491 mtx_unlock(&np->n_mtx);
492
493 switch (vp->v_type) {
494 case VREG:
495 nfsstats.biocache_reads++;
496 lbn = uio->uio_offset / biosize;
497 on = uio->uio_offset & (biosize - 1);
498
499 /*
500 * Start the read ahead(s), as required.
501 */
502 if (nmp->nm_readahead > 0) {
503 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
504 (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
505 rabn = lbn + 1 + nra;
506 if (incore(&vp->v_bufobj, rabn) == NULL) {
507 rabp = nfs_getcacheblk(vp, rabn, biosize, td);
508 if (!rabp) {
509 error = nfs_sigintr(nmp, NULL, td);
510 return (error ? error : EINTR);
511 }
512 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
513 rabp->b_flags |= B_ASYNC;
514 rabp->b_iocmd = BIO_READ;
515 vfs_busy_pages(rabp, 0);
516 if (nfs_asyncio(nmp, rabp, cred, td)) {
517 rabp->b_flags |= B_INVAL;
518 rabp->b_ioflags |= BIO_ERROR;
519 vfs_unbusy_pages(rabp);
520 brelse(rabp);
521 break;
522 }
523 } else {
524 brelse(rabp);
525 }
526 }
527 }
528 }
529
530 /* Note that bcount is *not* DEV_BSIZE aligned. */
531 bcount = biosize;
532 if ((off_t)lbn * biosize >= nsize) {
533 bcount = 0;
534 } else if ((off_t)(lbn + 1) * biosize > nsize) {
535 bcount = nsize - (off_t)lbn * biosize;
536 }
537 bp = nfs_getcacheblk(vp, lbn, bcount, td);
538
539 if (!bp) {
540 error = nfs_sigintr(nmp, NULL, td);
541 return (error ? error : EINTR);
542 }
543
544 /*
545 * If B_CACHE is not set, we must issue the read. If this
546 * fails, we return an error.
547 */
548
549 if ((bp->b_flags & B_CACHE) == 0) {
550 bp->b_iocmd = BIO_READ;
551 vfs_busy_pages(bp, 0);
552 error = nfs_doio(vp, bp, cred, td);
553 if (error) {
554 brelse(bp);
555 return (error);
556 }
557 }
558
559 /*
560 * on is the offset into the current bp. Figure out how many
561 * bytes we can copy out of the bp. Note that bcount is
562 * NOT DEV_BSIZE aligned.
563 *
564 * Then figure out how many bytes we can copy into the uio.
565 */
566
567 n = 0;
568 if (on < bcount)
569 n = min((unsigned)(bcount - on), uio->uio_resid);
570 break;
571 case VLNK:
572 nfsstats.biocache_readlinks++;
573 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
574 if (!bp) {
575 error = nfs_sigintr(nmp, NULL, td);
576 return (error ? error : EINTR);
577 }
578 if ((bp->b_flags & B_CACHE) == 0) {
579 bp->b_iocmd = BIO_READ;
580 vfs_busy_pages(bp, 0);
581 error = nfs_doio(vp, bp, cred, td);
582 if (error) {
583 bp->b_ioflags |= BIO_ERROR;
584 brelse(bp);
585 return (error);
586 }
587 }
588 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
589 on = 0;
590 break;
591 case VDIR:
592 nfsstats.biocache_readdirs++;
593 if (np->n_direofoffset
594 && uio->uio_offset >= np->n_direofoffset) {
595 return (0);
596 }
597 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
598 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
599 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
600 if (!bp) {
601 error = nfs_sigintr(nmp, NULL, td);
602 return (error ? error : EINTR);
603 }
604 if ((bp->b_flags & B_CACHE) == 0) {
605 bp->b_iocmd = BIO_READ;
606 vfs_busy_pages(bp, 0);
607 error = nfs_doio(vp, bp, cred, td);
608 if (error) {
609 brelse(bp);
610 }
611 while (error == NFSERR_BAD_COOKIE) {
612 (nmp->nm_rpcops->nr_invaldir)(vp);
613 error = nfs_vinvalbuf(vp, 0, td, 1);
614 /*
615 * Yuck! The directory has been modified on the
616 * server. The only way to get the block is by
617 * reading from the beginning to get all the
618 * offset cookies.
619 *
620 * Leave the last bp intact unless there is an error.
621 * Loop back up to the while if the error is another
622 * NFSERR_BAD_COOKIE (double yuch!).
623 */
624 for (i = 0; i <= lbn && !error; i++) {
625 if (np->n_direofoffset
626 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
627 return (0);
628 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
629 if (!bp) {
630 error = nfs_sigintr(nmp, NULL, td);
631 return (error ? error : EINTR);
632 }
633 if ((bp->b_flags & B_CACHE) == 0) {
634 bp->b_iocmd = BIO_READ;
635 vfs_busy_pages(bp, 0);
636 error = nfs_doio(vp, bp, cred, td);
637 /*
638 * no error + B_INVAL == directory EOF,
639 * use the block.
640 */
641 if (error == 0 && (bp->b_flags & B_INVAL))
642 break;
643 }
644 /*
645 * An error will throw away the block and the
646 * for loop will break out. If no error and this
647 * is not the block we want, we throw away the
648 * block and go for the next one via the for loop.
649 */
650 if (error || i < lbn)
651 brelse(bp);
652 }
653 }
654 /*
655 * The above while is repeated if we hit another cookie
656 * error. If we hit an error and it wasn't a cookie error,
657 * we give up.
658 */
659 if (error)
660 return (error);
661 }
662
663 /*
664 * If not eof and read aheads are enabled, start one.
665 * (You need the current block first, so that you have the
666 * directory offset cookie of the next block.)
667 */
668 if (nmp->nm_readahead > 0 &&
669 (bp->b_flags & B_INVAL) == 0 &&
670 (np->n_direofoffset == 0 ||
671 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
672 incore(&vp->v_bufobj, lbn + 1) == NULL) {
673 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
674 if (rabp) {
675 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
676 rabp->b_flags |= B_ASYNC;
677 rabp->b_iocmd = BIO_READ;
678 vfs_busy_pages(rabp, 0);
679 if (nfs_asyncio(nmp, rabp, cred, td)) {
680 rabp->b_flags |= B_INVAL;
681 rabp->b_ioflags |= BIO_ERROR;
682 vfs_unbusy_pages(rabp);
683 brelse(rabp);
684 }
685 } else {
686 brelse(rabp);
687 }
688 }
689 }
690 /*
691 * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
692 * chopped for the EOF condition, we cannot tell how large
693 * NFS directories are going to be until we hit EOF. So
694 * an NFS directory buffer is *not* chopped to its EOF. Now,
695 * it just so happens that b_resid will effectively chop it
696 * to EOF. *BUT* this information is lost if the buffer goes
697 * away and is reconstituted into a B_CACHE state ( due to
698 * being VMIO ) later. So we keep track of the directory eof
699 * in np->n_direofoffset and chop it off as an extra step
700 * right here.
701 */
702 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
703 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
704 n = np->n_direofoffset - uio->uio_offset;
705 break;
706 default:
707 nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
708 bp = NULL;
709 break;
710 };
711
712 if (n > 0) {
713 error = uiomove(bp->b_data + on, (int)n, uio);
714 }
715 if (vp->v_type == VLNK)
716 n = 0;
717 if (bp != NULL)
718 brelse(bp);
719 } while (error == 0 && uio->uio_resid > 0 && n > 0);
720 return (error);
721 }
722
723 /*
724 * The NFS write path cannot handle iovecs with len > 1. So we need to
725 * break up iovecs accordingly (restricting them to wsize).
726 * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf).
727 * For the ASYNC case, 2 copies are needed. The first a copy from the
728 * user buffer to a staging buffer and then a second copy from the staging
729 * buffer to mbufs. This can be optimized by copying from the user buffer
730 * directly into mbufs and passing the chain down, but that requires a
731 * fair amount of re-working of the relevant codepaths (and can be done
732 * later).
733 */
734 static int
735 nfs_directio_write(vp, uiop, cred, ioflag)
736 struct vnode *vp;
737 struct uio *uiop;
738 struct ucred *cred;
739 int ioflag;
740 {
741 int error;
742 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
743 struct thread *td = uiop->uio_td;
744 int size;
745 int wsize;
746
747 mtx_lock(&nmp->nm_mtx);
748 wsize = nmp->nm_wsize;
749 mtx_unlock(&nmp->nm_mtx);
750 if (ioflag & IO_SYNC) {
751 int iomode, must_commit;
752 struct uio uio;
753 struct iovec iov;
754 do_sync:
755 while (uiop->uio_resid > 0) {
756 size = min(uiop->uio_resid, wsize);
757 size = min(uiop->uio_iov->iov_len, size);
758 iov.iov_base = uiop->uio_iov->iov_base;
759 iov.iov_len = size;
760 uio.uio_iov = &iov;
761 uio.uio_iovcnt = 1;
762 uio.uio_offset = uiop->uio_offset;
763 uio.uio_resid = size;
764 uio.uio_segflg = UIO_USERSPACE;
765 uio.uio_rw = UIO_WRITE;
766 uio.uio_td = td;
767 iomode = NFSV3WRITE_FILESYNC;
768 error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred,
769 &iomode, &must_commit);
770 KASSERT((must_commit == 0),
771 ("nfs_directio_write: Did not commit write"));
772 if (error)
773 return (error);
774 uiop->uio_offset += size;
775 uiop->uio_resid -= size;
776 if (uiop->uio_iov->iov_len <= size) {
777 uiop->uio_iovcnt--;
778 uiop->uio_iov++;
779 } else {
780 uiop->uio_iov->iov_base =
781 (char *)uiop->uio_iov->iov_base + size;
782 uiop->uio_iov->iov_len -= size;
783 }
784 }
785 } else {
786 struct uio *t_uio;
787 struct iovec *t_iov;
788 struct buf *bp;
789
790 /*
791 * Break up the write into blocksize chunks and hand these
792 * over to nfsiod's for write back.
793 * Unfortunately, this incurs a copy of the data. Since
794 * the user could modify the buffer before the write is
795 * initiated.
796 *
797 * The obvious optimization here is that one of the 2 copies
798 * in the async write path can be eliminated by copying the
799 * data here directly into mbufs and passing the mbuf chain
800 * down. But that will require a fair amount of re-working
801 * of the code and can be done if there's enough interest
802 * in NFS directio access.
803 */
804 while (uiop->uio_resid > 0) {
805 size = min(uiop->uio_resid, wsize);
806 size = min(uiop->uio_iov->iov_len, size);
807 bp = getpbuf(&nfs_pbuf_freecnt);
808 t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
809 t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
810 t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
811 t_iov->iov_len = size;
812 t_uio->uio_iov = t_iov;
813 t_uio->uio_iovcnt = 1;
814 t_uio->uio_offset = uiop->uio_offset;
815 t_uio->uio_resid = size;
816 t_uio->uio_segflg = UIO_SYSSPACE;
817 t_uio->uio_rw = UIO_WRITE;
818 t_uio->uio_td = td;
819 bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
820 bp->b_flags |= B_DIRECT;
821 bp->b_iocmd = BIO_WRITE;
822 if (cred != NOCRED) {
823 crhold(cred);
824 bp->b_wcred = cred;
825 } else
826 bp->b_wcred = NOCRED;
827 bp->b_caller1 = (void *)t_uio;
828 bp->b_vp = vp;
829 error = nfs_asyncio(nmp, bp, NOCRED, td);
830 if (error) {
831 free(t_iov->iov_base, M_NFSDIRECTIO);
832 free(t_iov, M_NFSDIRECTIO);
833 free(t_uio, M_NFSDIRECTIO);
834 bp->b_vp = NULL;
835 relpbuf(bp, &nfs_pbuf_freecnt);
836 if (error == EINTR)
837 return (error);
838 goto do_sync;
839 }
840 uiop->uio_offset += size;
841 uiop->uio_resid -= size;
842 if (uiop->uio_iov->iov_len <= size) {
843 uiop->uio_iovcnt--;
844 uiop->uio_iov++;
845 } else {
846 uiop->uio_iov->iov_base =
847 (char *)uiop->uio_iov->iov_base + size;
848 uiop->uio_iov->iov_len -= size;
849 }
850 }
851 }
852 return (0);
853 }
854
855 /*
856 * Vnode op for write using bio
857 */
858 int
859 nfs_write(struct vop_write_args *ap)
860 {
861 int biosize;
862 struct uio *uio = ap->a_uio;
863 struct thread *td = uio->uio_td;
864 struct vnode *vp = ap->a_vp;
865 struct nfsnode *np = VTONFS(vp);
866 struct ucred *cred = ap->a_cred;
867 int ioflag = ap->a_ioflag;
868 struct buf *bp;
869 struct vattr vattr;
870 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
871 daddr_t lbn;
872 int bcount;
873 int n, on, error = 0;
874 struct proc *p = td?td->td_proc:NULL;
875
876 #ifdef DIAGNOSTIC
877 if (uio->uio_rw != UIO_WRITE)
878 panic("nfs_write mode");
879 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
880 panic("nfs_write proc");
881 #endif
882 if (vp->v_type != VREG)
883 return (EIO);
884 mtx_lock(&np->n_mtx);
885 if (np->n_flag & NWRITEERR) {
886 np->n_flag &= ~NWRITEERR;
887 mtx_unlock(&np->n_mtx);
888 return (np->n_error);
889 } else
890 mtx_unlock(&np->n_mtx);
891 mtx_lock(&nmp->nm_mtx);
892 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
893 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
894 mtx_unlock(&nmp->nm_mtx);
895 (void)nfs_fsinfo(nmp, vp, cred, td);
896 } else
897 mtx_unlock(&nmp->nm_mtx);
898
899 /*
900 * Synchronously flush pending buffers if we are in synchronous
901 * mode or if we are appending.
902 */
903 if (ioflag & (IO_APPEND | IO_SYNC)) {
904 mtx_lock(&np->n_mtx);
905 if ( |