[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/nfsclient/nfs_bio.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1989, 1993
  3  *      The Regents of the University of California.  All rights reserved.
  4  *
  5  * This code is derived from software contributed to Berkeley by
  6  * Rick Macklem at The University of Guelph.
  7  *
  8  * Redistribution and use in source and binary forms, with or without
  9  * modification, are permitted provided that the following conditions
 10  * are met:
 11  * 1. Redistributions of source code must retain the above copyright
 12  *    notice, this list of conditions and the following disclaimer.
 13  * 2. Redistributions in binary form must reproduce the above copyright
 14  *    notice, this list of conditions and the following disclaimer in the
 15  *    documentation and/or other materials provided with the distribution.
 16  * 4. Neither the name of the University nor the names of its contributors
 17  *    may be used to endorse or promote products derived from this software
 18  *    without specific prior written permission.
 19  *
 20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 30  * SUCH DAMAGE.
 31  *
 32  *      @(#)nfs_bio.c   8.9 (Berkeley) 3/30/95
 33  */
 34 
 35 #include <sys/cdefs.h>
 36 __FBSDID("$FreeBSD: src/sys/nfsclient/nfs_bio.c,v 1.168 2008/10/10 21:23:50 attilio Exp $");
 37 
 38 #include <sys/param.h>
 39 #include <sys/systm.h>
 40 #include <sys/bio.h>
 41 #include <sys/buf.h>
 42 #include <sys/kernel.h>
 43 #include <sys/mount.h>
 44 #include <sys/proc.h>
 45 #include <sys/resourcevar.h>
 46 #include <sys/signalvar.h>
 47 #include <sys/vmmeter.h>
 48 #include <sys/vnode.h>
 49 
 50 #include <vm/vm.h>
 51 #include <vm/vm_extern.h>
 52 #include <vm/vm_page.h>
 53 #include <vm/vm_object.h>
 54 #include <vm/vm_pager.h>
 55 #include <vm/vnode_pager.h>
 56 
 57 #include <rpc/rpcclnt.h>
 58 
 59 #include <nfs/rpcv2.h>
 60 #include <nfs/nfsproto.h>
 61 #include <nfsclient/nfs.h>
 62 #include <nfsclient/nfsmount.h>
 63 #include <nfsclient/nfsnode.h>
 64 
 65 #include <nfs4client/nfs4.h>
 66 
 67 static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size,
 68                     struct thread *td);
 69 static int nfs_directio_write(struct vnode *vp, struct uio *uiop, 
 70                               struct ucred *cred, int ioflag);
 71 
 72 extern int nfs_directio_enable;
 73 extern int nfs_directio_allow_mmap;
 74 
 75 /*
 76  * Vnode op for VM getpages.
 77  */
 78 int
 79 nfs_getpages(struct vop_getpages_args *ap)
 80 {
 81         int i, error, nextoff, size, toff, count, npages;
 82         struct uio uio;
 83         struct iovec iov;
 84         vm_offset_t kva;
 85         struct buf *bp;
 86         struct vnode *vp;
 87         struct thread *td;
 88         struct ucred *cred;
 89         struct nfsmount *nmp;
 90         vm_object_t object;
 91         vm_page_t *pages;
 92         struct nfsnode *np;
 93 
 94         vp = ap->a_vp;
 95         np = VTONFS(vp);
 96         td = curthread;                         /* XXX */
 97         cred = curthread->td_ucred;             /* XXX */
 98         nmp = VFSTONFS(vp->v_mount);
 99         pages = ap->a_m;
100         count = ap->a_count;
101 
102         if ((object = vp->v_object) == NULL) {
103                 nfs_printf("nfs_getpages: called with non-merged cache vnode??\n");
104                 return VM_PAGER_ERROR;
105         }
106 
107         if (nfs_directio_enable && !nfs_directio_allow_mmap) {
108                 mtx_lock(&np->n_mtx);
109                 if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
110                         mtx_unlock(&np->n_mtx);
111                         nfs_printf("nfs_getpages: called on non-cacheable vnode??\n");
112                         return VM_PAGER_ERROR;
113                 } else
114                         mtx_unlock(&np->n_mtx);
115         }
116 
117         mtx_lock(&nmp->nm_mtx);
118         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
119             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {  
120                 mtx_unlock(&nmp->nm_mtx);
121                 /* We'll never get here for v4, because we always have fsinfo */
122                 (void)nfs_fsinfo(nmp, vp, cred, td);
123         } else
124                 mtx_unlock(&nmp->nm_mtx);
125 
126         npages = btoc(count);
127 
128         /*
129          * If the requested page is partially valid, just return it and
130          * allow the pager to zero-out the blanks.  Partially valid pages
131          * can only occur at the file EOF.
132          */
133 
134         {
135                 vm_page_t m = pages[ap->a_reqpage];
136 
137                 VM_OBJECT_LOCK(object);
138                 vm_page_lock_queues();
139                 if (m->valid != 0) {
140                         /* handled by vm_fault now        */
141                         /* vm_page_zero_invalid(m, TRUE); */
142                         for (i = 0; i < npages; ++i) {
143                                 if (i != ap->a_reqpage)
144                                         vm_page_free(pages[i]);
145                         }
146                         vm_page_unlock_queues();
147                         VM_OBJECT_UNLOCK(object);
148                         return(0);
149                 }
150                 vm_page_unlock_queues();
151                 VM_OBJECT_UNLOCK(object);
152         }
153 
154         /*
155          * We use only the kva address for the buffer, but this is extremely
156          * convienient and fast.
157          */
158         bp = getpbuf(&nfs_pbuf_freecnt);
159 
160         kva = (vm_offset_t) bp->b_data;
161         pmap_qenter(kva, pages, npages);
162         PCPU_INC(cnt.v_vnodein);
163         PCPU_ADD(cnt.v_vnodepgsin, npages);
164 
165         iov.iov_base = (caddr_t) kva;
166         iov.iov_len = count;
167         uio.uio_iov = &iov;
168         uio.uio_iovcnt = 1;
169         uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
170         uio.uio_resid = count;
171         uio.uio_segflg = UIO_SYSSPACE;
172         uio.uio_rw = UIO_READ;
173         uio.uio_td = td;
174 
175         error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred);
176         pmap_qremove(kva, npages);
177 
178         relpbuf(bp, &nfs_pbuf_freecnt);
179 
180         if (error && (uio.uio_resid == count)) {
181                 nfs_printf("nfs_getpages: error %d\n", error);
182                 VM_OBJECT_LOCK(object);
183                 vm_page_lock_queues();
184                 for (i = 0; i < npages; ++i) {
185                         if (i != ap->a_reqpage)
186                                 vm_page_free(pages[i]);
187                 }
188                 vm_page_unlock_queues();
189                 VM_OBJECT_UNLOCK(object);
190                 return VM_PAGER_ERROR;
191         }
192 
193         /*
194          * Calculate the number of bytes read and validate only that number
195          * of bytes.  Note that due to pending writes, size may be 0.  This
196          * does not mean that the remaining data is invalid!
197          */
198 
199         size = count - uio.uio_resid;
200         VM_OBJECT_LOCK(object);
201         vm_page_lock_queues();
202         for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
203                 vm_page_t m;
204                 nextoff = toff + PAGE_SIZE;
205                 m = pages[i];
206 
207                 if (nextoff <= size) {
208                         /*
209                          * Read operation filled an entire page
210                          */
211                         m->valid = VM_PAGE_BITS_ALL;
212                         vm_page_undirty(m);
213                 } else if (size > toff) {
214                         /*
215                          * Read operation filled a partial page.
216                          */
217                         m->valid = 0;
218                         vm_page_set_validclean(m, 0, size - toff);
219                         /* handled by vm_fault now        */
220                         /* vm_page_zero_invalid(m, TRUE); */
221                 } else {
222                         /*
223                          * Read operation was short.  If no error occured
224                          * we may have hit a zero-fill section.   We simply
225                          * leave valid set to 0.
226                          */
227                         ;
228                 }
229                 if (i != ap->a_reqpage) {
230                         /*
231                          * Whether or not to leave the page activated is up in
232                          * the air, but we should put the page on a page queue
233                          * somewhere (it already is in the object).  Result:
234                          * It appears that emperical results show that
235                          * deactivating pages is best.
236                          */
237 
238                         /*
239                          * Just in case someone was asking for this page we
240                          * now tell them that it is ok to use.
241                          */
242                         if (!error) {
243                                 if (m->oflags & VPO_WANTED)
244                                         vm_page_activate(m);
245                                 else
246                                         vm_page_deactivate(m);
247                                 vm_page_wakeup(m);
248                         } else {
249                                 vm_page_free(m);
250                         }
251                 }
252         }
253         vm_page_unlock_queues();
254         VM_OBJECT_UNLOCK(object);
255         return 0;
256 }
257 
258 /*
259  * Vnode op for VM putpages.
260  */
261 int
262 nfs_putpages(struct vop_putpages_args *ap)
263 {
264         struct uio uio;
265         struct iovec iov;
266         vm_offset_t kva;
267         struct buf *bp;
268         int iomode, must_commit, i, error, npages, count;
269         off_t offset;
270         int *rtvals;
271         struct vnode *vp;
272         struct thread *td;
273         struct ucred *cred;
274         struct nfsmount *nmp;
275         struct nfsnode *np;
276         vm_page_t *pages;
277 
278         vp = ap->a_vp;
279         np = VTONFS(vp);
280         td = curthread;                         /* XXX */
281         cred = curthread->td_ucred;             /* XXX */
282         nmp = VFSTONFS(vp->v_mount);
283         pages = ap->a_m;
284         count = ap->a_count;
285         rtvals = ap->a_rtvals;
286         npages = btoc(count);
287         offset = IDX_TO_OFF(pages[0]->pindex);
288         
289         mtx_lock(&nmp->nm_mtx);
290         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
291             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
292                 mtx_unlock(&nmp->nm_mtx);
293                 (void)nfs_fsinfo(nmp, vp, cred, td);
294         } else
295                 mtx_unlock(&nmp->nm_mtx);
296 
297         mtx_lock(&np->n_mtx);
298         if (nfs_directio_enable && !nfs_directio_allow_mmap && 
299             (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) {
300                 mtx_unlock(&np->n_mtx);         
301                 nfs_printf("nfs_putpages: called on noncache-able vnode??\n");
302                 mtx_lock(&np->n_mtx);
303         }
304 
305         for (i = 0; i < npages; i++)
306                 rtvals[i] = VM_PAGER_AGAIN;
307 
308         /*
309          * When putting pages, do not extend file past EOF.
310          */
311         if (offset + count > np->n_size) {
312                 count = np->n_size - offset;
313                 if (count < 0)
314                         count = 0;
315         }
316         mtx_unlock(&np->n_mtx);
317 
318         /*
319          * We use only the kva address for the buffer, but this is extremely
320          * convienient and fast.
321          */
322         bp = getpbuf(&nfs_pbuf_freecnt);
323 
324         kva = (vm_offset_t) bp->b_data;
325         pmap_qenter(kva, pages, npages);
326         PCPU_INC(cnt.v_vnodeout);
327         PCPU_ADD(cnt.v_vnodepgsout, count);
328 
329         iov.iov_base = (caddr_t) kva;
330         iov.iov_len = count;
331         uio.uio_iov = &iov;
332         uio.uio_iovcnt = 1;
333         uio.uio_offset = offset;
334         uio.uio_resid = count;
335         uio.uio_segflg = UIO_SYSSPACE;
336         uio.uio_rw = UIO_WRITE;
337         uio.uio_td = td;
338 
339         if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
340             iomode = NFSV3WRITE_UNSTABLE;
341         else
342             iomode = NFSV3WRITE_FILESYNC;
343 
344         error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit);
345 
346         pmap_qremove(kva, npages);
347         relpbuf(bp, &nfs_pbuf_freecnt);
348 
349         if (!error) {
350                 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
351                 for (i = 0; i < nwritten; i++) {
352                         rtvals[i] = VM_PAGER_OK;
353                         vm_page_undirty(pages[i]);
354                 }
355                 if (must_commit) {
356                         nfs_clearcommit(vp->v_mount);
357                 }
358         }
359         return rtvals[0];
360 }
361 
362 /*
363  * For nfs, cache consistency can only be maintained approximately.
364  * Although RFC1094 does not specify the criteria, the following is
365  * believed to be compatible with the reference port.
366  * For nfs:
367  * If the file's modify time on the server has changed since the
368  * last read rpc or you have written to the file,
369  * you may have lost data cache consistency with the
370  * server, so flush all of the file's data out of the cache.
371  * Then force a getattr rpc to ensure that you have up to date
372  * attributes.
373  * NB: This implies that cache data can be read when up to
374  * NFS_ATTRTIMEO seconds out of date. If you find that you need current
375  * attributes this could be forced by setting n_attrstamp to 0 before
376  * the VOP_GETATTR() call.
377  */
378 static inline int
379 nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred)
380 {
381         int error = 0;
382         struct vattr vattr;
383         struct nfsnode *np = VTONFS(vp);
384         int old_lock;
385         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
386         
387         /*
388          * Grab the exclusive lock before checking whether the cache is
389          * consistent.
390          * XXX - We can make this cheaper later (by acquiring cheaper locks).
391          * But for now, this suffices.
392          */
393         old_lock = nfs_upgrade_vnlock(vp);
394         mtx_lock(&np->n_mtx);
395         if (np->n_flag & NMODIFIED) {
396                 mtx_unlock(&np->n_mtx);
397                 if (vp->v_type != VREG) {
398                         if (vp->v_type != VDIR)
399                                 panic("nfs: bioread, not dir");
400                         (nmp->nm_rpcops->nr_invaldir)(vp);
401                         error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
402                         if (error)
403                                 goto out;
404                 }
405                 np->n_attrstamp = 0;
406                 error = VOP_GETATTR(vp, &vattr, cred);
407                 if (error)
408                         goto out;
409                 mtx_lock(&np->n_mtx);
410                 np->n_mtime = vattr.va_mtime;
411                 mtx_unlock(&np->n_mtx);
412         } else {
413                 mtx_unlock(&np->n_mtx);
414                 error = VOP_GETATTR(vp, &vattr, cred);
415                 if (error)
416                         return (error);
417                 mtx_lock(&np->n_mtx);
418                 if ((np->n_flag & NSIZECHANGED)
419                     || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) {
420                         mtx_unlock(&np->n_mtx);
421                         if (vp->v_type == VDIR)
422                                 (nmp->nm_rpcops->nr_invaldir)(vp);
423                         error = nfs_vinvalbuf(vp, V_SAVE, td, 1);
424                         if (error)
425                                 goto out;
426                         mtx_lock(&np->n_mtx);
427                         np->n_mtime = vattr.va_mtime;
428                         np->n_flag &= ~NSIZECHANGED;
429                 }
430                 mtx_unlock(&np->n_mtx);
431         }
432 out:    
433         nfs_downgrade_vnlock(vp, old_lock);
434         return error;
435 }
436 
437 /*
438  * Vnode op for read using bio
439  */
440 int
441 nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
442 {
443         struct nfsnode *np = VTONFS(vp);
444         int biosize, i;
445         struct buf *bp, *rabp;
446         struct thread *td;
447         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
448         daddr_t lbn, rabn;
449         int bcount;
450         int seqcount;
451         int nra, error = 0, n = 0, on = 0;
452 
453 #ifdef DIAGNOSTIC
454         if (uio->uio_rw != UIO_READ)
455                 panic("nfs_read mode");
456 #endif
457         if (uio->uio_resid == 0)
458                 return (0);
459         if (uio->uio_offset < 0)        /* XXX VDIR cookies can be negative */
460                 return (EINVAL);
461         td = uio->uio_td;
462 
463         mtx_lock(&nmp->nm_mtx);
464         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
465             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
466                 mtx_unlock(&nmp->nm_mtx);
467                 (void)nfs_fsinfo(nmp, vp, cred, td);
468         } else
469                 mtx_unlock(&nmp->nm_mtx);               
470 
471         if (vp->v_type != VDIR &&
472             (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
473                 return (EFBIG);
474 
475         if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG))
476                 /* No caching/ no readaheads. Just read data into the user buffer */
477                 return nfs_readrpc(vp, uio, cred);
478 
479         biosize = vp->v_mount->mnt_stat.f_iosize;
480         seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
481         
482         error = nfs_bioread_check_cons(vp, td, cred);
483         if (error)
484                 return error;
485 
486         do {
487             u_quad_t nsize;
488                         
489             mtx_lock(&np->n_mtx);
490             nsize = np->n_size;
491             mtx_unlock(&np->n_mtx);                 
492 
493             switch (vp->v_type) {
494             case VREG:
495                 nfsstats.biocache_reads++;
496                 lbn = uio->uio_offset / biosize;
497                 on = uio->uio_offset & (biosize - 1);
498 
499                 /*
500                  * Start the read ahead(s), as required.
501                  */
502                 if (nmp->nm_readahead > 0) {
503                     for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
504                         (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) {
505                         rabn = lbn + 1 + nra;
506                         if (incore(&vp->v_bufobj, rabn) == NULL) {
507                             rabp = nfs_getcacheblk(vp, rabn, biosize, td);
508                             if (!rabp) {
509                                 error = nfs_sigintr(nmp, NULL, td);
510                                 return (error ? error : EINTR);
511                             }
512                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
513                                 rabp->b_flags |= B_ASYNC;
514                                 rabp->b_iocmd = BIO_READ;
515                                 vfs_busy_pages(rabp, 0);
516                                 if (nfs_asyncio(nmp, rabp, cred, td)) {
517                                     rabp->b_flags |= B_INVAL;
518                                     rabp->b_ioflags |= BIO_ERROR;
519                                     vfs_unbusy_pages(rabp);
520                                     brelse(rabp);
521                                     break;
522                                 }
523                             } else {
524                                 brelse(rabp);
525                             }
526                         }
527                     }
528                 }
529 
530                 /* Note that bcount is *not* DEV_BSIZE aligned. */
531                 bcount = biosize;
532                 if ((off_t)lbn * biosize >= nsize) {
533                         bcount = 0;
534                 } else if ((off_t)(lbn + 1) * biosize > nsize) {
535                         bcount = nsize - (off_t)lbn * biosize;
536                 }
537                 bp = nfs_getcacheblk(vp, lbn, bcount, td);
538 
539                 if (!bp) {
540                         error = nfs_sigintr(nmp, NULL, td);
541                         return (error ? error : EINTR);
542                 }
543 
544                 /*
545                  * If B_CACHE is not set, we must issue the read.  If this
546                  * fails, we return an error.
547                  */
548 
549                 if ((bp->b_flags & B_CACHE) == 0) {
550                     bp->b_iocmd = BIO_READ;
551                     vfs_busy_pages(bp, 0);
552                     error = nfs_doio(vp, bp, cred, td);
553                     if (error) {
554                         brelse(bp);
555                         return (error);
556                     }
557                 }
558 
559                 /*
560                  * on is the offset into the current bp.  Figure out how many
561                  * bytes we can copy out of the bp.  Note that bcount is
562                  * NOT DEV_BSIZE aligned.
563                  *
564                  * Then figure out how many bytes we can copy into the uio.
565                  */
566 
567                 n = 0;
568                 if (on < bcount)
569                         n = min((unsigned)(bcount - on), uio->uio_resid);
570                 break;
571             case VLNK:
572                 nfsstats.biocache_readlinks++;
573                 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
574                 if (!bp) {
575                         error = nfs_sigintr(nmp, NULL, td);
576                         return (error ? error : EINTR);
577                 }
578                 if ((bp->b_flags & B_CACHE) == 0) {
579                     bp->b_iocmd = BIO_READ;
580                     vfs_busy_pages(bp, 0);
581                     error = nfs_doio(vp, bp, cred, td);
582                     if (error) {
583                         bp->b_ioflags |= BIO_ERROR;
584                         brelse(bp);
585                         return (error);
586                     }
587                 }
588                 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
589                 on = 0;
590                 break;
591             case VDIR:
592                 nfsstats.biocache_readdirs++;
593                 if (np->n_direofoffset
594                     && uio->uio_offset >= np->n_direofoffset) {
595                     return (0);
596                 }
597                 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
598                 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
599                 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
600                 if (!bp) {
601                     error = nfs_sigintr(nmp, NULL, td);
602                     return (error ? error : EINTR);
603                 }
604                 if ((bp->b_flags & B_CACHE) == 0) {
605                     bp->b_iocmd = BIO_READ;
606                     vfs_busy_pages(bp, 0);
607                     error = nfs_doio(vp, bp, cred, td);
608                     if (error) {
609                             brelse(bp);
610                     }
611                     while (error == NFSERR_BAD_COOKIE) {
612                         (nmp->nm_rpcops->nr_invaldir)(vp);
613                         error = nfs_vinvalbuf(vp, 0, td, 1);
614                         /*
615                          * Yuck! The directory has been modified on the
616                          * server. The only way to get the block is by
617                          * reading from the beginning to get all the
618                          * offset cookies.
619                          *
620                          * Leave the last bp intact unless there is an error.
621                          * Loop back up to the while if the error is another
622                          * NFSERR_BAD_COOKIE (double yuch!).
623                          */
624                         for (i = 0; i <= lbn && !error; i++) {
625                             if (np->n_direofoffset
626                                 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
627                                     return (0);
628                             bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
629                             if (!bp) {
630                                 error = nfs_sigintr(nmp, NULL, td);
631                                 return (error ? error : EINTR);
632                             }
633                             if ((bp->b_flags & B_CACHE) == 0) {
634                                     bp->b_iocmd = BIO_READ;
635                                     vfs_busy_pages(bp, 0);
636                                     error = nfs_doio(vp, bp, cred, td);
637                                     /*
638                                      * no error + B_INVAL == directory EOF,
639                                      * use the block.
640                                      */
641                                     if (error == 0 && (bp->b_flags & B_INVAL))
642                                             break;
643                             }
644                             /*
645                              * An error will throw away the block and the
646                              * for loop will break out.  If no error and this
647                              * is not the block we want, we throw away the
648                              * block and go for the next one via the for loop.
649                              */
650                             if (error || i < lbn)
651                                     brelse(bp);
652                         }
653                     }
654                     /*
655                      * The above while is repeated if we hit another cookie
656                      * error.  If we hit an error and it wasn't a cookie error,
657                      * we give up.
658                      */
659                     if (error)
660                             return (error);
661                 }
662 
663                 /*
664                  * If not eof and read aheads are enabled, start one.
665                  * (You need the current block first, so that you have the
666                  *  directory offset cookie of the next block.)
667                  */
668                 if (nmp->nm_readahead > 0 &&
669                     (bp->b_flags & B_INVAL) == 0 &&
670                     (np->n_direofoffset == 0 ||
671                     (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
672                     incore(&vp->v_bufobj, lbn + 1) == NULL) {
673                         rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
674                         if (rabp) {
675                             if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
676                                 rabp->b_flags |= B_ASYNC;
677                                 rabp->b_iocmd = BIO_READ;
678                                 vfs_busy_pages(rabp, 0);
679                                 if (nfs_asyncio(nmp, rabp, cred, td)) {
680                                     rabp->b_flags |= B_INVAL;
681                                     rabp->b_ioflags |= BIO_ERROR;
682                                     vfs_unbusy_pages(rabp);
683                                     brelse(rabp);
684                                 }
685                             } else {
686                                 brelse(rabp);
687                             }
688                         }
689                 }
690                 /*
691                  * Unlike VREG files, whos buffer size ( bp->b_bcount ) is
692                  * chopped for the EOF condition, we cannot tell how large
693                  * NFS directories are going to be until we hit EOF.  So
694                  * an NFS directory buffer is *not* chopped to its EOF.  Now,
695                  * it just so happens that b_resid will effectively chop it
696                  * to EOF.  *BUT* this information is lost if the buffer goes
697                  * away and is reconstituted into a B_CACHE state ( due to
698                  * being VMIO ) later.  So we keep track of the directory eof
699                  * in np->n_direofoffset and chop it off as an extra step
700                  * right here.
701                  */
702                 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
703                 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
704                         n = np->n_direofoffset - uio->uio_offset;
705                 break;
706             default:
707                 nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
708                 bp = NULL;
709                 break;
710             };
711 
712             if (n > 0) {
713                     error = uiomove(bp->b_data + on, (int)n, uio);
714             }
715             if (vp->v_type == VLNK)
716                 n = 0;
717             if (bp != NULL)
718                 brelse(bp);
719         } while (error == 0 && uio->uio_resid > 0 && n > 0);
720         return (error);
721 }
722 
723 /*
 724  * The NFS write path cannot handle a uio with more than one iovec. So we need to
725  * break up iovecs accordingly (restricting them to wsize).
726  * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). 
727  * For the ASYNC case, 2 copies are needed. The first a copy from the 
728  * user buffer to a staging buffer and then a second copy from the staging
729  * buffer to mbufs. This can be optimized by copying from the user buffer
730  * directly into mbufs and passing the chain down, but that requires a 
731  * fair amount of re-working of the relevant codepaths (and can be done
732  * later).
733  */
734 static int
735 nfs_directio_write(vp, uiop, cred, ioflag)
736         struct vnode *vp;
737         struct uio *uiop;
738         struct ucred *cred;
739         int ioflag;
740 {
741         int error;
742         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
743         struct thread *td = uiop->uio_td;
744         int size;
745         int wsize;
746         
747         mtx_lock(&nmp->nm_mtx);
748         wsize = nmp->nm_wsize;
749         mtx_unlock(&nmp->nm_mtx);
750         if (ioflag & IO_SYNC) {
751                 int iomode, must_commit;
752                 struct uio uio;
753                 struct iovec iov;
754 do_sync:
755                 while (uiop->uio_resid > 0) {
756                         size = min(uiop->uio_resid, wsize);
757                         size = min(uiop->uio_iov->iov_len, size);
758                         iov.iov_base = uiop->uio_iov->iov_base;
759                         iov.iov_len = size;
760                         uio.uio_iov = &iov;
761                         uio.uio_iovcnt = 1;
762                         uio.uio_offset = uiop->uio_offset;
763                         uio.uio_resid = size;
764                         uio.uio_segflg = UIO_USERSPACE;
765                         uio.uio_rw = UIO_WRITE;
766                         uio.uio_td = td;
767                         iomode = NFSV3WRITE_FILESYNC;
768                         error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, 
769                                                       &iomode, &must_commit);
770                         KASSERT((must_commit == 0), 
771                                 ("nfs_directio_write: Did not commit write"));
772                         if (error)
773                                 return (error);
774                         uiop->uio_offset += size;
775                         uiop->uio_resid -= size;
776                         if (uiop->uio_iov->iov_len <= size) {
777                                 uiop->uio_iovcnt--;
778                                 uiop->uio_iov++;
779                         } else {
780                                 uiop->uio_iov->iov_base = 
781                                         (char *)uiop->uio_iov->iov_base + size;
782                                 uiop->uio_iov->iov_len -= size;
783                         }
784                 }
785         } else {
786                 struct uio *t_uio;
787                 struct iovec *t_iov;
788                 struct buf *bp;
789                 
790                 /*
791                  * Break up the write into blocksize chunks and hand these
792                  * over to nfsiod's for write back.
793                  * Unfortunately, this incurs a copy of the data. Since 
794                  * the user could modify the buffer before the write is 
795                  * initiated.
796                  * 
797                  * The obvious optimization here is that one of the 2 copies
798                  * in the async write path can be eliminated by copying the
799                  * data here directly into mbufs and passing the mbuf chain
800                  * down. But that will require a fair amount of re-working
801                  * of the code and can be done if there's enough interest
802                  * in NFS directio access.
803                  */
804                 while (uiop->uio_resid > 0) {
805                         size = min(uiop->uio_resid, wsize);
806                         size = min(uiop->uio_iov->iov_len, size);
807                         bp = getpbuf(&nfs_pbuf_freecnt);
808                         t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK);
809                         t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK);
810                         t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK);
811                         t_iov->iov_len = size;
812                         t_uio->uio_iov = t_iov;
813                         t_uio->uio_iovcnt = 1;
814                         t_uio->uio_offset = uiop->uio_offset;
815                         t_uio->uio_resid = size;
816                         t_uio->uio_segflg = UIO_SYSSPACE;
817                         t_uio->uio_rw = UIO_WRITE;
818                         t_uio->uio_td = td;
819                         bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size);
820                         bp->b_flags |= B_DIRECT;
821                         bp->b_iocmd = BIO_WRITE;
822                         if (cred != NOCRED) {
823                                 crhold(cred);
824                                 bp->b_wcred = cred;
825                         } else 
826                                 bp->b_wcred = NOCRED;                   
827                         bp->b_caller1 = (void *)t_uio;
828                         bp->b_vp = vp;
829                         error = nfs_asyncio(nmp, bp, NOCRED, td);
830                         if (error) {
831                                 free(t_iov->iov_base, M_NFSDIRECTIO);
832                                 free(t_iov, M_NFSDIRECTIO);
833                                 free(t_uio, M_NFSDIRECTIO);
834                                 bp->b_vp = NULL;
835                                 relpbuf(bp, &nfs_pbuf_freecnt);
836                                 if (error == EINTR)
837                                         return (error);
838                                 goto do_sync;
839                         }
840                         uiop->uio_offset += size;
841                         uiop->uio_resid -= size;
842                         if (uiop->uio_iov->iov_len <= size) {
843                                 uiop->uio_iovcnt--;
844                                 uiop->uio_iov++;
845                         } else {
846                                 uiop->uio_iov->iov_base = 
847                                         (char *)uiop->uio_iov->iov_base + size;
848                                 uiop->uio_iov->iov_len -= size;
849                         }
850                 }
851         }
852         return (0);
853 }
854 
855 /*
856  * Vnode op for write using bio
857  */
858 int
859 nfs_write(struct vop_write_args *ap)
860 {
861         int biosize;
862         struct uio *uio = ap->a_uio;
863         struct thread *td = uio->uio_td;
864         struct vnode *vp = ap->a_vp;
865         struct nfsnode *np = VTONFS(vp);
866         struct ucred *cred = ap->a_cred;
867         int ioflag = ap->a_ioflag;
868         struct buf *bp;
869         struct vattr vattr;
870         struct nfsmount *nmp = VFSTONFS(vp->v_mount);
871         daddr_t lbn;
872         int bcount;
873         int n, on, error = 0;
874         struct proc *p = td?td->td_proc:NULL;
875 
876 #ifdef DIAGNOSTIC
877         if (uio->uio_rw != UIO_WRITE)
878                 panic("nfs_write mode");
879         if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
880                 panic("nfs_write proc");
881 #endif
882         if (vp->v_type != VREG)
883                 return (EIO);
884         mtx_lock(&np->n_mtx);
885         if (np->n_flag & NWRITEERR) {
886                 np->n_flag &= ~NWRITEERR;
887                 mtx_unlock(&np->n_mtx);
888                 return (np->n_error);
889         } else
890                 mtx_unlock(&np->n_mtx);
891         mtx_lock(&nmp->nm_mtx);
892         if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
893             (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) {
894                 mtx_unlock(&nmp->nm_mtx);
895                 (void)nfs_fsinfo(nmp, vp, cred, td);
896         } else
897                 mtx_unlock(&nmp->nm_mtx);
898 
899         /*
900          * Synchronously flush pending buffers if we are in synchronous
901          * mode or if we are appending.
902          */
903         if (ioflag & (IO_APPEND | IO_SYNC)) {
904                 mtx_lock(&np->n_mtx);
905                 if (