FreeBSD/Linux Kernel Cross Reference
sys/nfs/nfs_bio.c
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.5 (Berkeley) 1/4/94
 * $FreeBSD: src/sys/nfs/nfs_bio.c,v 1.28.2.11 1999/12/12 07:28:50 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>

static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
					struct proc *p));

extern int nfs_numasync;
extern struct nfsstats nfsstats;

/*
 * Ifdefs for FreeBSD-current's merged VM/buffer cache.  It is unfortunate
 * that this isn't done inside getblk() and brelse() so these calls
 * wouldn't need to be here.
 */
#ifdef B_VMIO
#define vnode_pager_uncache(vp)
#else
#define vfs_busy_pages(bp, f)
#define vfs_unbusy_pages(bp)
#define vfs_dirty_pages(bp)
#endif

/*
 * Vnode op for VM getpages.
 */
int
nfs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i, bsize;
	vm_object_t obj;
	int pcount;
	struct uio auio;
	struct iovec aiov;
	int error;
	vm_page_t m;

	if (!(ap->a_vp->v_flag & VVMIO)) {
		printf("nfs_getpages: called with non-VMIO vnode??\n");
		return EOPNOTSUPP;
	}

	pcount = round_page(ap->a_count) / PAGE_SIZE;

	obj = ap->a_m[ap->a_reqpage]->object;
	bsize = ap->a_vp->v_mount->mnt_stat.f_iosize;

	for (i = 0; i < pcount; i++) {
		if (i != ap->a_reqpage) {
			vnode_pager_freepage(ap->a_m[i]);
		}
	}
	m = ap->a_m[ap->a_reqpage];

	m->busy++;
	m->flags &= ~PG_BUSY;
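	/*
	 * Illustrative note (not in the original source): the two lines
	 * above trade the hard PG_BUSY flag for a hold on the soft busy
	 * count, so the page cannot be reclaimed while nfs_bioread()
	 * below may sleep; the flag is restored once the read returns.
	 */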

	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = 0;
	aiov.iov_len = MAXBSIZE;
	auio.uio_resid = MAXBSIZE;
	auio.uio_offset = IDX_TO_OFF(m->pindex);
	auio.uio_segflg = UIO_NOCOPY;
	auio.uio_rw = UIO_READ;
	auio.uio_procp = curproc;
	error = nfs_bioread(ap->a_vp, &auio, IO_NODELOCKED, curproc->p_ucred, 1);

	m->flags |= PG_BUSY;
	m->busy--;
	if (m->busy == 0 && (m->flags & PG_WANTED)) {
		m->flags &= ~PG_WANTED;
		wakeup(m);
	}

	if (error && (auio.uio_resid == MAXBSIZE))
		return VM_PAGER_ERROR;
	return 0;
}

/*
 * Vnode op for read using bio
 * Any similarity to readip() is purely coincidental
 */
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
	register struct vnode *vp;
	register struct uio *uio;
	int ioflag;
	struct ucred *cred;
	int getpages;
{
	register struct nfsnode *np = VTONFS(vp);
	register int biosize, diff, i;
	struct buf *bp = 0, *rabp;
	struct vattr vattr;
	struct proc *p;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	int bufsize;
	int nra, error = 0, n = 0, on = 0, not_readin;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)
		return (EINVAL);
	p = uio->uio_procp;
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	/*
	 * For nfs, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 * For nqnfs, full cache consistency is maintained within the loop.
	 * For nfs:
	 * If the file's modify time on the server has changed since the
	 * last read rpc or you have written to the file,
	 * you may have lost data cache consistency with the
	 * server, so flush all of the file's data out of the cache.
	 * Then force a getattr rpc to ensure that you have up to date
	 * attributes.
	 * NB: This implies that cache data can be read when up to
	 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
	 * attributes this could be forced by setting n_attrstamp to 0 before
	 * the VOP_GETATTR() call.
	 */
	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
		if (np->n_flag & NMODIFIED) {
			if (vp->v_type != VREG) {
				if (vp->v_type != VDIR)
					panic("nfs: bioread, not dir");
				nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			np->n_mtime = vattr.va_mtime.tv_sec;
		} else {
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			if (np->n_mtime != vattr.va_mtime.tv_sec) {
				if (vp->v_type == VDIR)
					nfs_invaldir(vp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_mtime = vattr.va_mtime.tv_sec;
			}
		}
	}
	do {

	    /*
	     * Get a valid lease. If cached data is stale, flush it.
	     */
	    if (nmp->nm_flag & NFSMNT_NQNFS) {
		if (NQNFS_CKINVALID(vp, np, ND_READ)) {
		    do {
			error = nqnfs_getlease(vp, ND_READ, cred, p);
		    } while (error == NQNFS_EXPIRED);
		    if (error)
			return (error);
		    if (np->n_lrev != np->n_brev ||
			(np->n_flag & NQNFSNONCACHE) ||
			((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
			if (vp->v_type == VDIR)
			    nfs_invaldir(vp);
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
			    return (error);
			np->n_brev = np->n_lrev;
		    }
		} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
		    nfs_invaldir(vp);
		    error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
		    if (error)
			return (error);
		}
	    }
	    if (np->n_flag & NQNFSNONCACHE) {
		switch (vp->v_type) {
		case VREG:
			return (nfs_readrpc(vp, uio, cred));
		case VLNK:
			return (nfs_readlinkrpc(vp, uio, cred));
		case VDIR:
			break;
		default:
			printf(" NQNFSNONCACHE: type %x unexpected\n",
				vp->v_type);
		}
	    }
	    switch (vp->v_type) {
	    case VREG:
		nfsstats.biocache_reads++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
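		/*
		 * Worked example (illustrative, not from the original
		 * source): with biosize = 8192 and uio_offset = 20000,
		 * lbn = 20000 / 8192 = 2 and on = 20000 & 8191 = 3616,
		 * so the transfer begins 3616 bytes into logical block 2.
		 */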
		not_readin = 1;

		/*
		 * Start the read ahead(s), as required.
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
		    for (nra = 0; nra < nmp->nm_readahead &&
			(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
			rabn = lbn + 1 + nra;
			if (!incore(vp, rabn)) {
			    rabp = nfs_getcacheblk(vp, rabn, biosize, p);
			    if (!rabp)
				return (EINTR);
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		    }
		}

		/*
		 * If the block is in the cache and has the required data
		 * in a valid region, just copy it out.
		 * Otherwise, get the block and write back/read in,
		 * as required.
		 */
again:
		bufsize = biosize;
		if ((off_t)(lbn + 1) * biosize > np->n_size &&
		    (off_t)(lbn + 1) * biosize - np->n_size < biosize) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
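		/*
		 * Worked example (illustrative, not from the original
		 * source): with np->n_size = 20000, biosize = 8192 and
		 * lbn = 2, the block extends past end of file, so
		 * bufsize = 20000 - 16384 = 3616, rounded up to the next
		 * DEV_BSIZE (512) boundary: 4096.
		 */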
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		/*
		 * If we are being called from nfs_getpages, we must
		 * make sure the buffer is a vmio buffer.  The vp will
		 * already be setup for vmio but there may be some old
		 * non-vmio buffers attached to it.
		 */
		if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
			printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
			bp->b_flags |= B_NOCACHE;
			bp->b_flags |= B_INVAFTERWRITE;
			if (bp->b_dirtyend > 0) {
				if ((bp->b_flags & B_DELWRI) == 0)
					panic("nfsbioread");
				if (VOP_BWRITE(bp) == EINTR)
					return (EINTR);
			} else
				brelse(bp);
			goto again;
		}
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
			not_readin = 0;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				brelse(bp);
				return (error);
			}
		}
		if (bufsize > on) {
			n = min((unsigned)(bufsize - on), uio->uio_resid);
		} else {
			n = 0;
		}
		diff = np->n_size - uio->uio_offset;
		if (diff < n)
			n = diff;
		if (not_readin && n > 0) {
			if (on < bp->b_validoff || (on + n) > bp->b_validend) {
				bp->b_flags |= B_NOCACHE;
				if (bp->b_dirtyend > 0) {
					if ((bp->b_flags & B_DELWRI) == 0)
						panic("nfsbioread");
					if (VOP_BWRITE(bp) == EINTR)
						return (EINTR);
				} else
					brelse(bp);
				goto again;
			}
		}
		vp->v_lastr = lbn;
		diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
		if (diff < n)
			n = diff;
		break;
	    case VLNK:
		nfsstats.biocache_readlinks++;
		bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
		if (!bp)
			return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_flags |= B_READ;
			vfs_busy_pages(bp, 0);
			error = nfs_doio(bp, cred, p);
			if (error) {
				bp->b_flags |= B_ERROR;
				brelse(bp);
				return (error);
			}
		}
		n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
		on = 0;
		break;
	    case VDIR:
		nfsstats.biocache_readdirs++;
		if (np->n_direofoffset
		    && uio->uio_offset >= np->n_direofoffset) {
			return (0);
		}
		lbn = uio->uio_offset / NFS_DIRBLKSIZ;
		on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
		bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
		if (!bp)
		    return (EINTR);
		if ((bp->b_flags & B_CACHE) == 0) {
		    bp->b_flags |= B_READ;
		    vfs_busy_pages(bp, 0);
		    error = nfs_doio(bp, cred, p);
		    if (error) {
			vfs_unbusy_pages(bp);
			brelse(bp);
			while (error == NFSERR_BAD_COOKIE) {
			    nfs_invaldir(vp);
			    error = nfs_vinvalbuf(vp, 0, cred, p, 1);
			    /*
			     * Yuck! The directory has been modified on the
			     * server. The only way to get the block is by
			     * reading from the beginning to get all the
			     * offset cookies.
			     */
			    for (i = 0; i <= lbn && !error; i++) {
				if (np->n_direofoffset
				    && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
					return (0);
				bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
				if (!bp)
					return (EINTR);
				if ((bp->b_flags & B_DONE) == 0) {
				    bp->b_flags |= B_READ;
				    vfs_busy_pages(bp, 0);
				    error = nfs_doio(bp, cred, p);
				    if (error) {
					vfs_unbusy_pages(bp);
					brelse(bp);
				    } else if (i < lbn)
					brelse(bp);
				}
			    }
			}
			if (error)
			    return (error);
		    }
		}

		/*
		 * If not eof and read aheads are enabled, start one.
		 * (You need the current block first, so that you have the
		 *  directory offset cookie of the next block.)
		 */
		if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
		    (np->n_direofoffset == 0 ||
		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
		    !(np->n_flag & NQNFSNONCACHE) &&
		    !incore(vp, lbn + 1)) {
			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
			if (rabp) {
			    if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
				rabp->b_flags |= (B_READ | B_ASYNC);
				vfs_busy_pages(rabp, 0);
				if (nfs_asyncio(rabp, cred)) {
				    rabp->b_flags |= B_INVAL|B_ERROR;
				    vfs_unbusy_pages(rabp);
				    brelse(rabp);
				}
			    } else {
				brelse(rabp);
			    }
			}
		}
		/*
		 * Make sure we use a signed variant of min() since
		 * the second term may be negative.
		 */
		n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		break;
	    }

	    if (n > 0) {
		error = uiomove(bp->b_data + on, (int)n, uio);
	    }
	    switch (vp->v_type) {
	    case VREG:
		break;
	    case VLNK:
		n = 0;
		break;
	    case VDIR:
		if (np->n_flag & NQNFSNONCACHE)
			bp->b_flags |= B_INVAL;
		break;
	    default:
		printf(" nfs_bioread: type %x unexpected\n", vp->v_type);
	    }
	    brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Vnode op for write using bio
 */
int
nfs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int  a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register int biosize;
	register struct uio *uio = ap->a_uio;
	struct proc *p = uio->uio_procp;
	register struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	register struct ucred *cred = ap->a_cred;
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	int bufsize;
	int n, on, error = 0, iomode, must_commit;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
		(void)nfs_fsinfo(nmp, vp, cred, p);
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
			if (error)
				return (error);
		}
		if (ioflag & IO_APPEND) {
			np->n_attrstamp = 0;
			error = VOP_GETATTR(vp, &vattr, cred, p);
			if (error)
				return (error);
			uio->uio_offset = np->n_size;
		}
	}
	if (uio->uio_offset < 0)
		return (EINVAL);
	if (uio->uio_resid == 0)
		return (0);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, i don't think it matters
	 */
	if (p && uio->uio_offset + uio->uio_resid >
	      p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(p, SIGXFSZ);
		return (EFBIG);
	}
	/*
	 * I use nm_rsize, not nm_wsize so that all buffer cache blocks
	 * will be the same size within a filesystem. nfs_writerpc will
	 * still use nm_wsize when sizing the rpc's.
	 */
	biosize = vp->v_mount->mnt_stat.f_iosize;
	do {

		/*
		 * XXX make sure we aren't cached in the VM page cache
		 */
		/*
		 * Check for a valid write lease.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error)
				return (error);
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
			}
		}
		if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			return (error);
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize-1);
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		if (uio->uio_offset + n > np->n_size) {
			np->n_size = uio->uio_offset + n;
			np->n_flag |= NMODIFIED;
			vnode_pager_setsize(vp, (u_long)np->n_size);
		}
		bufsize = biosize;
		if ((lbn + 1) * biosize > np->n_size) {
			bufsize = np->n_size - lbn * biosize;
			bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
		}
		bp = nfs_getcacheblk(vp, lbn, bufsize, p);
		if (!bp)
			return (EINTR);
		if (bp->b_wcred == NOCRED) {
			crhold(cred);
			bp->b_wcred = cred;
		}
		np->n_flag |= NMODIFIED;

		if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
		}

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 */
		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			bp->b_proc = p;
			if (VOP_BWRITE(bp) == EINTR)
				return (EINTR);
			goto again;
		}
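		/*
		 * Illustrative example (not in the original source): if the
		 * buffer already carries dirty bytes [512, 1024) and the new
		 * write covers [4096, 4608), merging them would also mark
		 * the untouched gap [1024, 4096) dirty, so the old region is
		 * pushed to the server first and the block is re-fetched.
		 */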

		/*
		 * Check for valid write lease and get one as required.
		 * In case getblk() and/or bwrite() delayed us.
		 */
		if ((nmp->nm_flag & NFSMNT_NQNFS) &&
		    NQNFS_CKINVALID(vp, np, ND_WRITE)) {
			do {
				error = nqnfs_getlease(vp, ND_WRITE, cred, p);
			} while (error == NQNFS_EXPIRED);
			if (error) {
				brelse(bp);
				return (error);
			}
			if (np->n_lrev != np->n_brev ||
			    (np->n_flag & NQNFSNONCACHE)) {
				brelse(bp);
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
				np->n_brev = np->n_lrev;
				goto again;
			}
		}
		error = uiomove((char *)bp->b_data + on, n, uio);
		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			return (error);
		}
		if (bp->b_dirtyend > 0) {
			bp->b_dirtyoff = min(on, bp->b_dirtyoff);
			bp->b_dirtyend = max((on + n), bp->b_dirtyend);
		} else {
			bp->b_dirtyoff = on;
			bp->b_dirtyend = on + n;
		}
		if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
		    bp->b_validoff > bp->b_dirtyend) {
			bp->b_validoff = bp->b_dirtyoff;
			bp->b_validend = bp->b_dirtyend;
		} else {
			bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
			bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
		}
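		/*
		 * Worked example (illustrative, not from the original
		 * source): a write of [0, 512) into a buffer whose valid
		 * region is [0, 4096) leaves b_dirtyoff/b_dirtyend = 0/512
		 * and the valid region unchanged, since the merge only
		 * widens [b_validoff, b_validend) to cover the dirty span.
		 */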

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		/*
		 * If the lease is non-cachable or IO_SYNC do bwrite().
		 */
		if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
			bp->b_proc = p;
			error = VOP_BWRITE(bp);
			if (error)
				return (error);
			if (np->n_flag & NQNFSNONCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
				if (error)
					return (error);
			}
		} else if ((n + on) == biosize &&
			(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
			bp->b_proc = (struct proc *)0;
			bp->b_flags |= B_ASYNC;
			(void)nfs_writebp(bp, 0);
		} else
			bdwrite(bp);
	} while (uio->uio_resid > 0 && n > 0);
	return (0);
}

/*
 * Get an nfs cache block.
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy. If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 */
static struct buf *
nfs_getcacheblk(vp, bn, size, p)
	struct vnode *vp;
	daddr_t bn;
	int size;
	struct proc *p;
{
	register struct buf *bp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int biosize = vp->v_mount->mnt_stat.f_iosize;

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, bn, size, PCATCH, 0);
		while (bp == (struct buf *)0) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
				return ((struct buf *)0);
			bp = getblk(vp, bn, size, 0, 2 * hz);
		}
	} else
		bp = getblk(vp, bn, size, 0, 0);

	if (vp->v_type == VREG)
		bp->b_blkno = (bn * biosize) / DEV_BSIZE;

	return (bp);
}
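/*
 * A minimal caller sketch (illustrative only, not part of the original
 * source), showing the contract described above: a NULL return means the
 * sleep was interrupted on an interruptible mount, and for VREG vnodes
 * b_blkno has already been mapped to DEV_BSIZE units.
 */
#if 0
	daddr_t lbn = uio->uio_offset / biosize;	/* logical block */
	struct buf *ebp = nfs_getcacheblk(vp, lbn, biosize, p);

	if (ebp == (struct buf *)0)
		return (EINTR);		/* signal on an intr mount */
	/* ... fill or read ebp, then brelse()/bwrite() it ... */
#endif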

/*
 * Flush and invalidate all dirty buffers. If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int intrflg;
{
	register struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
			slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	}
	np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 */
int
nfs_asyncio(bp, cred)
	register struct buf *bp;
	struct ucred *cred;
{
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	if (nfs_numasync == 0)
		return (EIO);

	nmp = VFSTONFS(bp->b_vp->v_mount);
again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = (struct proc *)0;
			nfs_iodmount[i] = nmp;
			nmp->nm_bufqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bufqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bufqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.
		 */
		while (nmp->nm_bufqlen >= 2*nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bufqwant = TRUE;
			error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, bp->b_proc))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bufqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
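		/*
		 * Illustrative note (not in the original source): the bound
		 * above caps each mount's queue at twice the number of
		 * nfsiods, e.g. with 4 nfsiods at most 8 buffers may sit on
		 * nm_bufq before further callers sleep in "nfsaio".
		 */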

		if (bp->b_flags & B_READ) {
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
		} else {
			bp->b_flags |= B_WRITEINPROG;
			if (bp->b_wcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_wcred = cred;
			}
		}

		TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
		nmp->nm_bufqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block. This may be called
 * synchronously or from an nfsiod.
 */
int
nfs_doio(bp, cr, p)
	register struct buf *bp;
	struct ucred *cr;
	struct proc *p;
{
	register struct uio *uiop;
	register struct vnode *vp;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, diff, len, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	vp = bp->b_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_procp = p;

	/*
	 * Historically, paging was done with physio, but no more.
	 */
	if (bp->b_flags & B_PHYS) {
		/*
		 * ...though reading /dev/drum still gets us here.
		 */
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		/* mapping was done by vmapbuf() */
		io.iov_base = bp->b_data;
		uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
		if (bp->b_flags & B_READ) {
			uiop->uio_rw = UIO_READ;
			nfsstats.read_physios++;
			error = nfs_readrpc(vp, uiop, cr);
		} else {
			int com;

			iomode = NFSV3WRITE_DATASYNC;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_physios++;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else if (bp->b_flags & B_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;
		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop, cr);
			if (!error) {
				bp->b_validoff = 0;
				if (uiop->uio_resid) {
					/*
					 * If len > 0, there is a hole in the file and
					 * no writes after the hole have been pushed to
					 * the server yet.
					 * Just zero fill the rest of the valid area.
					 */
					diff = bp->b_bcount - uiop->uio_resid;
					len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
						+ diff);
					if (len > 0) {
						len = min(len, uiop->uio_resid);
						bzero((char *)bp->b_data + diff, len);
						bp->b_validend = diff + len;
					} else
						bp->b_validend = diff;
				} else
					bp->b_validend = bp->b_bcount;
			}
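			/*
			 * Worked example (illustrative, not from the
			 * original source): b_bcount = 8192 with the rpc
			 * returning 4096 bytes leaves diff = 4096; if the
			 * file extends 1000 bytes past blkno*DEV_BSIZE +
			 * diff, then len = 1000 bytes of hole are zeroed
			 * and b_validend becomes 5096.
			 */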
			if (p && (vp->v_flag & VTEXT) &&
				(((nmp->nm_flag & NFSMNT_NQNFS) &&
				  NQNFS_CKINVALID(vp, np, ND_READ) &&
				  np->n_lrev != np->n_brev) ||
				 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
				  np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
				uprintf("Process killed due to text file modification\n");
				psignal(p, SIGKILL);
#ifdef __NetBSD__
				p->p_holdcnt++;
#else
				p->p_flag |= P_NOSWAP;
#endif
			}
			break;
		case VLNK:
			uiop->uio_offset = (off_t)0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop, cr);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop, cr);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop, cr);
			break;
		default:
			printf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
			bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
				- bp->b_dirtyoff;
			uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
				+ bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;
			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;
			bp->b_flags |= B_WRITEINPROG;
			error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0
				    && bp->b_dirtyend == bp->b_bufsize)
					bp->b_flags |= B_CLUSTEROK;
			} else
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			bp->b_flags &= ~B_WRITEINPROG;

			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused. This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 */
			if (error == EINTR
			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				bp->b_flags |= B_DELWRI;

				/*
				 * Since for the B_ASYNC case, nfs_bwrite() has reassigned the
				 * buffer to the clean list, we have to reassign it back to the
				 * dirty one. Ugh.
				 */
				if (bp->b_flags & B_ASYNC)
					reassignbuf(bp, vp);
				else if (error)
					bp->b_flags |= B_EINTR;
			} else {
				if (error) {
					bp->b_flags |= B_ERROR;
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			biodone(bp);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bp);
	return (error);
}