FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c
1 /*
2 * Copyright (c) 1994,1997 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Absolutely no warranty of function or purpose is made by the author
12 * John S. Dyson.
13 *
14 * $FreeBSD$
15 */
16
17 /*
 18 * This file contains a new buffer I/O scheme implementing a coherent
 19 * VM object and buffer cache. Pains have been taken to make
20 * sure that the performance degradation associated with schemes such
21 * as this is not realized.
22 *
23 * Author: John S. Dyson
24 * Significant help during the development and debugging phases
 25 * was provided by David Greenman, also of the FreeBSD core team.
26 *
27 * see man buf(9) for more info.
28 */
29
30 #define VMIO
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/sysproto.h>
34 #include <sys/kernel.h>
35 #include <sys/sysctl.h>
36 #include <sys/proc.h>
37 #include <sys/vnode.h>
38 #include <sys/vmmeter.h>
39 #include <sys/lock.h>
40 #include <miscfs/specfs/specdev.h>
41 #include <vm/vm.h>
42 #include <vm/vm_param.h>
43 #include <vm/vm_prot.h>
44 #include <vm/vm_kern.h>
45 #include <vm/vm_pageout.h>
46 #include <vm/vm_page.h>
47 #include <vm/vm_object.h>
48 #include <vm/vm_extern.h>
49 #include <vm/vm_map.h>
50 #include <sys/buf.h>
51 #include <sys/mount.h>
52 #include <sys/malloc.h>
53 #include <sys/resourcevar.h>
54
55 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
56
57 struct bio_ops bioops; /* I/O operation notification */
58
 59 #if 0 /* replaced by sched_sync */
60 static void vfs_update __P((void));
61 static struct proc *updateproc;
62 static struct kproc_desc up_kp = {
63 "update",
64 vfs_update,
65 &updateproc
66 };
67 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
68 #endif
69
70 struct buf *buf; /* buffer header pool */
71 struct swqueue bswlist;
72
73 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
74 vm_offset_t to);
75 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
76 vm_offset_t to);
77 static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
78 vm_offset_t off, vm_offset_t size,
79 vm_page_t m);
80 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
81 int pageno, vm_page_t m);
82 static void vfs_clean_pages(struct buf * bp);
83 static void vfs_setdirty(struct buf *bp);
84 static void vfs_vmio_release(struct buf *bp);
85 static void flushdirtybuffers(int slpflag, int slptimeo);
86
87 int needsbuffer;
88
89 /*
90 * Internal update daemon, process 3
91 * The variable vfs_update_wakeup allows for internal syncs.
92 */
93 int vfs_update_wakeup;
94
95
96 /*
 97 * buffers' base kva
98 */
99
100 /*
101 * bogus page -- for I/O to/from partially complete buffers
102 * this is a temporary solution to the problem, but it is not
103 * really that bad. it would be better to split the buffer
104 * for input in the case of buffers partially already in memory,
105 * but the code is intricate enough already.
106 */
107 vm_page_t bogus_page;
108 static vm_offset_t bogus_offset;
109
110 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
111 bufmallocspace, maxbufmallocspace;
112 int numdirtybuffers;
113 static int lodirtybuffers, hidirtybuffers;
114 static int numfreebuffers, lofreebuffers, hifreebuffers;
115 static int kvafreespace;
116
117 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD,
118 &numdirtybuffers, 0, "");
119 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW,
120 &lodirtybuffers, 0, "");
121 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW,
122 &hidirtybuffers, 0, "");
123 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD,
124 &numfreebuffers, 0, "");
125 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW,
126 &lofreebuffers, 0, "");
127 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW,
128 &hifreebuffers, 0, "");
129 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW,
130 &maxbufspace, 0, "");
131 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD,
132 &bufspace, 0, "");
133 SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW,
134 &maxvmiobufspace, 0, "");
135 SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD,
136 &vmiospace, 0, "");
137 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW,
138 &maxbufmallocspace, 0, "");
139 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD,
140 &bufmallocspace, 0, "");
141 SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD,
142 &kvafreespace, 0, "");
143
144 static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash;
145 struct bqueues bufqueues[BUFFER_QUEUES] = {0};
146
147 extern int vm_swap_size;
148
149 #define BUF_MAXUSE 24
150
151 #define VFS_BIO_NEED_ANY 1
152 #define VFS_BIO_NEED_LOWLIMIT 2
153 #define VFS_BIO_NEED_FREE 4
154
155 /*
156 * Initialize buffer headers and related structures.
157 */
158 void
159 bufinit()
160 {
161 struct buf *bp;
162 int i;
163
164 TAILQ_INIT(&bswlist);
165 LIST_INIT(&invalhash);
166
167 /* first, make a null hash table */
168 for (i = 0; i < BUFHSZ; i++)
169 LIST_INIT(&bufhashtbl[i]);
170
171 /* next, make a null set of free lists */
172 for (i = 0; i < BUFFER_QUEUES; i++)
173 TAILQ_INIT(&bufqueues[i]);
174
175 /* finally, initialize each buffer header and stick on empty q */
176 for (i = 0; i < nbuf; i++) {
177 bp = &buf[i];
178 bzero(bp, sizeof *bp);
179 bp->b_flags = B_INVAL; /* we're just an empty header */
180 bp->b_dev = NODEV;
181 bp->b_rcred = NOCRED;
182 bp->b_wcred = NOCRED;
183 bp->b_qindex = QUEUE_EMPTY;
184 bp->b_xflags = 0;
185 LIST_INIT(&bp->b_dep);
186 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
187 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
188 }
189 /*
 190 * maxbufspace is currently calculated assuming all filesystem blocks
 191 * are 8K. If you happen to use a 16K filesystem, the size of the buffer
192 * cache is still the same as it would be for 8K filesystems. This
193 * keeps the size of the buffer cache "in check" for big block filesystems.
194 */
195 maxbufspace = (nbuf + 8) * DFLTBSIZE;
196 /*
197 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
198 */
199 maxvmiobufspace = 2 * maxbufspace / 3;
200 /*
201 * Limit the amount of malloc memory since it is wired permanently into
202 * the kernel space. Even though this is accounted for in the buffer
203 * allocation, we don't want the malloced region to grow uncontrolled.
204 * The malloc scheme improves memory utilization significantly on average
205 * (small) directories.
206 */
207 maxbufmallocspace = maxbufspace / 20;
208
209 /*
 210 * Reduce the probability of deadlock conditions by limiting the
211 * number of dirty buffers.
212 */
213 hidirtybuffers = nbuf / 8 + 20;
214 lodirtybuffers = nbuf / 16 + 10;
215 numdirtybuffers = 0;
216 lofreebuffers = nbuf / 18 + 5;
217 hifreebuffers = 2 * lofreebuffers;
218 numfreebuffers = nbuf;
219 kvafreespace = 0;
220
221 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
222 bogus_page = vm_page_alloc(kernel_object,
223 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
224 VM_ALLOC_NORMAL);
225
226 }
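       /*
        * Editorial note (not from the original source): a worked example
        * of the watermark math above, assuming nbuf = 1000 and an 8K
        * DFLTBSIZE (the block size the comment above assumes):
        *
        *      maxbufspace       = (1000 + 8) * 8192  (about 8MB)
        *      maxvmiobufspace   = 2 * maxbufspace / 3
        *      maxbufmallocspace = maxbufspace / 20
        *      hidirtybuffers    = 1000 / 8 + 20  = 145
        *      lodirtybuffers    = 1000 / 16 + 10 = 72
        *      lofreebuffers     = 1000 / 18 + 5  = 60
        *      hifreebuffers     = 2 * 60         = 120
        */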
227
228 /*
229 * Free the kva allocation for a buffer
230 * Must be called only at splbio or higher,
231 * as this is the only locking for buffer_map.
232 */
233 static void
234 bfreekva(struct buf * bp)
235 {
236 if (bp->b_kvasize == 0)
237 return;
238
239 vm_map_delete(buffer_map,
240 (vm_offset_t) bp->b_kvabase,
241 (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
242
243 bp->b_kvasize = 0;
244
245 }
246
247 /*
248 * remove the buffer from the appropriate free list
249 */
250 void
251 bremfree(struct buf * bp)
252 {
253 int s = splbio();
254
255 if (bp->b_qindex != QUEUE_NONE) {
256 if (bp->b_qindex == QUEUE_EMPTY) {
257 kvafreespace -= bp->b_kvasize;
258 }
259 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
260 bp->b_qindex = QUEUE_NONE;
261 } else {
262 #if !defined(MAX_PERF)
263 panic("bremfree: removing a buffer when not on a queue");
264 #endif
265 }
266 if ((bp->b_flags & B_INVAL) ||
267 (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0)
268 --numfreebuffers;
269 splx(s);
270 }
271
272
273 /*
274 * Get a buffer with the specified data. Look in the cache first.
275 */
276 int
277 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
278 struct buf ** bpp)
279 {
280 struct buf *bp;
281
282 bp = getblk(vp, blkno, size, 0, 0);
283 *bpp = bp;
284
285 /* if not found in cache, do some I/O */
286 if ((bp->b_flags & B_CACHE) == 0) {
287 if (curproc != NULL)
288 curproc->p_stats->p_ru.ru_inblock++;
289 bp->b_flags |= B_READ;
290 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
291 if (bp->b_rcred == NOCRED) {
292 if (cred != NOCRED)
293 crhold(cred);
294 bp->b_rcred = cred;
295 }
296 vfs_busy_pages(bp, 0);
297 VOP_STRATEGY(vp, bp);
298 return (biowait(bp));
299 }
300 return (0);
301 }
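       /*
        * Usage sketch (editorial, not from the original source): a typical
        * read path built on bread(), with hypothetical vp/lblkno/bsize:
        *
        *      struct buf *bp;
        *      int error;
        *
        *      error = bread(vp, lblkno, bsize, NOCRED, &bp);
        *      if (error) {
        *              brelse(bp);
        *              return (error);
        *      }
        *      ... copy data out of bp->b_data ...
        *      bqrelse(bp);
        */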
302
303 /*
304 * Operates like bread, but also starts asynchronous I/O on
305 * read-ahead blocks.
306 */
307 int
308 breadn(struct vnode * vp, daddr_t blkno, int size,
309 daddr_t * rablkno, int *rabsize,
310 int cnt, struct ucred * cred, struct buf ** bpp)
311 {
312 struct buf *bp, *rabp;
313 int i;
314 int rv = 0, readwait = 0;
315
316 *bpp = bp = getblk(vp, blkno, size, 0, 0);
317
318 /* if not found in cache, do some I/O */
319 if ((bp->b_flags & B_CACHE) == 0) {
320 if (curproc != NULL)
321 curproc->p_stats->p_ru.ru_inblock++;
322 bp->b_flags |= B_READ;
323 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
324 if (bp->b_rcred == NOCRED) {
325 if (cred != NOCRED)
326 crhold(cred);
327 bp->b_rcred = cred;
328 }
329 vfs_busy_pages(bp, 0);
330 VOP_STRATEGY(vp, bp);
331 ++readwait;
332 }
333 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
334 if (inmem(vp, *rablkno))
335 continue;
336 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
337
338 if ((rabp->b_flags & B_CACHE) == 0) {
339 if (curproc != NULL)
340 curproc->p_stats->p_ru.ru_inblock++;
341 rabp->b_flags |= B_READ | B_ASYNC;
342 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
343 if (rabp->b_rcred == NOCRED) {
344 if (cred != NOCRED)
345 crhold(cred);
346 rabp->b_rcred = cred;
347 }
348 vfs_busy_pages(rabp, 0);
349 VOP_STRATEGY(vp, rabp);
350 } else {
351 brelse(rabp);
352 }
353 }
354
355 if (readwait) {
356 rv = biowait(bp);
357 }
358 return (rv);
359 }
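       /*
        * Usage sketch (editorial): rablkno and rabsize are parallel arrays
        * of length cnt, so a single-block read-ahead of the next logical
        * block looks like (hypothetical values):
        *
        *      daddr_t rablkno = lblkno + 1;
        *      int rabsize = bsize;
        *
        *      error = breadn(vp, lblkno, bsize, &rablkno, &rabsize, 1,
        *          NOCRED, &bp);
        */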
360
361 /*
362 * Write, release buffer on completion. (Done by iodone
363 * if async.)
364 */
365 int
366 bwrite(struct buf * bp)
367 {
368 int oldflags, s;
369 struct vnode *vp;
370 struct mount *mp;
371
372
373 if (bp->b_flags & B_INVAL) {
374 brelse(bp);
375 return (0);
376 }
377
378 oldflags = bp->b_flags;
379
380 #if !defined(MAX_PERF)
381 if ((bp->b_flags & B_BUSY) == 0)
382 panic("bwrite: buffer is not busy???");
383 #endif
384
385 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
386 bp->b_flags |= B_WRITEINPROG;
387
388 s = splbio();
389 if ((oldflags & B_DELWRI) == B_DELWRI) {
390 --numdirtybuffers;
391 reassignbuf(bp, bp->b_vp);
392 }
393
394 bp->b_vp->v_numoutput++;
395 vfs_busy_pages(bp, 1);
396 if (curproc != NULL)
397 curproc->p_stats->p_ru.ru_oublock++;
398 splx(s);
399 VOP_STRATEGY(bp->b_vp, bp);
400
401 /*
402 * Collect statistics on synchronous and asynchronous writes.
403 * Writes to block devices are charged to their associated
404 * filesystem (if any).
405 */
406 if ((vp = bp->b_vp) != NULL) {
407 if (vp->v_type == VBLK)
408 mp = vp->v_specmountpoint;
409 else
410 mp = vp->v_mount;
411 if (mp != NULL)
412 if ((oldflags & B_ASYNC) == 0)
413 mp->mnt_stat.f_syncwrites++;
414 else
415 mp->mnt_stat.f_asyncwrites++;
416 }
417
418 if ((oldflags & B_ASYNC) == 0) {
419 int rtval = biowait(bp);
420 brelse(bp);
421 return (rtval);
422 }
423 return (0);
424 }
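       /*
        * Editorial note: bwrite() is synchronous unless the caller set
        * B_ASYNC before entry.  In the async case the routine returns 0
        * right after queueing the I/O, and the buffer is released from
        * biodone() when the write completes.
        */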
425
426 void
427 vfs_bio_need_satisfy(void) {
428 ++numfreebuffers;
429 if (!needsbuffer)
430 return;
431 if (numdirtybuffers < lodirtybuffers) {
432 needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT);
433 } else {
434 needsbuffer &= ~VFS_BIO_NEED_ANY;
435 }
436 if (numfreebuffers >= hifreebuffers) {
437 needsbuffer &= ~VFS_BIO_NEED_FREE;
438 }
439 wakeup(&needsbuffer);
440 }
441
442 /*
443 * Delayed write. (Buffer is marked dirty).
444 */
445 void
446 bdwrite(struct buf * bp)
447 {
448 struct vnode *vp;
449
450 #if !defined(MAX_PERF)
451 if ((bp->b_flags & B_BUSY) == 0) {
452 panic("bdwrite: buffer is not busy");
453 }
454 #endif
455
456 if (bp->b_flags & B_INVAL) {
457 brelse(bp);
458 return;
459 }
460 bp->b_flags &= ~(B_READ|B_RELBUF);
461 if ((bp->b_flags & B_DELWRI) == 0) {
462 bp->b_flags |= B_DONE | B_DELWRI;
463 reassignbuf(bp, bp->b_vp);
464 ++numdirtybuffers;
465 }
466
467 /*
468 * This bmap keeps the system from needing to do the bmap later,
469 * perhaps when the system is attempting to do a sync. Since it
 470 * is likely that the indirect block -- or whatever other data
 471 * structure the filesystem needs -- is still in memory now, it is
 472 * a good thing to do this. Note also that if the pageout daemon
 473 * is requesting a sync, there might not be enough memory to do
 474 * the bmap then... so this is important to do now.
475 */
476 if (bp->b_lblkno == bp->b_blkno) {
477 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
478 }
479
480 /*
481 * Set the *dirty* buffer range based upon the VM system dirty pages.
482 */
483 vfs_setdirty(bp);
484
485 /*
486 * We need to do this here to satisfy the vnode_pager and the
487 * pageout daemon, so that it thinks that the pages have been
488 * "cleaned". Note that since the pages are in a delayed write
489 * buffer -- the VFS layer "will" see that the pages get written
490 * out on the next sync, or perhaps the cluster will be completed.
491 */
492 vfs_clean_pages(bp);
493 bqrelse(bp);
494
495 /*
496 * XXX The soft dependency code is not prepared to
497 * have I/O done when a bdwrite is requested. For
498 * now we just let the write be delayed if it is
499 * requested by the soft dependency code.
500 */
501 if ((vp = bp->b_vp) &&
502 ((vp->v_type == VBLK && vp->v_specmountpoint &&
503 (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) ||
504 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP))))
505 return;
506
507 if (numdirtybuffers >= hidirtybuffers)
508 flushdirtybuffers(0, 0);
509
510 return;
511 }
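       /*
        * Editorial summary of the write flavors defined here:
        *
        *      bwrite(bp)  - write now; waits unless B_ASYNC is set
        *      bdwrite(bp) - mark dirty and requeue; written out later
        *      bawrite(bp) - set B_ASYNC and bwrite; returns immediately
        *      bowrite(bp) - like bawrite, but B_ORDERED keeps the device
        *                    from reordering it against other queued I/O
        */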
512
513
514 /*
515 * Same as first half of bdwrite, mark buffer dirty, but do not release it.
516 * Check how this compares with vfs_setdirty(); XXX [JRE]
517 */
518 void
519 bdirty(bp)
520 struct buf *bp;
521 {
522
523 bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */
524 if ((bp->b_flags & B_DELWRI) == 0) {
525 bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */
526 reassignbuf(bp, bp->b_vp);
527 ++numdirtybuffers;
528 }
529 }
530
531 /*
532 * Asynchronous write.
533 * Start output on a buffer, but do not wait for it to complete.
534 * The buffer is released when the output completes.
535 */
536 void
537 bawrite(struct buf * bp)
538 {
539 bp->b_flags |= B_ASYNC;
540 (void) VOP_BWRITE(bp);
541 }
542
543 /*
544 * Ordered write.
545 * Start output on a buffer, and flag it so that the device will write
546 * it in the order it was queued. The buffer is released when the output
547 * completes.
548 */
549 int
550 bowrite(struct buf * bp)
551 {
552 bp->b_flags |= B_ORDERED|B_ASYNC;
553 return (VOP_BWRITE(bp));
554 }
555
556 /*
557 * Release a buffer.
558 */
559 void
560 brelse(struct buf * bp)
561 {
562 int s;
563
564 if (bp->b_flags & B_CLUSTER) {
565 relpbuf(bp);
566 return;
567 }
568
569 s = splbio();
570
571 /* anyone need this block? */
572 if (bp->b_flags & B_WANTED) {
573 bp->b_flags &= ~(B_WANTED | B_AGE);
574 wakeup(bp);
575 }
576
577 if (bp->b_flags & B_LOCKED)
578 bp->b_flags &= ~B_ERROR;
579
580 if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) {
581 bp->b_flags &= ~B_ERROR;
582 bdirty(bp);
583 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) ||
584 (bp->b_bufsize <= 0)) {
585 bp->b_flags |= B_INVAL;
586 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
587 (*bioops.io_deallocate)(bp);
588 if (bp->b_flags & B_DELWRI)
589 --numdirtybuffers;
590 bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF);
591 if ((bp->b_flags & B_VMIO) == 0) {
592 if (bp->b_bufsize)
593 allocbuf(bp, 0);
594 if (bp->b_vp)
595 brelvp(bp);
596 }
597 }
598
599 /*
600 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
601 * is called with B_DELWRI set, the underlying pages may wind up
602 * getting freed causing a previous write (bdwrite()) to get 'lost'
603 * because pages associated with a B_DELWRI bp are marked clean.
604 *
605 * We still allow the B_INVAL case to call vfs_vmio_release(), even
606 * if B_DELWRI is set.
607 */
608
609 if (bp->b_flags & B_DELWRI)
610 bp->b_flags &= ~B_RELBUF;
611
612 /*
 613 * VMIO buffer rundown. It is not strictly necessary to keep a VMIO buffer
614 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
615 * but the VM object is kept around. The B_NOCACHE flag is used to
616 * invalidate the pages in the VM object.
617 *
618 * The b_{validoff,validend,dirtyoff,dirtyend} values are relative
619 * to b_offset and currently have byte granularity, whereas the
620 * valid flags in the vm_pages have only DEV_BSIZE resolution.
621 * The byte resolution fields are used to avoid unnecessary re-reads
622 * of the buffer but the code really needs to be genericized so
623 * other filesystem modules can take advantage of these fields.
624 *
625 * XXX this seems to cause performance problems.
626 */
627 if ((bp->b_flags & B_VMIO)
628 && !(bp->b_vp->v_tag == VT_NFS &&
629 bp->b_vp->v_type != VBLK &&
630 (bp->b_flags & B_DELWRI) != 0)
631 #ifdef notdef
632 && (bp->b_vp->v_tag != VT_NFS
633 || bp->b_vp->v_type == VBLK
634 || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
635 || bp->b_validend == 0
636 || (bp->b_validoff == 0
637 && bp->b_validend == bp->b_bufsize))
638 #endif
639 ) {
640
641 int i, j, resid;
642 vm_page_t m;
643 off_t foff;
644 vm_pindex_t poff;
645 vm_object_t obj;
646 struct vnode *vp;
647
648 vp = bp->b_vp;
649
650 /*
651 * Get the base offset and length of the buffer. Note that
 652 * for block sizes that are less than PAGE_SIZE, the b_data
653 * base of the buffer does not represent exactly b_offset and
654 * neither b_offset nor b_size are necessarily page aligned.
655 * Instead, the starting position of b_offset is:
656 *
657 * b_data + (b_offset & PAGE_MASK)
658 *
 659 * block sizes less than DEV_BSIZE (usually 512) are not
660 * supported due to the page granularity bits (m->valid,
661 * m->dirty, etc...).
662 *
663 * See man buf(9) for more information
664 */
665
666 resid = bp->b_bufsize;
667 foff = bp->b_offset;
668
669 for (i = 0; i < bp->b_npages; i++) {
670 m = bp->b_pages[i];
671 vm_page_flag_clear(m, PG_ZERO);
672 if (m == bogus_page) {
673
674 obj = (vm_object_t) vp->v_object;
675 poff = OFF_TO_IDX(bp->b_offset);
676
677 for (j = i; j < bp->b_npages; j++) {
678 m = bp->b_pages[j];
679 if (m == bogus_page) {
680 m = vm_page_lookup(obj, poff + j);
681 #if !defined(MAX_PERF)
682 if (!m) {
683 panic("brelse: page missing\n");
684 }
685 #endif
686 bp->b_pages[j] = m;
687 }
688 }
689
690 if ((bp->b_flags & B_INVAL) == 0) {
691 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
692 }
693 }
694 if (bp->b_flags & (B_NOCACHE|B_ERROR)) {
695 int poffset = foff & PAGE_MASK;
696 int presid = resid > (PAGE_SIZE - poffset) ?
697 (PAGE_SIZE - poffset) : resid;
698
699 KASSERT(presid >= 0, ("brelse: extra page"));
700 vm_page_set_invalid(m, poffset, presid);
701 }
702 resid -= PAGE_SIZE - (foff & PAGE_MASK);
703 foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
704 }
705
706 if (bp->b_flags & (B_INVAL | B_RELBUF))
707 vfs_vmio_release(bp);
708
709 } else if (bp->b_flags & B_VMIO) {
710
711 if (bp->b_flags & (B_INVAL | B_RELBUF))
712 vfs_vmio_release(bp);
713
714 }
715
716 #if !defined(MAX_PERF)
717 if (bp->b_qindex != QUEUE_NONE)
718 panic("brelse: free buffer onto another queue???");
719 #endif
720
721 /* enqueue */
722 /* buffers with no memory */
723 if (bp->b_bufsize == 0) {
724 bp->b_flags |= B_INVAL;
725 bp->b_qindex = QUEUE_EMPTY;
726 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
727 LIST_REMOVE(bp, b_hash);
728 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
729 bp->b_dev = NODEV;
730 kvafreespace += bp->b_kvasize;
731
732 /* buffers with junk contents */
733 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
734 bp->b_flags |= B_INVAL;
735 bp->b_qindex = QUEUE_AGE;
736 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
737 LIST_REMOVE(bp, b_hash);
738 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
739 bp->b_dev = NODEV;
740
741 /* buffers that are locked */
742 } else if (bp->b_flags & B_LOCKED) {
743 bp->b_qindex = QUEUE_LOCKED;
744 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
745
746 /* buffers with stale but valid contents */
747 } else if (bp->b_flags & B_AGE) {
748 bp->b_qindex = QUEUE_AGE;
749 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
750
 751 /* buffers with valid and quite potentially reusable contents */
752 } else {
753 bp->b_qindex = QUEUE_LRU;
754 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
755 }
756
757 if ((bp->b_flags & B_INVAL) ||
758 (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
759 if (bp->b_flags & B_DELWRI) {
760 --numdirtybuffers;
761 bp->b_flags &= ~B_DELWRI;
762 }
763 vfs_bio_need_satisfy();
764 }
765
766 /* unlock */
767 bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
768 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
769 splx(s);
770 }
771
772 /*
773 * Release a buffer.
774 */
775 void
776 bqrelse(struct buf * bp)
777 {
778 int s;
779
780 s = splbio();
781
782 /* anyone need this block? */
783 if (bp->b_flags & B_WANTED) {
784 bp->b_flags &= ~(B_WANTED | B_AGE);
785 wakeup(bp);
786 }
787
788 #if !defined(MAX_PERF)
789 if (bp->b_qindex != QUEUE_NONE)
790 panic("bqrelse: free buffer onto another queue???");
791 #endif
792
793 if (bp->b_flags & B_LOCKED) {
794 bp->b_flags &= ~B_ERROR;
795 bp->b_qindex = QUEUE_LOCKED;
796 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
797 /* buffers with stale but valid contents */
798 } else {
799 bp->b_qindex = QUEUE_LRU;
800 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
801 }
802
803 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) {
804 vfs_bio_need_satisfy();
805 }
806
807 /* unlock */
808 bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
809 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
810 splx(s);
811 }
812
813 static void
814 vfs_vmio_release(bp)
815 struct buf *bp;
816 {
817 int i, s;
818 vm_page_t m;
819
820 s = splvm();
821 for (i = 0; i < bp->b_npages; i++) {
822 m = bp->b_pages[i];
823 bp->b_pages[i] = NULL;
824 /*
825 * In order to keep page LRU ordering consistent, put
826 * everything on the inactive queue.
827 */
828 vm_page_unwire(m, 0);
829 /*
830 * We don't mess with busy pages, it is
831 * the responsibility of the process that
832 * busied the pages to deal with them.
833 */
834 if ((m->flags & PG_BUSY) || (m->busy != 0))
835 continue;
836
837 if (m->wire_count == 0) {
838 vm_page_flag_clear(m, PG_ZERO);
839 /*
840 * Might as well free the page if we can and it has
841 * no valid data.
842 */
843 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
844 vm_page_busy(m);
845 vm_page_protect(m, VM_PROT_NONE);
846 vm_page_free(m);
847 }
848 }
849 }
850 splx(s);
851 bufspace -= bp->b_bufsize;
852 vmiospace -= bp->b_bufsize;
853 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
854 bp->b_npages = 0;
855 bp->b_bufsize = 0;
856 bp->b_flags &= ~B_VMIO;
857 if (bp->b_vp)
858 brelvp(bp);
859 }
860
861 /*
862 * Check to see if a block is currently memory resident.
863 */
864 struct buf *
865 gbincore(struct vnode * vp, daddr_t blkno)
866 {
867 struct buf *bp;
868 struct bufhashhdr *bh;
869
870 bh = BUFHASH(vp, blkno);
871 bp = bh->lh_first;
872
873 /* Search hash chain */
874 while (bp != NULL) {
875 /* hit */
876 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
877 (bp->b_flags & B_INVAL) == 0) {
878 break;
879 }
880 bp = bp->b_hash.le_next;
881 }
882 return (bp);
883 }
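       /*
        * Editorial note: gbincore() does not raise the spl itself; callers
        * are expected to be at splbio() already -- incore() below is the
        * wrapper that provides that protection.
        */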
884
885 /*
886 * this routine implements clustered async writes for
887 * clearing out B_DELWRI buffers... This is much better
888 * than the old way of writing only one buffer at a time.
889 */
890 int
891 vfs_bio_awrite(struct buf * bp)
892 {
893 int i;
894 daddr_t lblkno = bp->b_lblkno;
895 struct vnode *vp = bp->b_vp;
896 int s;
897 int ncl;
898 struct buf *bpa;
899 int nwritten;
900 int size;
901 int maxcl;
902
903 s = splbio();
904 /*
905 * right now we support clustered writing only to regular files
906 */
907 if ((vp->v_type == VREG) &&
908 (vp->v_mount != 0) && /* Only on nodes that have the size info */
909 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
910
911 size = vp->v_mount->mnt_stat.f_iosize;
912 maxcl = MAXPHYS / size;
913
914 for (i = 1; i < maxcl; i++) {
915 if ((bpa = gbincore(vp, lblkno + i)) &&
916 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
917 (B_DELWRI | B_CLUSTEROK)) &&
918 (bpa->b_bufsize == size)) {
919 if ((bpa->b_blkno == bpa->b_lblkno) ||
920 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
921 break;
922 } else {
923 break;
924 }
925 }
926 ncl = i;
927 /*
928 * this is a possible cluster write
929 */
930 if (ncl != 1) {
931 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
932 splx(s);
933 return nwritten;
934 }
935 }
936
937 bremfree(bp);
938 bp->b_flags |= B_BUSY | B_ASYNC;
939
940 splx(s);
941 /*
942 * default (old) behavior, writing out only one block
943 */
944 nwritten = bp->b_bufsize;
945 (void) VOP_BWRITE(bp);
946 return nwritten;
947 }
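       /*
        * Editorial example of the cluster sizing above: with a MAXPHYS of
        * 128K (a common value) and an 8K f_iosize, maxcl = 16, so up to 16
        * contiguous dirty B_CLUSTEROK buffers can be collapsed into one
        * cluster_wbuild() transfer.
        */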
948
949
950 /*
951 * Find a buffer header which is available for use.
952 */
953 static struct buf *
954 getnewbuf(struct vnode *vp, daddr_t blkno,
955 int slpflag, int slptimeo, int size, int maxsize)
956 {
957 struct buf *bp, *bp1;
958 int nbyteswritten = 0;
959 vm_offset_t addr;
960 static int writerecursion = 0;
961
962 start:
963 if (bufspace >= maxbufspace)
964 goto trytofreespace;
965
966 /* can we constitute a new buffer? */
967 if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
968 #if !defined(MAX_PERF)
969 if (bp->b_qindex != QUEUE_EMPTY)
970 panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
971 bp->b_qindex);
972 #endif
973 bp->b_flags |= B_BUSY;
974 bremfree(bp);
975 goto fillbuf;
976 }
977 trytofreespace:
978 /*
979 * We keep the file I/O from hogging metadata I/O
980 * This is desirable because file data is cached in the
981 * VM/Buffer cache even if a buffer is freed.
982 */
983 if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
984 #if !defined(MAX_PERF)
985 if (bp->b_qindex != QUEUE_AGE)
986 panic("getnewbuf: inconsistent AGE queue, qindex=%d",
987 bp->b_qindex);
988 #endif
989 } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
990 #if !defined(MAX_PERF)
991 if (bp->b_qindex != QUEUE_LRU)
992 panic("getnewbuf: inconsistent LRU queue, qindex=%d",
993 bp->b_qindex);
994 #endif
995 }
996 if (!bp) {
997 /* wait for a free buffer of any kind */
998 needsbuffer |= VFS_BIO_NEED_ANY;
999 do
1000 if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag,
1001 "newbuf", slptimeo))
1002 return (NULL);
1003 while (needsbuffer & VFS_BIO_NEED_ANY);
1004 return (0);
1005 }
1006 KASSERT(!(bp->b_flags & B_BUSY),
1007 ("getnewbuf: busy buffer on free list\n"));
1008 /*
1009 * We are fairly aggressive about freeing VMIO buffers, but since
1010 * the buffering is intact without buffer headers, there is not
1011 * much loss. We gain by maintaining non-VMIOed metadata in buffers.
1012 */
1013 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
1014 if ((bp->b_flags & B_VMIO) == 0 ||
1015 (vmiospace < maxvmiobufspace)) {
1016 --bp->b_usecount;
1017 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1018 if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1019 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1020 goto start;
1021 }
1022 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1023 }
1024 }
1025
1026
1027 /* if we are a delayed write, convert to an async write */
1028 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
1029
1030 /*
1031 * If our delayed write is likely to be used soon, then
1032 * recycle back onto the LRU queue.
1033 */
1034 if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) &&
1035 (bp->b_lblkno >= blkno) && (maxsize > 0)) {
1036
1037 if (bp->b_usecount > 0) {
1038 if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) {
1039
1040 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
1041
1042 if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
1043 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1044 bp->b_usecount--;
1045 goto start;
1046 }
1047 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
1048 }
1049 }
1050 }
1051
1052 /*
1053 * Certain layered filesystems can recursively re-enter the vfs_bio
1054 * code, due to delayed writes. This helps keep the system from
1055 * deadlocking.
1056 * This hack to avoid premature panic is courtesy of alfred
1057 * (alfred@freebsd.org)
1058 */
1059 if (writerecursion > 0) {
1060 if (writerecursion > 5) {
1061 int loop = 0;
1062 norecurse:
1063 bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1064 while (bp) {
1065 if ((bp->b_flags & B_DELWRI) == 0)
1066 break;
1067 bp = TAILQ_NEXT(bp, b_freelist);
1068 }
1069 if (bp == NULL) {
1070 bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1071 while (bp) {
1072 if ((bp->b_flags & B_DELWRI) == 0)
1073 break;
1074 bp = TAILQ_NEXT(bp, b_freelist);
1075 }
1076 }
1077 if (bp == NULL) {
1078 needsbuffer |= VFS_BIO_NEED_ANY;
1079 if (tsleep(&needsbuffer,
1080 (PRIBIO + 4) | slpflag,
1081 "nbufhack", slptimeo+1))
1082 return (NULL);
1083 if (loop++ < 5)
1084 goto norecurse;
1085 else
1086 goto start;
1087 }
1088 } else {
1089 bremfree(bp);
1090 bp->b_flags |= B_BUSY | B_AGE | B_ASYNC;
1091 nbyteswritten += bp->b_bufsize;
1092 ++writerecursion;
1093 VOP_BWRITE(bp);
1094 --writerecursion;
1095 if (!slpflag && !slptimeo) {
1096 return (0);
1097 }
1098 goto start;
1099 }
1100 } else {
1101 ++writerecursion;
1102 nbyteswritten += vfs_bio_awrite(bp);
1103 --writerecursion;
1104 if (!slpflag && !slptimeo) {
1105 return (0);
1106 }
1107 goto start;
1108 }
1109 }
1110
1111 if (bp->b_flags & B_WANTED) {
1112 bp->b_flags &= ~B_WANTED;
1113 wakeup(bp);
1114 }
1115 bremfree(bp);
1116 bp->b_flags |= B_BUSY;
1117
1118 if (bp->b_flags & B_VMIO) {
1119 bp->b_flags &= ~B_ASYNC;
1120 vfs_vmio_release(bp);
1121 }
1122
1123 if (bp->b_vp)
1124 brelvp(bp);
1125
1126 fillbuf:
1127
1128 /* we are not free, nor do we contain interesting data */
1129 if (bp->b_rcred != NOCRED) {
1130 crfree(bp->b_rcred);
1131 bp->b_rcred = NOCRED;
1132 }
1133 if (bp->b_wcred != NOCRED) {
1134 crfree(bp->b_wcred);
1135 bp->b_wcred = NOCRED;
1136 }
1137 if (LIST_FIRST(&bp->b_dep) != NULL &&
1138 bioops.io_deallocate)
1139 (*bioops.io_deallocate)(bp);
1140
1141 LIST_REMOVE(bp, b_hash);
1142 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
1143 if (bp->b_bufsize) {
1144 allocbuf(bp, 0);
1145 }
1146 bp->b_flags = B_BUSY;
1147 bp->b_dev = NODEV;
1148 bp->b_vp = NULL;
1149 bp->b_blkno = bp->b_lblkno = 0;
1150 bp->b_offset = NOOFFSET;
1151 bp->b_iodone = 0;
1152 bp->b_error = 0;
1153 bp->b_resid = 0;
1154 bp->b_bcount = 0;
1155 bp->b_npages = 0;
1156 bp->b_dirtyoff = bp->b_dirtyend = 0;
1157 bp->b_validoff = bp->b_validend = 0;
1158 bp->b_usecount = 5;
 1159 /* Here, not kern_physio.c, is where this should be done */
1160 LIST_INIT(&bp->b_dep);
1161
1162 maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
1163
1164 /*
1165 * we assume that buffer_map is not at address 0
1166 */
1167 addr = 0;
1168 if (maxsize != bp->b_kvasize) {
1169 bfreekva(bp);
1170
1171 findkvaspace:
1172 /*
1173 * See if we have buffer kva space
1174 */
1175 if (vm_map_findspace(buffer_map,
1176 vm_map_min(buffer_map), maxsize, &addr)) {
1177 if (kvafreespace > 0) {
1178 int totfree = 0, freed;
1179 do {
1180 freed = 0;
1181 for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
1182 bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) {
1183 if (bp1->b_kvasize != 0) {
1184 totfree += bp1->b_kvasize;
1185 freed = bp1->b_kvasize;
1186 bremfree(bp1);
1187 bfreekva(bp1);
1188 brelse(bp1);
1189 break;
1190 }
1191 }
1192 } while (freed);
1193 /*
1194 * if we found free space, then retry with the same buffer.
1195 */
1196 if (totfree)
1197 goto findkvaspace;
1198 }
1199 bp->b_flags |= B_INVAL;
1200 brelse(bp);
1201 goto trytofreespace;
1202 }
1203 }
1204
1205 /*
 1206 * See if we have exceeded our allocated buffer space
1207 */
1208 if (bufspace >= (maxbufspace + nbyteswritten)) {
1209 bp->b_flags |= B_INVAL;
1210 brelse(bp);
1211 goto trytofreespace;
1212 }
1213
1214 /*
1215 * create a map entry for the buffer -- in essence
1216 * reserving the kva space.
1217 */
1218 if (addr) {
1219 vm_map_insert(buffer_map, NULL, 0,
1220 addr, addr + maxsize,
1221 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
1222
1223 bp->b_kvabase = (caddr_t) addr;
1224 bp->b_kvasize = maxsize;
1225 }
1226 bp->b_data = bp->b_kvabase;
1227
1228 return (bp);
1229 }
1230
1231 static void
1232 waitfreebuffers(int slpflag, int slptimeo) {
1233 while (numfreebuffers < hifreebuffers) {
1234 flushdirtybuffers(slpflag, slptimeo);
1235 if (numfreebuffers >= hifreebuffers)
1236 break;
1237 needsbuffer |= VFS_BIO_NEED_FREE;
1238 if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo))
1239 break;
1240 }
1241 }
1242
1243 static void
1244 flushdirtybuffers(int slpflag, int slptimeo) {
1245 int s;
1246 static pid_t flushing = 0;
1247
1248 s = splbio();
1249
1250 if (flushing) {
1251 if (flushing == curproc->p_pid) {
1252 splx(s);
1253 return;
1254 }
1255 while (flushing) {
1256 if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) {
1257 splx(s);
1258 return;
1259 }
1260 }
1261 }
1262 flushing = curproc->p_pid;
1263
1264 while (numdirtybuffers > lodirtybuffers) {
1265 struct buf *bp;
1266 needsbuffer |= VFS_BIO_NEED_LOWLIMIT;
1267 bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]);
1268 if (bp == NULL)
1269 bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]);
1270
1271 while (bp && ((bp->b_flags & B_DELWRI) == 0)) {
1272 bp = TAILQ_NEXT(bp, b_freelist);
1273 }
1274
1275 if (bp) {
1276 vfs_bio_awrite(bp);
1277 continue;
1278 }
1279 break;
1280 }
1281
1282 flushing = 0;
1283 wakeup(&flushing);
1284 splx(s);
1285 }
1286
1287 /*
1288 * Check to see if a block is currently memory resident.
1289 */
1290 struct buf *
1291 incore(struct vnode * vp, daddr_t blkno)
1292 {
1293 struct buf *bp;
1294
1295 int s = splbio();
1296 bp = gbincore(vp, blkno);
1297 splx(s);
1298 return (bp);
1299 }
1300
1301 /*
1302 * Returns true if no I/O is needed to access the
1303 * associated VM object. This is like incore except
1304 * it also hunts around in the VM system for the data.
1305 */
1306
1307 int
1308 inmem(struct vnode * vp, daddr_t blkno)
1309 {
1310 vm_object_t obj;
1311 vm_offset_t toff, tinc, size;
1312 vm_page_t m;
1313 vm_ooffset_t off;
1314
1315 if (incore(vp, blkno))
1316 return 1;
1317 if (vp->v_mount == NULL)
1318 return 0;
1319 if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0)
1320 return 0;
1321
1322 obj = vp->v_object;
1323 size = PAGE_SIZE;
1324 if (size > vp->v_mount->mnt_stat.f_iosize)
1325 size = vp->v_mount->mnt_stat.f_iosize;
1326 off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
1327
1328 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1329 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1330 if (!m)
1331 return 0;
1332 tinc = size;
1333 if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
1334 tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
1335 if (vm_page_is_valid(m,
1336 (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
1337 return 0;
1338 }
1339 return 1;
1340 }
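       /*
        * Editorial example: with f_iosize = 8K and 4K pages, blkno 5 maps
        * to byte offset 40960, and the loop above checks the valid bits of
        * the two overlapping VM pages before declaring the block resident.
        */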
1341
1342 /*
1343 * now we set the dirty range for the buffer --
1344 * for NFS -- if the file is mapped and pages have
1345 * been written to, let it know. We want the
1346 * entire range of the buffer to be marked dirty if
 1347 * any of the pages have been written to for consistency
1348 * with the b_validoff, b_validend set in the nfs write
1349 * code, and used by the nfs read code.
1350 */
1351 static void
1352 vfs_setdirty(struct buf *bp) {
1353 int i;
1354 vm_object_t object;
1355 vm_offset_t boffset;
1356 #if 0
1357 vm_offset_t offset;
1358 #endif
1359
1360 /*
1361 * We qualify the scan for modified pages on whether the
1362 * object has been flushed yet. The OBJ_WRITEABLE flag
1363 * is not cleared simply by protecting pages off.
1364 */
1365 if ((bp->b_flags & B_VMIO) &&
1366 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1367 /*
1368 * test the pages to see if they have been modified directly
1369 * by users through the VM system.
1370 */
1371 for (i = 0; i < bp->b_npages; i++) {
1372 vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
1373 vm_page_test_dirty(bp->b_pages[i]);
1374 }
1375
1376 /*
1377 * scan forwards for the first page modified
1378 */
1379 for (i = 0; i < bp->b_npages; i++) {
1380 if (bp->b_pages[i]->dirty) {
1381 break;
1382 }
1383 }
1384 boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1385 if (boffset < bp->b_dirtyoff) {
1386 bp->b_dirtyoff = max(boffset, 0);
1387 }
1388
1389 /*
1390 * scan backwards for the last page modified
1391 */
1392 for (i = bp->b_npages - 1; i >= 0; --i) {
1393 if (bp->b_pages[i]->dirty) {
1394 break;
1395 }
1396 }
1397 boffset = (i + 1);
1398 #if 0
1399 offset = boffset + bp->b_pages[0]->pindex;
1400 if (offset >= object->size)
1401 boffset = object->size - bp->b_pages[0]->pindex;
1402 #endif
1403 boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
1404 if (bp->b_dirtyend < boffset)
1405 bp->b_dirtyend = min(boffset, bp->b_bufsize);
1406 }
1407 }
1408
1409 /*
1410 * Get a block given a specified block and offset into a file/device.
1411 */
1412 struct buf *
1413 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1414 {
1415 struct buf *bp;
1416 int i, s;
1417 struct bufhashhdr *bh;
1418
1419 #if !defined(MAX_PERF)
1420 if (size > MAXBSIZE)
1421 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1422 #endif
1423
1424 s = splbio();
1425 loop:
1426 if (numfreebuffers < lofreebuffers) {
1427 waitfreebuffers(slpflag, slptimeo);
1428 }
1429
1430 if ((bp = gbincore(vp, blkno))) {
1431 if (bp->b_flags & B_BUSY) {
1432
1433 bp->b_flags |= B_WANTED;
1434 if (bp->b_usecount < BUF_MAXUSE)
1435 ++bp->b_usecount;
1436
1437 if (!tsleep(bp,
1438 (PRIBIO + 4) | slpflag, "getblk", slptimeo)) {
1439 goto loop;
1440 }
1441
1442 splx(s);
1443 return (struct buf *) NULL;
1444 }
1445 bp->b_flags |= B_BUSY | B_CACHE;
1446 bremfree(bp);
1447
1448 /*
 1449 * check for size inconsistencies (note that they shouldn't
1450 * happen but do when filesystems don't handle the size changes
1451 * correctly.) We are conservative on metadata and don't just
1452 * extend the buffer but write (if needed) and re-constitute it.
1453 */
1454
1455 if (bp->b_bcount != size) {
1456 if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1457 allocbuf(bp, size);
1458 } else {
1459 if (bp->b_flags & B_DELWRI) {
1460 bp->b_flags |= B_NOCACHE;
1461 VOP_BWRITE(bp);
1462 } else {
1463 if ((bp->b_flags & B_VMIO) &&
1464 (LIST_FIRST(&bp->b_dep) == NULL)) {
1465 bp->b_flags |= B_RELBUF;
1466 brelse(bp);
1467 } else {
1468 bp->b_flags |= B_NOCACHE;
1469 VOP_BWRITE(bp);
1470 }
1471 }
1472 goto loop;
1473 }
1474 }
1475 KASSERT(bp->b_offset != NOOFFSET,
1476 ("getblk: no buffer offset"));
1477 /*
 1478 * Check that the constituted buffer really deserves to have the
 1479 * B_CACHE bit set. B_VMIO type buffers might not
1480 * contain fully valid pages. Normal (old-style) buffers
1481 * should be fully valid.
1482 */
1483 if (
1484 (bp->b_flags & (B_VMIO|B_CACHE)) == (B_VMIO|B_CACHE) &&
1485 (bp->b_vp->v_tag != VT_NFS || bp->b_validend <= 0)
1486 ) {
1487 int checksize = bp->b_bufsize;
1488 int poffset = bp->b_offset & PAGE_MASK;
1489 int resid;
1490 for (i = 0; i < bp->b_npages; i++) {
1491 resid = (checksize > (PAGE_SIZE - poffset)) ?
1492 (PAGE_SIZE - poffset) : checksize;
1493 if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) {
1494 bp->b_flags &= ~(B_CACHE | B_DONE);
1495 break;
1496 }
1497 checksize -= resid;
1498 poffset = 0;
1499 }
1500 }
1501
1502 if (bp->b_usecount < BUF_MAXUSE)
1503 ++bp->b_usecount;
1504 splx(s);
1505 return (bp);
1506 } else {
1507 int bsize, maxsize, vmio;
1508 off_t offset;
1509
1510 if (vp->v_type == VBLK)
1511 bsize = DEV_BSIZE;
1512 else if (vp->v_mountedhere)
1513 bsize = vp->v_mountedhere->mnt_stat.f_iosize;
1514 else if (vp->v_mount)
1515 bsize = vp->v_mount->mnt_stat.f_iosize;
1516 else
1517 bsize = size;
1518
1519 offset = (off_t)blkno * bsize;
1520 vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF);
1521 maxsize = vmio ? size + (offset & PAGE_MASK) : size;
1522 maxsize = imax(maxsize, bsize);
1523
1524 if ((bp = getnewbuf(vp, blkno,
1525 slpflag, slptimeo, size, maxsize)) == 0) {
1526 if (slpflag || slptimeo) {
1527 splx(s);
1528 return NULL;
1529 }
1530 goto loop;
1531 }
1532
1533 /*
1534 * This code is used to make sure that a buffer is not
1535 * created while the getnewbuf routine is blocked.
1536 * This can be a problem whether the vnode is locked or not.
1537 */
1538 if (gbincore(vp, blkno)) {
1539 bp->b_flags |= B_INVAL;
1540 brelse(bp);
1541 goto loop;
1542 }
1543
1544 /*
1545 * Insert the buffer into the hash, so that it can
1546 * be found by incore.
1547 */
1548 bp->b_blkno = bp->b_lblkno = blkno;
1549 bp->b_offset = offset;
1550
1551 bgetvp(vp, bp);
1552 LIST_REMOVE(bp, b_hash);
1553 bh = BUFHASH(vp, blkno);
1554 LIST_INSERT_HEAD(bh, bp, b_hash);
1555
1556 if (vmio) {
1557 bp->b_flags |= (B_VMIO | B_CACHE);
1558 #if defined(VFS_BIO_DEBUG)
1559 if (vp->v_type != VREG && vp->v_type != VBLK)
1560 printf("getblk: vmioing file type %d???\n", vp->v_type);
1561 #endif
1562 } else {
1563 bp->b_flags &= ~B_VMIO;
1564 }
1565
1566 allocbuf(bp, size);
1567
1568 splx(s);
1569 return (bp);
1570 }
1571 }
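       /*
        * Usage sketch (editorial): the canonical getblk() pattern, as used
        * by bread() above -- test B_CACHE to decide whether I/O is needed
        * (credential and error handling omitted for brevity):
        *
        *      bp = getblk(vp, blkno, size, 0, 0);
        *      if ((bp->b_flags & B_CACHE) == 0) {
        *              bp->b_flags |= B_READ;
        *              vfs_busy_pages(bp, 0);
        *              VOP_STRATEGY(vp, bp);
        *              error = biowait(bp);
        *      }
        */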
1572
1573 /*
1574 * Get an empty, disassociated buffer of given size.
1575 */
1576 struct buf *
1577 geteblk(int size)
1578 {
1579 struct buf *bp;
1580 int s;
1581
1582 s = splbio();
1583 while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0);
1584 splx(s);
1585 allocbuf(bp, size);
1586 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
1587 return (bp);
1588 }
1589
1590
1591 /*
1592 * This code constitutes the buffer memory from either anonymous system
1593 * memory (in the case of non-VMIO operations) or from an associated
1594 * VM object (in the case of VMIO operations).
1595 *
1596 * Note that this code is tricky, and has many complications to resolve
 1597 * deadlock or inconsistent data situations. Tread lightly!!!
1598 *
1599 * Modify the length of a buffer's underlying buffer storage without
1600 * destroying information (unless, of course the buffer is shrinking).
1601 */
1602 int
1603 allocbuf(struct buf * bp, int size)
1604 {
1605
1606 int s;
1607 int newbsize, mbsize;
1608 int i;
1609
1610 #if !defined(MAX_PERF)
1611 if (!(bp->b_flags & B_BUSY))
1612 panic("allocbuf: buffer not busy");
1613
1614 if (bp->b_kvasize < size)
1615 panic("allocbuf: buffer too small");
1616 #endif
1617
1618 if ((bp->b_flags & B_VMIO) == 0) {
1619 caddr_t origbuf;
1620 int origbufsize;
1621 /*
1622 * Just get anonymous memory from the kernel
1623 */
1624 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1625 #if !defined(NO_B_MALLOC)
1626 if (bp->b_flags & B_MALLOC)
1627 newbsize = mbsize;
1628 else
1629 #endif
1630 newbsize = round_page(size);
1631
1632 if (newbsize < bp->b_bufsize) {
1633 #if !defined(NO_B_MALLOC)
1634 /*
1635 * malloced buffers are not shrunk
1636 */
1637 if (bp->b_flags & B_MALLOC) {
1638 if (newbsize) {
1639 bp->b_bcount = size;
1640 } else {
1641 free(bp->b_data, M_BIOBUF);
1642 bufspace -= bp->b_bufsize;
1643 bufmallocspace -= bp->b_bufsize;
1644 bp->b_data = bp->b_kvabase;
1645 bp->b_bufsize = 0;
1646 bp->b_bcount = 0;
1647 bp->b_flags &= ~B_MALLOC;
1648 }
1649 return 1;
1650 }
1651 #endif
1652 vm_hold_free_pages(
1653 bp,
1654 (vm_offset_t) bp->b_data + newbsize,
1655 (vm_offset_t) bp->b_data + bp->b_bufsize);
1656 } else if (newbsize > bp->b_bufsize) {
1657 #if !defined(NO_B_MALLOC)
1658 /*
 1659 * We only use malloced memory on the first allocation,
 1660 * and revert to page-allocated memory when the buffer grows.
1661 */
1662 if ( (bufmallocspace < maxbufmallocspace) &&
1663 (bp->b_bufsize == 0) &&
1664 (mbsize <= PAGE_SIZE/2)) {
1665
1666 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1667 bp->b_bufsize = mbsize;
1668 bp->b_bcount = size;
1669 bp->b_flags |= B_MALLOC;
1670 bufspace += mbsize;
1671 bufmallocspace += mbsize;
1672 return 1;
1673 }
1674 #endif
1675 origbuf = NULL;
1676 origbufsize = 0;
1677 #if !defined(NO_B_MALLOC)
1678 /*
1679 * If the buffer is growing on its other-than-first allocation,
1680 * then we revert to the page-allocation scheme.
1681 */
1682 if (bp->b_flags & B_MALLOC) {
1683 origbuf = bp->b_data;
1684 origbufsize = bp->b_bufsize;
1685 bp->b_data = bp->b_kvabase;
1686 bufspace -= bp->b_bufsize;
1687 bufmallocspace -= bp->b_bufsize;
1688 bp->b_bufsize = 0;
1689 bp->b_flags &= ~B_MALLOC;
1690 newbsize = round_page(newbsize);
1691 }
1692 #endif
1693 vm_hold_load_pages(
1694 bp,
1695 (vm_offset_t) bp->b_data + bp->b_bufsize,
1696 (vm_offset_t) bp->b_data + newbsize);
1697 #if !defined(NO_B_MALLOC)
1698 if (origbuf) {
1699 bcopy(origbuf, bp->b_data, origbufsize);
1700 free(origbuf, M_BIOBUF);
1701 }
1702 #endif
1703 }
1704 } else {
1705 vm_page_t m;
1706 int desiredpages;
1707
1708 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1709 desiredpages = (size == 0) ? 0 :
1710 num_pages((bp->b_offset & PAGE_MASK) + newbsize);
1711
1712 #if !defined(NO_B_MALLOC)
1713 if (bp->b_flags & B_MALLOC)
1714 panic("allocbuf: VMIO buffer can't be malloced");
1715 #endif
1716
1717 if (newbsize < bp->b_bufsize) {
1718 if (desiredpages < bp->b_npages) {
1719 for (i = desiredpages; i < bp->b_npages; i++) {
1720 /*
1721 * the page is not freed here -- it
1722 * is the responsibility of
1723 * vnode_pager_setsize. However, we
1724 * have to wait if it is busy in order
1725 * to be able to unwire the page.
1726 */
1727 m = bp->b_pages[i];
1728 KASSERT(m != bogus_page,
1729 ("allocbuf: bogus page found"));
1730
1731 while(vm_page_sleep(m, "biodep", &m->busy))
1732 ;
1733
1734 bp->b_pages[i] = NULL;
1735 vm_page_unwire(m, 0);
1736 }
1737 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
1738 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1739 bp->b_npages = desiredpages;
1740 }
1741 } else if (newbsize > bp->b_bufsize) {
1742 vm_object_t obj;
1743 vm_offset_t tinc, toff;
1744 vm_ooffset_t off;
1745 vm_pindex_t objoff;
1746 int pageindex, curbpnpages;
1747 struct vnode *vp;
1748 int bsize;
1749 int orig_validoff = bp->b_validoff;
1750 int orig_validend = bp->b_validend;
1751
1752 vp = bp->b_vp;
1753
1754 if (vp->v_type == VBLK)
1755 bsize = DEV_BSIZE;
1756 else
1757 bsize = vp->v_mount->mnt_stat.f_iosize;
1758
1759 if (bp->b_npages < desiredpages) {
1760 obj = vp->v_object;
1761 tinc = PAGE_SIZE;
1762
1763 off = bp->b_offset;
1764 KASSERT(bp->b_offset != NOOFFSET,
1765 ("allocbuf: no buffer offset"));
1766 curbpnpages = bp->b_npages;
1767 doretry:
1768 bp->b_validoff = orig_validoff;
1769 bp->b_validend = orig_validend;
1770 bp->b_flags |= B_CACHE;
1771 for (toff = 0; toff < newbsize; toff += tinc) {
1772 objoff = OFF_TO_IDX(off + toff);
1773 pageindex = objoff - OFF_TO_IDX(off);
1774 tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK);
1775 if (pageindex < curbpnpages) {
1776
1777 m = bp->b_pages[pageindex];
1778 #ifdef VFS_BIO_DIAG
1779 if (m->pindex != objoff)
1780 panic("allocbuf: page changed offset?!!!?");
1781 #endif
1782 if (tinc > (newbsize - toff))
1783 tinc = newbsize - toff;
1784 if (bp->b_flags & B_CACHE)
1785 vfs_buf_set_valid(bp, off, toff, tinc, m);
1786 continue;
1787 }
1788 m = vm_page_lookup(obj, objoff);
1789 if (!m) {
1790 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1791 if (!m) {
1792 VM_WAIT;
1793 vm_pageout_deficit += (desiredpages - curbpnpages);
1794 goto doretry;
1795 }
1796
1797 vm_page_wire(m);
1798 vm_page_flag_clear(m, PG_BUSY);
1799 bp->b_flags &= ~B_CACHE;
1800
1801 } else if (m->flags & PG_BUSY) {
1802 s = splvm();
1803 if (m->flags & PG_BUSY) {
1804 vm_page_flag_set(m, PG_WANTED);
1805 tsleep(m, PVM, "pgtblk", 0);
1806 }
1807 splx(s);
1808 goto doretry;
1809 } else {
1810 if ((curproc != pageproc) &&
1811 ((m->queue - m->pc) == PQ_CACHE) &&
1812 ((cnt.v_free_count + cnt.v_cache_count) <
1813 (cnt.v_free_min + cnt.v_cache_min))) {
1814 pagedaemon_wakeup();
1815 }
1816 if (tinc > (newbsize - toff))
1817 tinc = newbsize - toff;
1818 if (bp->b_flags & B_CACHE)
1819 vfs_buf_set_valid(bp, off, toff, tinc, m);
1820 vm_page_flag_clear(m, PG_ZERO);
1821 vm_page_wire(m);
1822 }
1823 bp->b_pages[pageindex] = m;
1824 curbpnpages = pageindex + 1;
1825 }
1826 if (vp->v_tag == VT_NFS &&
1827 vp->v_type != VBLK) {
1828 if (bp->b_dirtyend > 0) {
1829 bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
1830 bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
1831 }
1832 if (bp->b_validend == 0)
1833 bp->b_flags &= ~B_CACHE;
1834 }
1835 bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data);
1836 bp->b_npages = curbpnpages;
1837 pmap_qenter((vm_offset_t) bp->b_data,
1838 bp->b_pages, bp->b_npages);
1839 ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1840 }
1841 }
1842 }
1843 if (bp->b_flags & B_VMIO)
1844 vmiospace += (newbsize - bp->b_bufsize);
1845 bufspace += (newbsize - bp->b_bufsize);
1846 bp->b_bufsize = newbsize;
1847 bp->b_bcount = size;
1848 return 1;
1849 }
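       /*
        * Editorial example of the non-VMIO sizing rules above: a 300 byte
        * request on a fresh buffer rounds up to mbsize = 512 (DEV_BSIZE)
        * and, being at most PAGE_SIZE/2, is satisfied with malloc()ed
        * memory (B_MALLOC); a later growth reverts the buffer to
        * page-backed kva, copying the old contents across.
        */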
1850
1851 /*
1852 * Wait for buffer I/O completion, returning error status.
1853 */
1854 int
1855 biowait(register struct buf * bp)
1856 {
1857 int s;
1858
1859 s = splbio();
1860 while ((bp->b_flags & B_DONE) == 0)
1861 #if defined(NO_SCHEDULE_MODS)
1862 tsleep(bp, PRIBIO, "biowait", 0);
1863 #else
1864 if (bp->b_flags & B_READ)
1865 tsleep(bp, PRIBIO, "biord", 0);
1866 else
1867 tsleep(bp, PRIBIO, "biowr", 0);
1868 #endif
1869 splx(s);
1870 if (bp->b_flags & B_EINTR) {
1871 bp->b_flags &= ~B_EINTR;
1872 return (EINTR);
1873 }
1874 if (bp->b_flags & B_ERROR) {
1875 return (bp->b_error ? bp->b_error : EIO);
1876 } else {
1877 return (0);
1878 }
1879 }
1880
1881 /*
1882 * Finish I/O on a buffer, calling an optional function.
1883 * This is usually called from interrupt level, so process blocking
1884 * is not *a good idea*.
1885 */
1886 void
1887 biodone(register struct buf * bp)
1888 {
1889 int s;
1890
1891 s = splbio();
1892
1893 #if !defined(MAX_PERF)
1894 if (!(bp->b_flags & B_BUSY))
1895 panic("biodone: buffer not busy");
1896 #endif
1897
1898 if (bp->b_flags & B_DONE) {
1899 splx(s);
1900 #if !defined(MAX_PERF)
1901 printf("biodone: buffer already done\n");
1902 #endif
1903 return;
1904 }
1905 bp->b_flags |= B_DONE;
1906
1907 if (bp->b_flags & B_FREEBUF) {
1908 brelse(bp);
1909 splx(s);
1910 return;
1911 }
1912
1913 if ((bp->b_flags & B_READ) == 0) {
1914 vwakeup(bp);
1915 }
1916
1917 /* call optional completion function if requested */
1918 if (bp->b_flags & B_CALL) {
1919 bp->b_flags &= ~B_CALL;
1920 (*bp->b_iodone) (bp);
1921 splx(s);
1922 return;
1923 }
1924 if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
1925 (*bioops.io_complete)(bp);
1926
1927 if (bp->b_flags & B_VMIO) {
1928 int i, resid;
1929 vm_ooffset_t foff;
1930 vm_page_t m;
1931 vm_object_t obj;
1932 int iosize;
1933 struct vnode *vp = bp->b_vp;
1934
1935 obj = vp->v_object;
1936
1937 #if defined(VFS_BIO_DEBUG)
1938 if (vp->v_usecount == 0) {
1939 panic("biodone: zero vnode ref count");
1940 }
1941
1942 if (vp->v_object == NULL) {
1943 panic("biodone: missing VM object");
1944 }
1945
1946 if ((vp->v_flag & VOBJBUF) == 0) {
1947 panic("biodone: vnode is not setup for merged cache");
1948 }
1949 #endif
1950
1951 foff = bp->b_offset;
1952 KASSERT(bp->b_offset != NOOFFSET,
1953 ("biodone: no buffer offset"));
1954
1955 #if !defined(MAX_PERF)
1956 if (!obj) {
1957 panic("biodone: no object");
1958 }
1959 #endif
1960 #if defined(VFS_BIO_DEBUG)
1961 if (obj->paging_in_progress < bp->b_npages) {
1962 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1963 obj->paging_in_progress, bp->b_npages);
1964 }
1965 #endif
1966 iosize = bp->b_bufsize;
1967 for (i = 0; i < bp->b_npages; i++) {
1968 int bogusflag = 0;
1969 m = bp->b_pages[i];
1970 if (m == bogus_page) {
1971 bogusflag = 1;
1972 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1973 if (!m) {
1974 #if defined(VFS_BIO_DEBUG)
1975 printf("biodone: page disappeared\n");
1976 #endif
1977 vm_object_pip_subtract(obj, 1);
1978 continue;
1979 }
1980 bp->b_pages[i] = m;
1981 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
1982 }
1983 #if defined(VFS_BIO_DEBUG)
1984 if (OFF_TO_IDX(foff) != m->pindex) {
1985 printf(
1986 "biodone: foff(%lu)/m->pindex(%d) mismatch\n",
1987 (unsigned long)foff, m->pindex);
1988 }
1989 #endif
1990 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1991 if (resid > iosize)
1992 resid = iosize;
1993
1994 /*
1995 * In the write case, the valid and clean bits are
1996 * already changed correctly, so we only need to do this
1997 * here in the read case.
1998 */
1999 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
2000 vfs_page_set_valid(bp, foff, i, m);
2001 }
2002 vm_page_flag_clear(m, PG_ZERO);
2003
2004 /*
2005 * when debugging new filesystems or buffer I/O methods, this
2006 * is the most common error that pops up. if you see this, you
2007 * have not set the page busy flag correctly!!!
2008 */
2009 if (m->busy == 0) {
2010 #if !defined(MAX_PERF)
2011 printf("biodone: page busy < 0, "
2012 "pindex: %d, foff: 0x(%x,%x), "
2013 "resid: %d, index: %d\n",
2014 (int) m->pindex, (int)(foff >> 32),
2015 (int) foff & 0xffffffff, resid, i);
2016 #endif
2017 if (vp->v_type != VBLK)
2018 #if !defined(MAX_PERF)
2019 printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
2020 bp->b_vp->v_mount->mnt_stat.f_iosize,
2021 (int) bp->b_lblkno,
2022 bp->b_flags, bp->b_npages);
2023 else
2024 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
2025 (int) bp->b_lblkno,
2026 bp->b_flags, bp->b_npages);
2027 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
2028 m->valid, m->dirty, m->wire_count);
2029 #endif
2030 panic("biodone: page busy < 0\n");
2031 }
2032 vm_page_io_finish(m);
2033 vm_object_pip_subtract(obj, 1);
2034 foff += resid;
2035 iosize -= resid;
2036 }
2037 if (obj &&
2038 (obj->paging_in_progress == 0) &&
2039 (obj->flags & OBJ_PIPWNT)) {
2040 vm_object_clear_flag(obj, OBJ_PIPWNT);
2041 wakeup(obj);
2042 }
2043 }
2044 /*
2045 * For asynchronous completions, release the buffer now. The brelse
2046 * checks for B_WANTED and will do the wakeup there if necessary - so
2047 * no need to do a wakeup here in the async case.
2048 */
2049
2050 if (bp->b_flags & B_ASYNC) {
2051 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
2052 brelse(bp);
2053 else
2054 bqrelse(bp);
2055 } else {
2056 bp->b_flags &= ~B_WANTED;
2057 wakeup(bp);
2058 }
2059 splx(s);
2060 }
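       /*
        * Usage sketch (editorial): drivers wanting a completion callback
        * set B_CALL and b_iodone before starting the I/O; biodone() then
        * invokes the function instead of doing the wakeup itself:
        *
        *      bp->b_flags |= B_CALL;
        *      bp->b_iodone = my_done;         -- hypothetical callback
        *      VOP_STRATEGY(vp, bp);
        */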
2061
2062 #if 0 /* not with kirks code */
2063 static int vfs_update_interval = 30;
2064
2065 static void
2066 vfs_update()
2067 {
2068 while (1) {
2069 tsleep(&vfs_update_wakeup, PUSER, "update",
2070 hz * vfs_update_interval);
2071 vfs_update_wakeup = 0;
2072 sync(curproc, NULL);
2073 }
2074 }
2075
2076 static int
2077 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
2078 {
2079 int error = sysctl_handle_int(oidp,
2080 oidp->oid_arg1, oidp->oid_arg2, req);
2081 if (!error)
2082 wakeup(&vfs_update_wakeup);
2083 return error;
2084 }
2085
2086 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
2087 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
2088
2089 #endif
2090
2091
2092 /*
2093 * This routine is called in lieu of iodone in the case of
2094 * incomplete I/O. This keeps the busy status for pages
 2095 * consistent.
2096 */
2097 void
2098 vfs_unbusy_pages(struct buf * bp)
2099 {
2100 int i;
2101
2102 if (bp->b_flags & B_VMIO) {
2103 struct vnode *vp = bp->b_vp;
2104 vm_object_t obj = vp->v_object;
2105
2106 for (i = 0; i < bp->b_npages; i++) {
2107 vm_page_t m = bp->b_pages[i];
2108
2109 if (m == bogus_page) {
2110 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
2111 #if !defined(MAX_PERF)
2112 if (!m) {
2113 panic("vfs_unbusy_pages: page missing\n");
2114 }
2115 #endif
2116 bp->b_pages[i] = m;
2117 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2118 }
2119 vm_object_pip_subtract(obj, 1);
2120 vm_page_flag_clear(m, PG_ZERO);
2121 vm_page_io_finish(m);
2122 }
2123 if (obj->paging_in_progress == 0 &&
2124 (obj->flags & OBJ_PIPWNT)) {
2125 vm_object_clear_flag(obj, OBJ_PIPWNT);
2126 wakeup(obj);
2127 }
2128 }
2129 }
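
/*
 * Hedged sketch of the intended use: if a transfer is abandoned before
 * any I/O takes place, the pages busied by vfs_busy_pages() must be
 * released here instead of going through biodone().  The failure
 * predicate below is hypothetical.
 */
#if 0
	vfs_busy_pages(bp, 0);
	if (device_cannot_transfer(bp)) {	/* hypothetical predicate */
		vfs_unbusy_pages(bp);		/* in lieu of biodone() */
		bp->b_flags |= B_ERROR;
		bp->b_error = ENXIO;
	}
#endif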
2130
2131 /*
2132 * Set NFS' b_validoff and b_validend fields from the valid bits
2133 * of a page. If the consumer is not NFS, and the page is not
2134 * valid for the entire range, clear the B_CACHE flag to force
2135 * the consumer to re-read the page.
2136 */
2137 static void
2138 vfs_buf_set_valid(struct buf *bp,
2139 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
2140 vm_page_t m)
2141 {
2142 if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
2143 vm_offset_t svalid, evalid;
2144 int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE);
2145
2146 /*
2147 * Only the first contiguous run of valid blocks in the
2148 * page is considered.
2149 */
2150 svalid = off;
2151 while (validbits && !(validbits & 1)) {
2152 svalid += DEV_BSIZE;
2153 validbits >>= 1;
2154 }
2155 evalid = svalid;
2156 while (validbits & 1) {
2157 evalid += DEV_BSIZE;
2158 validbits >>= 1;
2159 }
2160 evalid = min(evalid, off + size);
2161 /*
2162 * Make sure this range is contiguous with the range
2163 * built up from previous pages. If not, then we will
2164 * just use the range from the previous pages.
2165 */
2166 if (svalid == bp->b_validend) {
2167 bp->b_validoff = min(bp->b_validoff, svalid);
2168 bp->b_validend = max(bp->b_validend, evalid);
2169 }
2170 } else if (!vm_page_is_valid(m,
2171 (vm_offset_t) ((foff + off) & PAGE_MASK),
2172 size)) {
2173 bp->b_flags &= ~B_CACHE;
2174 }
2175 }
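
/*
 * Worked example of the scan above (assuming DEV_BSIZE == 512 and a
 * page-aligned foff + off of 0): with m->valid == 0x3c, i.e. blocks
 * 2-5 of the page valid, the first loop skips two clear bits giving
 * svalid == 1024, and the second loop consumes four set bits giving
 * evalid == 3072.  The buffer's valid range is then extended only if
 * it already ended exactly at offset 1024.
 */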
2176
2177 /*
2178 * Set the valid bits in a page, taking care of the b_validoff,
2179 * b_validend fields which NFS uses to optimise small reads. Off is
2180 * the offset within the file and pageno is the page index within the buf.
2181 */
2182 static void
2183 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
2184 {
2185 struct vnode *vp = bp->b_vp;
2186 vm_ooffset_t soff, eoff;
2187
2188 soff = off;
2189 eoff = (off + PAGE_SIZE) & ~PAGE_MASK;
2190 if (eoff > bp->b_offset + bp->b_bufsize)
2191 eoff = bp->b_offset + bp->b_bufsize;
2192 if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
2193 vm_ooffset_t sv, ev;
2194 vm_page_set_invalid(m,
2195 (vm_offset_t) (soff & PAGE_MASK),
2196 (vm_offset_t) (eoff - soff));
2197 sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
2198 ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) &
2199 ~(DEV_BSIZE - 1);
2200 soff = qmax(sv, soff);
2201 eoff = qmin(ev, eoff);
2202 }
2203 if (eoff > soff)
2204 vm_page_set_validclean(m,
2205 (vm_offset_t) (soff & PAGE_MASK),
2206 (vm_offset_t) (eoff - soff));
2207 }
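
/*
 * Worked example of the DEV_BSIZE rounding above (illustrative values,
 * DEV_BSIZE == 512): with b_offset == 8192, b_validoff == 100 and
 * b_validend == 1000, sv becomes 8704 (8192 + 512) and ev becomes
 * 9216 (8192 + 1024); soff and eoff are then clamped into that window
 * before vm_page_set_validclean() is called.
 */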
2208
2209 /*
2210 * This routine is called before a device strategy routine.
2211 * It is used to tell the VM system that paging I/O is in
2212 * progress and to treat the pages associated with the buffer
2213 * almost as if they were PG_BUSY. The object's paging_in_progress
2214 * counter is also maintained so that the object doesn't become
2215 * inconsistent.
2216 */
2217 void
2218 vfs_busy_pages(struct buf * bp, int clear_modify)
2219 {
2220 int i, bogus;
2221
2222 if (bp->b_flags & B_VMIO) {
2223 struct vnode *vp = bp->b_vp;
2224 vm_object_t obj = vp->v_object;
2225 vm_ooffset_t foff;
2226
2227 foff = bp->b_offset;
2228 KASSERT(bp->b_offset != NOOFFSET,
2229 ("vfs_busy_pages: no buffer offset"));
2230 vfs_setdirty(bp);
2231
2232 retry:
2233 for (i = 0; i < bp->b_npages; i++) {
2234 vm_page_t m = bp->b_pages[i];
2235 if (vm_page_sleep(m, "vbpage", NULL))
2236 goto retry;
2237 }
2238
2239 bogus = 0;
2240 for (i = 0; i < bp->b_npages; i++) {
2241 vm_page_t m = bp->b_pages[i];
2242
2243 vm_page_flag_clear(m, PG_ZERO);
2244 if ((bp->b_flags & B_CLUSTER) == 0) {
2245 vm_object_pip_add(obj, 1);
2246 vm_page_io_start(m);
2247 }
2248
2249 vm_page_protect(m, VM_PROT_NONE);
2250 if (clear_modify)
2251 vfs_page_set_valid(bp, foff, i, m);
2252 else if (m->valid == VM_PAGE_BITS_ALL &&
2253 (bp->b_flags & B_CACHE) == 0) {
2254 bp->b_pages[i] = bogus_page;
2255 bogus++;
2256 }
2257 foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2258 }
2259 if (bogus)
2260 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
2261 }
2262 }
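
/*
 * Hedged sketch of the normal bracketing around a strategy call; the
 * B_READ test mirrors the clear_modify convention (nonzero for
 * writes), and error handling is elided:
 */
#if 0
	vfs_busy_pages(bp, (bp->b_flags & B_READ) == 0);
	VOP_STRATEGY(bp->b_vp, bp);
	if ((bp->b_flags & B_ASYNC) == 0)
		error = biowait(bp);	/* biodone() unbusies the pages */
#endif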
2263
2264 /*
2265 * Tell the VM system that the pages associated with this buffer
2266 * are clean. This is used for delayed writes where the data is
2267 * going to go to disk eventually without additional VM intervention.
2268 */
2269 void
2270 vfs_clean_pages(struct buf * bp)
2271 {
2272 int i;
2273
2274 if (bp->b_flags & B_VMIO) {
2275 vm_ooffset_t foff;
2276 foff = bp->b_offset;
2277 KASSERT(bp->b_offset != NOOFFSET,
2278 ("vfs_clean_pages: no buffer offset"));
2279 for (i = 0; i < bp->b_npages; i++) {
2280 vm_page_t m = bp->b_pages[i];
2281 vfs_page_set_valid(bp, foff, i, m);
2282 foff = (foff + PAGE_SIZE) & ~PAGE_MASK;
2283 }
2284 }
2285 }
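
/*
 * Hedged sketch of the delayed-write use: the buffer is marked
 * B_DELWRI and its pages are declared clean, since the data will
 * reach the disk later without further VM involvement.  bdwrite()
 * is the usual caller; the exact sequence here is illustrative.
 */
#if 0
	bdirty(bp);			/* mark B_DELWRI */
	vfs_clean_pages(bp);
	bqrelse(bp);
#endif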
2286
2287 void
2288 vfs_bio_clrbuf(struct buf *bp) {
2289 int i, mask = 0;
2290 caddr_t sa, ea;
2291 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
2292 if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
2293 (bp->b_offset & PAGE_MASK) == 0) {
2294 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
2295 if (((bp->b_pages[0]->flags & PG_ZERO) == 0) &&
2296 ((bp->b_pages[0]->valid & mask) != mask)) {
2297 bzero(bp->b_data, bp->b_bufsize);
2298 }
2299 bp->b_pages[0]->valid |= mask;
2300 bp->b_resid = 0;
2301 return;
2302 }
2303 ea = sa = bp->b_data;
2304 for (i = 0; i < bp->b_npages; i++, sa = ea) {
2305 int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE;
2306 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
2307 ea = (caddr_t)ulmin((u_long)ea,
2308 (u_long)bp->b_data + bp->b_bufsize);
2309 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
2310 if ((bp->b_pages[i]->valid & mask) == mask)
2311 continue;
2312 if ((bp->b_pages[i]->valid & mask) == 0) {
2313 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
2314 bzero(sa, ea - sa);
2315 }
2316 } else {
2317 for (; sa < ea; sa += DEV_BSIZE, j++) {
2318 if (((bp->b_pages[i]->flags & PG_ZERO) == 0) &&
2319 (bp->b_pages[i]->valid & (1<<j)) == 0)
2320 bzero(sa, DEV_BSIZE);
2321 }
2322 }
2323 bp->b_pages[i]->valid |= mask;
2324 vm_page_flag_clear(bp->b_pages[i], PG_ZERO);
2325 }
2326 bp->b_resid = 0;
2327 } else {
2328 clrbuf(bp);
2329 }
2330 }
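
/*
 * Worked example of the mask arithmetic above (DEV_BSIZE == 512): a
 * 3072-byte buffer starting at a page boundary gets
 * mask = (1 << 6) - 1 = 0x3f, one valid bit per 512-byte block the
 * buffer actually covers, so only those blocks are zeroed and marked
 * valid.
 */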
2331
2332 /*
2333 * vm_hold_load_pages() and vm_hold_free_pages() move pages into and
2334 * out of a buffer's address space. The pages are anonymous and are
2335 * not associated with a file object.
2336 */
2337 void
2338 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2339 {
2340 vm_offset_t pg;
2341 vm_page_t p;
2342 int index;
2343
2344 to = round_page(to);
2345 from = round_page(from);
2346 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2347
2348 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2349
2350 tryagain:
2351
2352 p = vm_page_alloc(kernel_object,
2353 ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
2354 VM_ALLOC_NORMAL);
2355 if (!p) {
2356 vm_pageout_deficit += (to - from) >> PAGE_SHIFT;
2357 VM_WAIT;
2358 goto tryagain;
2359 }
2360 vm_page_wire(p);
2361 p->valid = VM_PAGE_BITS_ALL;
2362 vm_page_flag_clear(p, PG_ZERO);
2363 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
2364 bp->b_pages[index] = p;
2365 vm_page_wakeup(p);
2366 }
2367 bp->b_npages = index;
2368 }
2369
2370 void
2371 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
2372 {
2373 vm_offset_t pg;
2374 vm_page_t p;
2375 int index, newnpages;
2376
2377 from = round_page(from);
2378 to = round_page(to);
2379 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
2380
2381 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
2382 p = bp->b_pages[index];
2383 if (p && (index < bp->b_npages)) {
2384 #if !defined(MAX_PERF)
2385 if (p->busy) {
2386 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
2387 bp->b_blkno, bp->b_lblkno);
2388 }
2389 #endif
2390 bp->b_pages[index] = NULL;
2391 pmap_kremove(pg);
2392 vm_page_busy(p);
2393 vm_page_unwire(p, 0);
2394 vm_page_free(p);
2395 }
2396 }
2397 bp->b_npages = newnpages;
2398 }
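
/*
 * Hedged sketch of how the two routines above are used when resizing
 * a non-VMIO buffer, mirroring the !B_VMIO path of allocbuf(); the
 * surrounding bookkeeping is omitted and newbsize is illustrative:
 */
#if 0
	if (newbsize > bp->b_bufsize)
		vm_hold_load_pages(bp,
		    (vm_offset_t)bp->b_data + bp->b_bufsize,
		    (vm_offset_t)bp->b_data + newbsize);
	else
		vm_hold_free_pages(bp,
		    (vm_offset_t)bp->b_data + newbsize,
		    (vm_offset_t)bp->b_data + bp->b_bufsize);
#endif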
2399
2400
2401 #include "opt_ddb.h"
2402 #ifdef DDB
2403 #include <ddb/ddb.h>
2404
2405 DB_SHOW_COMMAND(buffer, db_show_buffer)
2406 {
2407 /* get args */
2408 struct buf *bp = (struct buf *)addr;
2409
2410 if (!have_addr) {
2411 db_printf("usage: show buffer <addr>\n");
2412 return;
2413 }
2414
2415 db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc,
2416 (u_int)bp->b_flags, PRINT_BUF_FLAGS);
2417 db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, "
2418 "b_resid = %ld\nb_dev = 0x%x, b_data = %p, "
2419 "b_blkno = %d, b_pblkno = %d\n",
2420 bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
2421 bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno);
2422 if (bp->b_npages) {
2423 int i;
2424 db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
2425 for (i = 0; i < bp->b_npages; i++) {
2426 vm_page_t m;
2427 m = bp->b_pages[i];
2428 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
2429 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
2430 if ((i + 1) < bp->b_npages)
2431 db_printf(",");
2432 }
2433 db_printf("\n");
2434 }
2435 }
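
/*
 * Example invocation from the DDB prompt (the address is whatever
 * struct buf pointer is being examined; the output fields are those
 * printed above):
 *
 *	db> show buffer 0xc2f4b000
 */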
2436 #endif /* DDB */