FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_bio.c
1 /*
2 * Copyright (c) 1994 John S. Dyson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice immediately at the beginning of the file, without modification,
10 * this list of conditions, and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Absolutely no warranty of function or purpose is made by the author
15 * John S. Dyson.
16 * 4. This work was done expressly for inclusion into FreeBSD. Other use
17 * is allowed if this notation is included.
18 * 5. Modifications may be freely made to this file if the above conditions
19 * are met.
20 *
21 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.104.2.15 1999/09/05 08:15:37 peter Exp $
22 */
23
24 /*
25 * this file contains a new buffer I/O scheme implementing a coherent
26 * VM object and buffer cache scheme. Pains have been taken to make
27 * sure that the performance degradation associated with schemes such
28 * as this is not realized.
29 *
30 * Author: John S. Dyson
31 * Significant help during the development and debugging phases
32 * was provided by David Greenman, also of the FreeBSD core team.
33 */
34
35 #include "opt_bounce.h"
36
37 #define VMIO
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/sysproto.h>
41 #include <sys/kernel.h>
42 #include <sys/sysctl.h>
43 #include <sys/proc.h>
44 #include <sys/vnode.h>
45 #include <sys/vmmeter.h>
46 #include <vm/vm.h>
47 #include <vm/vm_param.h>
48 #include <vm/vm_prot.h>
49 #include <vm/vm_kern.h>
50 #include <vm/vm_pageout.h>
51 #include <vm/vm_page.h>
52 #include <vm/vm_object.h>
53 #include <vm/vm_extern.h>
54 #include <vm/lock.h>
55 #include <vm/vm_map.h>
56 #include <sys/buf.h>
57 #include <sys/mount.h>
58 #include <sys/malloc.h>
59 #include <sys/resourcevar.h>
60 #include <sys/proc.h>
61
62 #include <miscfs/specfs/specdev.h>
63
64 static void vfs_update __P((void));
65 static struct proc *updateproc;
66 static struct kproc_desc up_kp = {
67 "update",
68 vfs_update,
69 &updateproc
70 };
71 SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
72
73 struct buf *buf; /* buffer header pool */
74 struct swqueue bswlist;
75
76 int count_lock_queue __P((void));
77 static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
78 vm_offset_t to);
79 static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
80 vm_offset_t to);
81 static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff,
82 vm_offset_t off, vm_offset_t size,
83 vm_page_t m);
84 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off,
85 int pageno, vm_page_t m);
86 static void vfs_clean_pages(struct buf * bp);
87 static void vfs_setdirty(struct buf *bp);
88 static void vfs_vmio_release(struct buf *bp);
89
90 int needsbuffer;
91
92 /*
93 * Internal update daemon, process 3
94 * The variable vfs_update_wakeup allows for internal syncs.
95 */
96 int vfs_update_wakeup;
97
98
99 /*
100 * buffers base kva
101 */
102
103 /*
104 * bogus page -- for I/O to/from partially complete buffers
105 * this is a temporary solution to the problem, but it is not
106 * really that bad. it would be better to split the buffer
107 * for input in the case of buffers already partially in memory,
108 * but the code is intricate enough already.
109 */
110 vm_page_t bogus_page;
111 static vm_offset_t bogus_offset;
112
113 static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
114 bufmallocspace, maxbufmallocspace;
115
116 static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
117 static struct bqueues bufqueues[BUFFER_QUEUES];
118
119 extern int vm_swap_size;
120
121 #define BUF_MAXUSE 16
122
123 /*
124 * Initialize buffer headers and related structures.
125 */
126 void
127 bufinit()
128 {
129 struct buf *bp;
130 int i;
131
132 TAILQ_INIT(&bswlist);
133 LIST_INIT(&invalhash);
134
135 /* first, make a null hash table */
136 for (i = 0; i < BUFHSZ; i++)
137 LIST_INIT(&bufhashtbl[i]);
138
139 /* next, make a null set of free lists */
140 for (i = 0; i < BUFFER_QUEUES; i++)
141 TAILQ_INIT(&bufqueues[i]);
142
143 /* finally, initialize each buffer header and stick on empty q */
144 for (i = 0; i < nbuf; i++) {
145 bp = &buf[i];
146 bzero(bp, sizeof *bp);
147 bp->b_flags = B_INVAL; /* we're just an empty header */
148 bp->b_dev = NODEV;
149 bp->b_rcred = NOCRED;
150 bp->b_wcred = NOCRED;
151 bp->b_qindex = QUEUE_EMPTY;
152 bp->b_vnbufs.le_next = NOLIST;
153 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
154 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
155 }
156 /*
157 * maxbufspace is currently calculated to support all filesystem blocks
158 * to be 8K. If you happen to use a 16K filesystem, the size of the buffer
159 * cache is still the same as it would be for 8K filesystems. This
160 * keeps the size of the buffer cache "in check" for big block filesystems.
161 */
162 maxbufspace = (nbuf + 8) * DFLTBSIZE;
163 /*
164 * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed
165 */
166 maxvmiobufspace = 2 * maxbufspace / 3;
167 /*
168 * Limit the amount of malloc memory since it is wired permanently into
169 * the kernel space. Even though this is accounted for in the buffer
170 * allocation, we don't want the malloced region to grow uncontrolled.
171 * The malloc scheme improves memory utilization significantly on average
172 * (small) directories.
173 */
174 maxbufmallocspace = maxbufspace / 20;
175
176 bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
177 bogus_page = vm_page_alloc(kernel_object,
178 ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
179 VM_ALLOC_NORMAL);
180
181 }
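/*
 * Illustrative sizing (hypothetical values, and assuming the 8K
 * DFLTBSIZE implied by the comment above): with nbuf = 1024 the
 * limits work out to
 *	maxbufspace       = (1024 + 8) * 8192 = 8454144 bytes (~8.1MB)
 *	maxvmiobufspace   = 2 * 8454144 / 3   = 5636096 bytes (~5.4MB)
 *	maxbufmallocspace = 8454144 / 20      =  422707 bytes (~413KB)
 */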
182
183 /*
184 * Free the kva allocation for a buffer
185 * Must be called only at splbio or higher,
186 * as this is the only locking for buffer_map.
187 */
188 static void
189 bfreekva(struct buf * bp)
190 {
191 if (bp->b_kvasize == 0)
192 return;
193
194 vm_map_delete(buffer_map,
195 (vm_offset_t) bp->b_kvabase,
196 (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
197
198 bp->b_kvasize = 0;
199
200 }
201
202 /*
203 * remove the buffer from the appropriate free list
204 */
205 void
206 bremfree(struct buf * bp)
207 {
208 int s = splbio();
209
210 if (bp->b_qindex != QUEUE_NONE) {
211 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
212 bp->b_qindex = QUEUE_NONE;
213 } else {
214 panic("bremfree: removing a buffer when not on a queue");
215 }
216 splx(s);
217 }
218
219 /*
220 * Get a buffer with the specified data. Look in the cache first.
221 */
222 int
223 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
224 struct buf ** bpp)
225 {
226 struct buf *bp;
227
228 bp = getblk(vp, blkno, size, 0, 0);
229 *bpp = bp;
230
231 /* if not found in cache, do some I/O */
232 if ((bp->b_flags & B_CACHE) == 0) {
233 if (curproc != NULL)
234 curproc->p_stats->p_ru.ru_inblock++;
235 bp->b_flags |= B_READ;
236 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
237 if (bp->b_rcred == NOCRED) {
238 if (cred != NOCRED)
239 crhold(cred);
240 bp->b_rcred = cred;
241 }
242 vfs_busy_pages(bp, 0);
243 VOP_STRATEGY(bp);
244 return (biowait(bp));
245 }
246 return (0);
247 }
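/*
 * Illustrative sketch of the usual consumer pattern (the vnode, block
 * number and size are hypothetical; not compiled):
 */
#if 0
	struct buf *bp;
	int error;

	error = bread(vp, lbn, bsize, NOCRED, &bp);	/* cache hit or I/O */
	if (error) {
		brelse(bp);		/* a buffer is returned even on error */
		return (error);
	}
	/* ... consume bsize bytes at bp->b_data ... */
	bqrelse(bp);			/* contents remain cached for reuse */
#endif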
248
249 /*
250 * Operates like bread, but also starts asynchronous I/O on
251 * read-ahead blocks.
252 */
253 int
254 breadn(struct vnode * vp, daddr_t blkno, int size,
255 daddr_t * rablkno, int *rabsize,
256 int cnt, struct ucred * cred, struct buf ** bpp)
257 {
258 struct buf *bp, *rabp;
259 int i;
260 int rv = 0, readwait = 0;
261
262 *bpp = bp = getblk(vp, blkno, size, 0, 0);
263
264 /* if not found in cache, do some I/O */
265 if ((bp->b_flags & B_CACHE) == 0) {
266 if (curproc != NULL)
267 curproc->p_stats->p_ru.ru_inblock++;
268 bp->b_flags |= B_READ;
269 bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
270 if (bp->b_rcred == NOCRED) {
271 if (cred != NOCRED)
272 crhold(cred);
273 bp->b_rcred = cred;
274 }
275 vfs_busy_pages(bp, 0);
276 VOP_STRATEGY(bp);
277 ++readwait;
278 }
279 for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
280 if (inmem(vp, *rablkno))
281 continue;
282 rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
283
284 if ((rabp->b_flags & B_CACHE) == 0) {
285 if (curproc != NULL)
286 curproc->p_stats->p_ru.ru_inblock++;
287 rabp->b_flags |= B_READ | B_ASYNC;
288 rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
289 if (rabp->b_rcred == NOCRED) {
290 if (cred != NOCRED)
291 crhold(cred);
292 rabp->b_rcred = cred;
293 }
294 vfs_busy_pages(rabp, 0);
295 VOP_STRATEGY(rabp);
296 } else {
297 brelse(rabp);
298 }
299 }
300
301 if (readwait) {
302 rv = biowait(bp);
303 }
304 return (rv);
305 }
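/*
 * Sketch of a read with two read-ahead blocks (hypothetical values).
 * Only the buffer for blkno is returned; the read-ahead buffers are
 * issued B_ASYNC and released by biodone() when they complete:
 */
#if 0
	daddr_t rablks[2];
	int rasizes[2];

	rablks[0] = lbn + 1; rasizes[0] = bsize;
	rablks[1] = lbn + 2; rasizes[1] = bsize;
	error = breadn(vp, lbn, bsize, rablks, rasizes, 2, NOCRED, &bp);
#endif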
306
307 /*
308 * Write, release buffer on completion. (Done by iodone
309 * if async.)
310 */
311 int
312 bwrite(struct buf * bp)
313 {
314 int oldflags = bp->b_flags;
315
316 if (bp->b_flags & B_INVAL) {
317 brelse(bp);
318 return (0);
319 }
320 if (!(bp->b_flags & B_BUSY))
321 panic("bwrite: buffer is not busy???");
322
323 bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
324 bp->b_flags |= B_WRITEINPROG;
325
326 if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
327 reassignbuf(bp, bp->b_vp);
328 }
329
330 bp->b_vp->v_numoutput++;
331 vfs_busy_pages(bp, 1);
332 if (curproc != NULL)
333 curproc->p_stats->p_ru.ru_oublock++;
334 VOP_STRATEGY(bp);
335
336 if ((oldflags & B_ASYNC) == 0) {
337 int rtval = biowait(bp);
338
339 if (oldflags & B_DELWRI) {
340 reassignbuf(bp, bp->b_vp);
341 }
342 brelse(bp);
343 return (rtval);
344 }
345 return (0);
346 }
347
348 int
349 vn_bwrite(ap)
350 struct vop_bwrite_args *ap;
351 {
352 return (bwrite(ap->a_bp));
353 }
354
355 /*
356 * Delayed write. (Buffer is marked dirty).
357 */
358 void
359 bdwrite(struct buf * bp)
360 {
361
362 if ((bp->b_flags & B_BUSY) == 0) {
363 panic("bdwrite: buffer is not busy");
364 }
365 if (bp->b_flags & B_INVAL) {
366 brelse(bp);
367 return;
368 }
369 if (bp->b_flags & B_TAPE) {
370 bawrite(bp);
371 return;
372 }
373 bp->b_flags &= ~(B_READ|B_RELBUF);
374 if ((bp->b_flags & B_DELWRI) == 0) {
375 bp->b_flags |= B_DONE | B_DELWRI;
376 reassignbuf(bp, bp->b_vp);
377 }
378
379 /*
380 * This bmap keeps the system from needing to do the bmap later,
381 * perhaps when the system is attempting to do a sync. Since it
382 * is likely that the indirect block -- or whatever other datastructure
383 * that the filesystem needs is still in memory now, it is a good
384 * thing to do this. Note also, that if the pageout daemon is
385 * requesting a sync -- there might not be enough memory to do
386 * the bmap then... So, this is important to do.
387 */
388 if( bp->b_lblkno == bp->b_blkno) {
389 VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
390 }
391
392 /*
393 * Set the *dirty* buffer range based upon the VM system dirty pages.
394 */
395 vfs_setdirty(bp);
396
397 /*
398 * We need to do this here to satisfy the vnode_pager and the
399 * pageout daemon, so that it thinks that the pages have been
400 * "cleaned". Note that since the pages are in a delayed write
401 * buffer -- the VFS layer "will" see that the pages get written
402 * out on the next sync, or perhaps the cluster will be completed.
403 */
404 vfs_clean_pages(bp);
405 bqrelse(bp);
406 return;
407 }
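/*
 * Sketch contrasting the write flavors on a buffer obtained from
 * getblk() (hypothetical predicates, error handling omitted):
 */
#if 0
	bp = getblk(vp, lbn, bsize, 0, 0);
	/* ... modify bp->b_data ... */
	if (must_reach_disk_now)
		error = bwrite(bp);	/* synchronous: waits for completion */
	else if (should_start_now)
		bawrite(bp);		/* asynchronous: starts I/O, no wait */
	else
		bdwrite(bp);		/* delayed: flushed later by update */
#endif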
408
409 /*
410 * Asynchronous write.
411 * Start output on a buffer, but do not wait for it to complete.
412 * The buffer is released when the output completes.
413 */
414 void
415 bawrite(struct buf * bp)
416 {
417 bp->b_flags |= B_ASYNC;
418 (void) VOP_BWRITE(bp);
419 }
420
421 /*
422 * Ordered write.
423 * Start output on a buffer, but only wait for it to complete if the
424 * output device cannot guarantee ordering in some other way. Devices
425 * that can perform asynchronous ordered writes will set the B_ASYNC
426 * flag in their strategy routine.
427 * The buffer is released when the output completes.
428 */
429 int
430 bowrite(struct buf * bp)
431 {
432 bp->b_flags |= B_ORDERED;
433 return (VOP_BWRITE(bp));
434 }
435
436 /*
437 * Release a buffer.
438 */
439 void
440 brelse(struct buf * bp)
441 {
442 int s;
443
444 if (bp->b_flags & B_CLUSTER) {
445 relpbuf(bp);
446 return;
447 }
448 /* anyone need a "free" block? */
449 s = splbio();
450
451 /* anyone need this block? */
452 if (bp->b_flags & B_WANTED) {
453 bp->b_flags &= ~(B_WANTED | B_AGE);
454 wakeup(bp);
455 }
456
457 if (bp->b_flags & B_LOCKED)
458 bp->b_flags &= ~B_ERROR;
459
460 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
461 (bp->b_bufsize <= 0)) {
462 bp->b_flags |= B_INVAL;
463 bp->b_flags &= ~(B_DELWRI | B_CACHE);
464 if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
465 if (bp->b_bufsize)
466 allocbuf(bp, 0);
467 brelvp(bp);
468 }
469 }
470
471 /*
472 * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release()
473 * is called with B_DELWRI set, the underlying pages may wind up
474 * getting freed causing a previous write (bdwrite()) to get 'lost'
475 * because pages associated with a B_DELWRI bp are marked clean.
476 *
477 * We still allow the B_INVAL case to call vfs_vmio_release(), even
478 * if B_DELWRI is set.
479 */
480
481 if (bp->b_flags & B_DELWRI)
482 bp->b_flags &= ~B_RELBUF;
483
484 /*
485 * VMIO buffer rundown. It is not strictly necessary to keep a VMIO buffer
486 * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
487 * but the VM object is kept around. The B_NOCACHE flag is used to
488 * invalidate the pages in the VM object.
489 *
490 * If the buffer is a partially filled NFS buffer, keep it
491 * since invalidating it now will lose information. The valid
492 * flags in the vm_pages have only DEV_BSIZE resolution but
493 * the b_validoff, b_validend fields have byte resolution.
494 * This can avoid unnecessary re-reads of the buffer.
495 */
496 if ((bp->b_flags & B_VMIO)
497 && (bp->b_vp->v_tag != VT_NFS
498 || bp->b_vp->v_type == VBLK
499 || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR))
500 || bp->b_validend == 0
501 || (bp->b_validoff == 0
502 && bp->b_validend == bp->b_bufsize))) {
503 vm_ooffset_t foff;
504 vm_object_t obj;
505 int i, resid;
506 vm_page_t m;
507 struct vnode *vp;
508 int iototal = bp->b_bufsize;
509
510 vp = bp->b_vp;
511 if (!vp)
512 panic("brelse: missing vp");
513
514 if (bp->b_npages) {
515 vm_pindex_t poff;
516 obj = (vm_object_t) vp->v_object;
517 if (vp->v_type == VBLK)
518 foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
519 else
520 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
521 poff = OFF_TO_IDX(foff);
522 for (i = 0; i < bp->b_npages; i++) {
523 m = bp->b_pages[i];
524 if (m == bogus_page) {
525 m = vm_page_lookup(obj, poff + i);
526 if (!m) {
527 panic("brelse: page missing\n");
528 }
529 bp->b_pages[i] = m;
530 pmap_qenter(trunc_page(bp->b_data),
531 bp->b_pages, bp->b_npages);
532 }
533 resid = IDX_TO_OFF(m->pindex+1) - foff;
534 if (resid > iototal)
535 resid = iototal;
536 if (resid > 0) {
537 /*
538 * Don't invalidate the page if the local machine has already
539 * modified it. This is the lesser of two evils, and should
540 * be fixed.
541 */
542 if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
543 vm_page_test_dirty(m);
544 if (m->dirty == 0) {
545 vm_page_set_invalid(m, (vm_offset_t) foff, resid);
546 if (m->valid == 0)
547 vm_page_protect(m, VM_PROT_NONE);
548 }
549 }
550 if (resid >= PAGE_SIZE) {
551 if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
552 bp->b_flags |= B_INVAL;
553 }
554 } else {
555 if (!vm_page_is_valid(m,
556 (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
557 bp->b_flags |= B_INVAL;
558 }
559 }
560 }
561 foff += resid;
562 iototal -= resid;
563 }
564 }
565 if (bp->b_flags & (B_INVAL | B_RELBUF))
566 vfs_vmio_release(bp);
567 }
568 if (bp->b_qindex != QUEUE_NONE)
569 panic("brelse: free buffer onto another queue???");
570
571 /* enqueue */
572 /* buffers with no memory */
573 if (bp->b_bufsize == 0) {
574 bp->b_qindex = QUEUE_EMPTY;
575 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
576 LIST_REMOVE(bp, b_hash);
577 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
578 bp->b_dev = NODEV;
579 /*
580 * Get rid of the kva allocation *now*
581 */
582 bfreekva(bp);
583 if (needsbuffer) {
584 wakeup(&needsbuffer);
585 needsbuffer=0;
586 }
587 /* buffers with junk contents */
588 } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
589 bp->b_qindex = QUEUE_AGE;
590 TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
591 LIST_REMOVE(bp, b_hash);
592 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
593 bp->b_dev = NODEV;
594 if (needsbuffer) {
595 wakeup(&needsbuffer);
596 needsbuffer=0;
597 }
598 /* buffers that are locked */
599 } else if (bp->b_flags & B_LOCKED) {
600 bp->b_qindex = QUEUE_LOCKED;
601 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
602 /* buffers with stale but valid contents */
603 } else if (bp->b_flags & B_AGE) {
604 bp->b_qindex = QUEUE_AGE;
605 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
606 if (needsbuffer) {
607 wakeup(&needsbuffer);
608 needsbuffer=0;
609 }
610 /* buffers with valid and quite potentially reusable contents */
611 } else {
612 bp->b_qindex = QUEUE_LRU;
613 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
614 if (needsbuffer) {
615 wakeup(&needsbuffer);
616 needsbuffer=0;
617 }
618 }
619
620 /* unlock */
621 bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
622 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
623 splx(s);
624 }
625
626 /*
627 * Release a buffer.
628 */
629 void
630 bqrelse(struct buf * bp)
631 {
632 int s;
633
634 s = splbio();
635
636
637 /* anyone need this block? */
638 if (bp->b_flags & B_WANTED) {
639 bp->b_flags &= ~(B_WANTED | B_AGE);
640 wakeup(bp);
641 }
642
643 if (bp->b_qindex != QUEUE_NONE)
644 panic("bqrelse: free buffer onto another queue???");
645
646 if (bp->b_flags & B_LOCKED) {
647 bp->b_flags &= ~B_ERROR;
648 bp->b_qindex = QUEUE_LOCKED;
649 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
650 /* buffers with stale but valid contents */
651 } else {
652 bp->b_qindex = QUEUE_LRU;
653 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
654 if (needsbuffer) {
655 wakeup(&needsbuffer);
656 needsbuffer=0;
657 }
658 }
659
660 /* unlock */
661 bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
662 B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
663 splx(s);
664 }
665
666 static void
667 vfs_vmio_release(bp)
668 struct buf *bp;
669 {
670 int i;
671 vm_page_t m;
672
673 for (i = 0; i < bp->b_npages; i++) {
674 m = bp->b_pages[i];
675 bp->b_pages[i] = NULL;
676 vm_page_unwire(m);
677 /*
678 * We don't mess with busy pages, it is
679 * the responsibility of the process that
680 * busied the pages to deal with them.
681 */
682 if ((m->flags & PG_BUSY) || (m->busy != 0))
683 continue;
684
685 if (m->wire_count == 0) {
686
687 if (m->flags & PG_WANTED) {
688 m->flags &= ~PG_WANTED;
689 wakeup(m);
690 }
691
692 /*
693 * If this is an async free -- we cannot place
694 * pages onto the cache queue, so our policy for
695 * such buffers is to avoid the cache queue, and
696 * only modify the active queue or free queue.
697 */
698 if ((bp->b_flags & B_ASYNC) == 0) {
699
700 /*
701 * In the case of sync buffer frees, we can do pretty much
702 * anything to any of the memory queues. Specifically,
703 * the cache queue is free to be modified.
704 */
705 if (m->valid) {
706 if(m->dirty == 0)
707 vm_page_test_dirty(m);
708 /*
709 * this keeps pressure off of the process memory
710 */
711 if ((vm_swap_size == 0) ||
712 (cnt.v_free_count < cnt.v_free_min)) {
713 if ((m->dirty == 0) &&
714 (m->hold_count == 0))
715 vm_page_cache(m);
716 else
717 vm_page_deactivate(m);
718 }
719 } else if (m->hold_count == 0) {
720 vm_page_protect(m, VM_PROT_NONE);
721 vm_page_free(m);
722 }
723 } else {
724 /*
725 * If async, then at least we clear the
726 * act_count.
727 */
728 m->act_count = 0;
729 }
730 }
731 }
732 bufspace -= bp->b_bufsize;
733 vmiospace -= bp->b_bufsize;
734 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
735 bp->b_npages = 0;
736 bp->b_bufsize = 0;
737 bp->b_flags &= ~B_VMIO;
738 if (bp->b_vp)
739 brelvp(bp);
740 }
741
742 /*
743 * Check to see if a block is currently memory resident.
744 */
745 struct buf *
746 gbincore(struct vnode * vp, daddr_t blkno)
747 {
748 struct buf *bp;
749 struct bufhashhdr *bh;
750
751 bh = BUFHASH(vp, blkno);
752 bp = bh->lh_first;
753
754 /* Search hash chain */
755 while (bp != NULL) {
756 /* hit */
757 if (bp->b_vp == vp && bp->b_lblkno == blkno &&
758 (bp->b_flags & B_INVAL) == 0) {
759 break;
760 }
761 bp = bp->b_hash.le_next;
762 }
763 return (bp);
764 }
765
766 /*
767 * this routine implements clustered async writes for
768 * clearing out B_DELWRI buffers... This is much better
769 * than the old way of writing only one buffer at a time.
770 */
771 int
772 vfs_bio_awrite(struct buf * bp)
773 {
774 int i;
775 daddr_t lblkno = bp->b_lblkno;
776 struct vnode *vp = bp->b_vp;
777 int s;
778 int ncl;
779 struct buf *bpa;
780 int nwritten;
781
782 s = splbio();
783 /*
784 * right now we support clustered writing only to regular files
785 */
786 if ((vp->v_type == VREG) &&
787 (vp->v_mount != 0) && /* Only on nodes that have the size info */
788 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
789 int size;
790 int maxcl;
791
792 size = vp->v_mount->mnt_stat.f_iosize;
793 maxcl = MAXPHYS / size;
794
795 for (i = 1; i < maxcl; i++) {
796 if ((bpa = gbincore(vp, lblkno + i)) &&
797 ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
798 (B_DELWRI | B_CLUSTEROK)) &&
799 (bpa->b_bufsize == size)) {
800 if ((bpa->b_blkno == bpa->b_lblkno) ||
801 (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
802 break;
803 } else {
804 break;
805 }
806 }
807 ncl = i;
808 /*
809 * this is a possible cluster write
810 */
811 if (ncl != 1) {
812 nwritten = cluster_wbuild(vp, size, lblkno, ncl);
813 splx(s);
814 return nwritten;
815 }
816 }
817 bremfree(bp);
818 splx(s);
819 /*
820 * default (old) behavior, writing out only one block
821 */
822 bp->b_flags |= B_BUSY | B_ASYNC;
823 nwritten = bp->b_bufsize;
824 (void) VOP_BWRITE(bp);
825 return nwritten;
826 }
827
828
829 /*
830 * Find a buffer header which is available for use.
831 */
832 static struct buf *
833 getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
834 {
835 struct buf *bp;
836 int nbyteswritten = 0;
837 vm_offset_t addr;
838
839 start:
840 if (bufspace >= maxbufspace)
841 goto trytofreespace;
842
843 /* can we constitute a new buffer? */
844 if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
845 if (bp->b_qindex != QUEUE_EMPTY)
846 panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
847 bp->b_qindex);
848 bp->b_flags |= B_BUSY;
849 bremfree(bp);
850 goto fillbuf;
851 }
852 trytofreespace:
853 /*
854 * We keep the file I/O from hogging metadata I/O.
855 * This is desirable because file data is cached in the
856 * VM/Buffer cache even if a buffer is freed.
857 */
858 if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
859 if (bp->b_qindex != QUEUE_AGE)
860 panic("getnewbuf: inconsistent AGE queue, qindex=%d",
861 bp->b_qindex);
862 } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
863 if (bp->b_qindex != QUEUE_LRU)
864 panic("getnewbuf: inconsistent LRU queue, qindex=%d",
865 bp->b_qindex);
866 }
867 if (!bp) {
868 /* wait for a free buffer of any kind */
869 needsbuffer = 1;
870 do
871 tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf",
872 slptimeo);
873 while (needsbuffer);
874 return (0);
875 }
876
877 #if defined(DIAGNOSTIC)
878 if (bp->b_flags & B_BUSY) {
879 panic("getnewbuf: busy buffer on free list\n");
880 }
881 #endif
882
883 /*
884 * We are fairly aggressive about freeing VMIO buffers, but since
885 * the cached data stays intact in the VM object even without buffer
886 * headers, there is not much loss. We gain by maintaining non-VMIOed metadata in buffers.
887 */
888 if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
889 if ((bp->b_flags & B_VMIO) == 0 ||
890 (vmiospace < maxvmiobufspace)) {
891 --bp->b_usecount;
892 TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
893 if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
894 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
895 goto start;
896 }
897 TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
898 }
899 }
900
901 /* if we are a delayed write, convert to an async write */
902 if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
903 nbyteswritten += vfs_bio_awrite(bp);
904 if (!slpflag && !slptimeo) {
905 return (0);
906 }
907 goto start;
908 }
909
910 if (bp->b_flags & B_WANTED) {
911 bp->b_flags &= ~B_WANTED;
912 wakeup(bp);
913 }
914 bremfree(bp);
915 bp->b_flags |= B_BUSY;
916
917 if (bp->b_flags & B_VMIO) {
918 bp->b_flags &= ~B_ASYNC;
919 vfs_vmio_release(bp);
920 }
921
922 if (bp->b_vp)
923 brelvp(bp);
924
925 fillbuf:
926 /* we are not free, nor do we contain interesting data */
927 if (bp->b_rcred != NOCRED) {
928 crfree(bp->b_rcred);
929 bp->b_rcred = NOCRED;
930 }
931 if (bp->b_wcred != NOCRED) {
932 crfree(bp->b_wcred);
933 bp->b_wcred = NOCRED;
934 }
935
936 LIST_REMOVE(bp, b_hash);
937 LIST_INSERT_HEAD(&invalhash, bp, b_hash);
938 if (bp->b_bufsize) {
939 allocbuf(bp, 0);
940 }
941 bp->b_flags = B_BUSY;
942 bp->b_dev = NODEV;
943 bp->b_vp = NULL;
944 bp->b_blkno = bp->b_lblkno = 0;
945 bp->b_iodone = 0;
946 bp->b_error = 0;
947 bp->b_resid = 0;
948 bp->b_bcount = 0;
949 bp->b_npages = 0;
950 bp->b_dirtyoff = bp->b_dirtyend = 0;
951 bp->b_validoff = bp->b_validend = 0;
952 bp->b_usecount = 4;
953
954 maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
955
956 /*
957 * we assume that buffer_map is not at address 0
958 */
959 addr = 0;
960 if (maxsize != bp->b_kvasize) {
961 bfreekva(bp);
962
963 /*
964 * See if we have buffer kva space
965 */
966 if (vm_map_findspace(buffer_map,
967 vm_map_min(buffer_map), maxsize, &addr)) {
968 bp->b_flags |= B_INVAL;
969 brelse(bp);
970 goto trytofreespace;
971 }
972 }
973
974 /*
975 * See if we are over our buffer space allocation
976 */
977 if (bufspace >= (maxbufspace + nbyteswritten)) {
978 bp->b_flags |= B_INVAL;
979 brelse(bp);
980 goto trytofreespace;
981 }
982
983 /*
984 * create a map entry for the buffer -- in essence
985 * reserving the kva space.
986 */
987 if (addr) {
988 vm_map_insert(buffer_map, NULL, 0,
989 addr, addr + maxsize,
990 VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
991
992 bp->b_kvabase = (caddr_t) addr;
993 bp->b_kvasize = maxsize;
994 }
995 bp->b_data = bp->b_kvabase;
996
997 return (bp);
998 }
999
1000 /*
1001 * Check to see if a block is currently memory resident.
1002 */
1003 struct buf *
1004 incore(struct vnode * vp, daddr_t blkno)
1005 {
1006 struct buf *bp;
1007
1008 int s = splbio();
1009 bp = gbincore(vp, blkno);
1010 splx(s);
1011 return (bp);
1012 }
1013
1014 /*
1015 * Returns true if no I/O is needed to access the
1016 * associated VM object. This is like incore except
1017 * it also hunts around in the VM system for the data.
1018 */
1019
1020 int
1021 inmem(struct vnode * vp, daddr_t blkno)
1022 {
1023 vm_object_t obj;
1024 vm_offset_t toff, tinc;
1025 vm_page_t m;
1026 vm_ooffset_t off;
1027
1028 if (incore(vp, blkno))
1029 return 1;
1030 if (vp->v_mount == NULL)
1031 return 0;
1032 if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
1033 return 0;
1034
1035 obj = vp->v_object;
1036 tinc = PAGE_SIZE;
1037 if (tinc > vp->v_mount->mnt_stat.f_iosize)
1038 tinc = vp->v_mount->mnt_stat.f_iosize;
1039 off = blkno * vp->v_mount->mnt_stat.f_iosize;
1040
1041 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
1042
1043 m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
1044 if (!m)
1045 return 0;
1046 if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
1047 return 0;
1048 }
1049 return 1;
1050 }
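/*
 * Sketch of the read-ahead gate breadn() applies per block, equally
 * usable by other callers (hypothetical values):
 */
#if 0
	if (!inmem(vp, lbn + 1)) {
		/* neither a buffer nor valid VM pages: worth issuing I/O */
	}
#endif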
1051
1052 /*
1053 * now we set the dirty range for the buffer --
1054 * for NFS -- if the file is mapped and pages have
1055 * been written to, let it know. We want the
1056 * entire range of the buffer to be marked dirty if
1057 * any of the pages have been written to for consistency
1058 * with the b_validoff, b_validend set in the nfs write
1059 * code, and used by the nfs read code.
1060 */
1061 static void
1062 vfs_setdirty(struct buf *bp) {
1063 int i;
1064 vm_object_t object;
1065 vm_offset_t boffset, offset;
1066 /*
1067 * We qualify the scan for modified pages on whether the
1068 * object has been flushed yet. The OBJ_WRITEABLE flag
1069 * is not cleared simply by protecting pages off.
1070 */
1071 if ((bp->b_flags & B_VMIO) &&
1072 ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
1073 /*
1074 * test the pages to see if they have been modified directly
1075 * by users through the VM system.
1076 */
1077 for (i = 0; i < bp->b_npages; i++)
1078 vm_page_test_dirty(bp->b_pages[i]);
1079
1080 /*
1081 * scan forwards for the first page modified
1082 */
1083 for (i = 0; i < bp->b_npages; i++) {
1084 if (bp->b_pages[i]->dirty) {
1085 break;
1086 }
1087 }
1088 boffset = (i << PAGE_SHIFT);
1089 if (boffset < bp->b_dirtyoff) {
1090 bp->b_dirtyoff = boffset;
1091 }
1092
1093 /*
1094 * scan backwards for the last page modified
1095 */
1096 for (i = bp->b_npages - 1; i >= 0; --i) {
1097 if (bp->b_pages[i]->dirty) {
1098 break;
1099 }
1100 }
1101 boffset = (i + 1);
1102 offset = boffset + bp->b_pages[0]->pindex;
1103 if (offset >= object->size)
1104 boffset = object->size - bp->b_pages[0]->pindex;
1105 if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
1106 bp->b_dirtyend = (boffset << PAGE_SHIFT);
1107 }
1108 }
1109
1110 /*
1111 * Get a block given a specified block and offset into a file/device.
1112 */
1113 struct buf *
1114 getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
1115 {
1116 struct buf *bp;
1117 int s;
1118 struct bufhashhdr *bh;
1119 int maxsize;
1120
1121 if (vp->v_mount) {
1122 maxsize = vp->v_mount->mnt_stat.f_iosize;
1123 /*
1124 * This happens on mount points.
1125 */
1126 if (maxsize < size)
1127 maxsize = size;
1128 } else {
1129 maxsize = size;
1130 }
1131
1132 if (size > MAXBSIZE)
1133 panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
1134
1135 s = splbio();
1136 loop:
1137 if ((bp = gbincore(vp, blkno))) {
1138 if (bp->b_flags & B_BUSY) {
1139 bp->b_flags |= B_WANTED;
1140 if (bp->b_usecount < BUF_MAXUSE)
1141 ++bp->b_usecount;
1142 if (!tsleep(bp,
1143 (PRIBIO + 1) | slpflag, "getblk", slptimeo))
1144 goto loop;
1145
1146 splx(s);
1147 return (struct buf *) NULL;
1148 }
1149 bp->b_flags |= B_BUSY | B_CACHE;
1150 bremfree(bp);
1151
1152 /*
1153 * check for size inconsistencies (note that they shouldn't happen
1154 * but do when filesystems don't handle the size changes correctly.)
1155 * We are conservative on metadata and don't just extend the buffer
1156 * but write and re-constitute it.
1157 */
1158
1159 if (bp->b_bcount != size) {
1160 if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
1161 allocbuf(bp, size);
1162 } else {
1163 bp->b_flags |= B_NOCACHE;
1164 VOP_BWRITE(bp);
1165 goto loop;
1166 }
1167 }
1168
1169 if (bp->b_usecount < BUF_MAXUSE)
1170 ++bp->b_usecount;
1171 splx(s);
1172 return (bp);
1173 } else {
1174 vm_object_t obj;
1175
1176 if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
1177 if (slpflag || slptimeo) {
1178 splx(s);
1179 return NULL;
1180 }
1181 goto loop;
1182 }
1183
1184 /*
1185 * This code is used to make sure that a buffer is not
1186 * created while the getnewbuf routine is blocked.
1187 * This can be a problem whether the vnode is locked or not.
1188 */
1189 if (gbincore(vp, blkno)) {
1190 bp->b_flags |= B_INVAL;
1191 brelse(bp);
1192 goto loop;
1193 }
1194
1195 /*
1196 * Insert the buffer into the hash, so that it can
1197 * be found by incore.
1198 */
1199 bp->b_blkno = bp->b_lblkno = blkno;
1200 bgetvp(vp, bp);
1201 LIST_REMOVE(bp, b_hash);
1202 bh = BUFHASH(vp, blkno);
1203 LIST_INSERT_HEAD(bh, bp, b_hash);
1204
1205 if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
1206 bp->b_flags |= (B_VMIO | B_CACHE);
1207 #if defined(VFS_BIO_DEBUG)
1208 if (vp->v_type != VREG && vp->v_type != VBLK)
1209 printf("getblk: vmioing file type %d???\n", vp->v_type);
1210 #endif
1211 } else {
1212 bp->b_flags &= ~B_VMIO;
1213 }
1214 splx(s);
1215
1216 allocbuf(bp, size);
1217 #ifdef PC98
1218 /*
1219 * 1024byte/sector support
1220 */
1221 #define B_XXX2 0x8000000
1222 if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
1223 #endif
1224 return (bp);
1225 }
1226 }
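/*
 * Sketch of the allocate-and-overwrite pattern a filesystem block
 * allocator typically uses, where no read is needed because the block
 * will be written in full (hypothetical values):
 */
#if 0
	bp = getblk(vp, lbn, bsize, 0, 0);
	if ((bp->b_flags & B_CACHE) == 0)
		vfs_bio_clrbuf(bp);	/* zero only the invalid portions */
	/* ... fill bp->b_data ... */
	bdwrite(bp);
#endif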
1227
1228 /*
1229 * Get an empty, disassociated buffer of given size.
1230 */
1231 struct buf *
1232 geteblk(int size)
1233 {
1234 struct buf *bp;
1235 int s;
1236
1237 s = splbio();
1238 while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
1239 splx(s);
1240 allocbuf(bp, size);
1241 bp->b_flags |= B_INVAL;
1242 return (bp);
1243 }
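/*
 * Sketch of a scratch buffer: geteblk() hands back an anonymous buffer
 * already marked B_INVAL, so brelse() recycles it immediately:
 */
#if 0
	bp = geteblk(bsize);
	/* ... use bp->b_data as temporary storage ... */
	brelse(bp);
#endif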
1244
1245
1246 /*
1247 * This code constitutes the buffer memory from either anonymous system
1248 * memory (in the case of non-VMIO operations) or from an associated
1249 * VM object (in the case of VMIO operations).
1250 *
1251 * Note that this code is tricky, and has many complications to resolve
1252 * deadlock or inconsistent data situations. Tread lightly!!!
1253 *
1254 * Modify the length of a buffer's underlying buffer storage without
1255 * destroying information (unless, of course the buffer is shrinking).
1256 */
1257 int
1258 allocbuf(struct buf * bp, int size)
1259 {
1260
1261 int s;
1262 int newbsize, mbsize;
1263 int i;
1264
1265 if (!(bp->b_flags & B_BUSY))
1266 panic("allocbuf: buffer not busy");
1267
1268 if (bp->b_kvasize < size)
1269 panic("allocbuf: buffer too small");
1270
1271 if ((bp->b_flags & B_VMIO) == 0) {
1272 caddr_t origbuf;
1273 int origbufsize;
1274 /*
1275 * Just get anonymous memory from the kernel
1276 */
1277 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1278 #if !defined(NO_B_MALLOC)
1279 if (bp->b_flags & B_MALLOC)
1280 newbsize = mbsize;
1281 else
1282 #endif
1283 newbsize = round_page(size);
1284
1285 if (newbsize < bp->b_bufsize) {
1286 #if !defined(NO_B_MALLOC)
1287 /*
1288 * malloced buffers are not shrunk
1289 */
1290 if (bp->b_flags & B_MALLOC) {
1291 if (newbsize) {
1292 bp->b_bcount = size;
1293 } else {
1294 free(bp->b_data, M_BIOBUF);
1295 bufspace -= bp->b_bufsize;
1296 bufmallocspace -= bp->b_bufsize;
1297 bp->b_data = bp->b_kvabase;
1298 bp->b_bufsize = 0;
1299 bp->b_bcount = 0;
1300 bp->b_flags &= ~B_MALLOC;
1301 }
1302 return 1;
1303 }
1304 #endif
1305 vm_hold_free_pages(
1306 bp,
1307 (vm_offset_t) bp->b_data + newbsize,
1308 (vm_offset_t) bp->b_data + bp->b_bufsize);
1309 } else if (newbsize > bp->b_bufsize) {
1310 #if !defined(NO_B_MALLOC)
1311 /*
1312 * We only use malloced memory on the first allocation,
1313 * and revert to page-allocated memory when the buffer grows.
1314 */
1315 if ( (bufmallocspace < maxbufmallocspace) &&
1316 (bp->b_bufsize == 0) &&
1317 (mbsize <= PAGE_SIZE/2)) {
1318
1319 bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
1320 bp->b_bufsize = mbsize;
1321 bp->b_bcount = size;
1322 bp->b_flags |= B_MALLOC;
1323 bufspace += mbsize;
1324 bufmallocspace += mbsize;
1325 return 1;
1326 }
1327 #endif
1328 origbuf = NULL;
1329 origbufsize = 0;
1330 #if !defined(NO_B_MALLOC)
1331 /*
1332 * If the buffer is growing on its other-than-first allocation,
1333 * then we revert to the page-allocation scheme.
1334 */
1335 if (bp->b_flags & B_MALLOC) {
1336 origbuf = bp->b_data;
1337 origbufsize = bp->b_bufsize;
1338 bp->b_data = bp->b_kvabase;
1339 bufspace -= bp->b_bufsize;
1340 bufmallocspace -= bp->b_bufsize;
1341 bp->b_bufsize = 0;
1342 bp->b_flags &= ~B_MALLOC;
1343 newbsize = round_page(newbsize);
1344 }
1345 #endif
1346 vm_hold_load_pages(
1347 bp,
1348 (vm_offset_t) bp->b_data + bp->b_bufsize,
1349 (vm_offset_t) bp->b_data + newbsize);
1350 #if !defined(NO_B_MALLOC)
1351 if (origbuf) {
1352 bcopy(origbuf, bp->b_data, origbufsize);
1353 free(origbuf, M_BIOBUF);
1354 }
1355 #endif
1356 }
1357 } else {
1358 vm_page_t m;
1359 int desiredpages;
1360
1361 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
1362 desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
1363
1364 #if !defined(NO_B_MALLOC)
1365 if (bp->b_flags & B_MALLOC)
1366 panic("allocbuf: VMIO buffer can't be malloced");
1367 #endif
1368
1369 if (newbsize < bp->b_bufsize) {
1370 if (desiredpages < bp->b_npages) {
1371 for (i = desiredpages; i < bp->b_npages; i++) {
1372 /*
1373 * the page is not freed here -- it
1374 * is the responsibility of vnode_pager_setsize
1375 */
1376 m = bp->b_pages[i];
1377 #if defined(DIAGNOSTIC)
1378 if (m == bogus_page)
1379 panic("allocbuf: bogus page found");
1380 #endif
1381 s = splvm();
1382 while ((m->flags & PG_BUSY) || (m->busy != 0)) {
1383 m->flags |= PG_WANTED;
1384 tsleep(m, PVM, "biodep", 0);
1385 }
1386 splx(s);
1387
1388 bp->b_pages[i] = NULL;
1389 vm_page_unwire(m);
1390 }
1391 pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
1392 (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
1393 bp->b_npages = desiredpages;
1394 }
1395 } else if (newbsize > bp->b_bufsize) {
1396 vm_object_t obj;
1397 vm_offset_t tinc, toff;
1398 vm_ooffset_t off;
1399 vm_pindex_t objoff;
1400 int pageindex, curbpnpages;
1401 struct vnode *vp;
1402 int bsize;
1403
1404 vp = bp->b_vp;
1405
1406 if (vp->v_type == VBLK)
1407 bsize = DEV_BSIZE;
1408 else
1409 bsize = vp->v_mount->mnt_stat.f_iosize;
1410
1411 if (bp->b_npages < desiredpages) {
1412 obj = vp->v_object;
1413 tinc = PAGE_SIZE;
1414 if (tinc > bsize)
1415 tinc = bsize;
1416 off = (vm_ooffset_t) bp->b_lblkno * bsize;
1417 curbpnpages = bp->b_npages;
1418 doretry:
1419 bp->b_flags |= B_CACHE;
1420 bp->b_validoff = bp->b_validend = 0;
1421 for (toff = 0; toff < newbsize; toff += tinc) {
1422 int bytesinpage;
1423
1424 pageindex = toff >> PAGE_SHIFT;
1425 objoff = OFF_TO_IDX(off + toff);
1426 if (pageindex < curbpnpages) {
1427
1428 m = bp->b_pages[pageindex];
1429 #ifdef VFS_BIO_DIAG
1430 if (m->pindex != objoff)
1431 panic("allocbuf: page changed offset??!!!?");
1432 #endif
1433 bytesinpage = tinc;
1434 if (tinc > (newbsize - toff))
1435 bytesinpage = newbsize - toff;
1436 if (bp->b_flags & B_CACHE)
1437 vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1438 continue;
1439 }
1440 m = vm_page_lookup(obj, objoff);
1441 if (!m) {
1442 m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
1443 if (!m) {
1444 VM_WAIT;
1445 goto doretry;
1446 }
1447 /*
1448 * Normally it is unwise to clear PG_BUSY without
1449 * PAGE_WAKEUP -- but it is okay here, as there is
1450 * no chance for blocking between here and vm_page_alloc
1451 */
1452 m->flags &= ~PG_BUSY;
1453 vm_page_wire(m);
1454 bp->b_flags &= ~B_CACHE;
1455 } else if (m->flags & PG_BUSY) {
1456 s = splvm();
1457 if (m->flags & PG_BUSY) {
1458 m->flags |= PG_WANTED;
1459 tsleep(m, PVM, "pgtblk", 0);
1460 }
1461 splx(s);
1462 goto doretry;
1463 } else {
1464 if ((curproc != pageproc) &&
1465 ((m->queue - m->pc) == PQ_CACHE) &&
1466 ((cnt.v_free_count + cnt.v_cache_count) <
1467 (cnt.v_free_min + cnt.v_cache_min))) {
1468 pagedaemon_wakeup();
1469 }
1470 bytesinpage = tinc;
1471 if (tinc > (newbsize - toff))
1472 bytesinpage = newbsize - toff;
1473 if (bp->b_flags & B_CACHE)
1474 vfs_buf_set_valid(bp, off, toff, bytesinpage, m);
1475 vm_page_wire(m);
1476 }
1477 bp->b_pages[pageindex] = m;
1478 curbpnpages = pageindex + 1;
1479 }
1480 if (vp->v_tag == VT_NFS &&
1481 vp->v_type != VBLK &&
1482 bp->b_validend == 0)
1483 bp->b_flags &= ~B_CACHE;
1484 bp->b_data = (caddr_t) trunc_page(bp->b_data);
1485 bp->b_npages = curbpnpages;
1486 pmap_qenter((vm_offset_t) bp->b_data,
1487 bp->b_pages, bp->b_npages);
1488 ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
1489 }
1490 }
1491 }
1492 if (bp->b_flags & B_VMIO)
1493 vmiospace += bp->b_bufsize;
1494 bufspace += (newbsize - bp->b_bufsize);
1495 bp->b_bufsize = newbsize;
1496 bp->b_bcount = size;
1497 return 1;
1498 }
1499
1500 /*
1501 * Wait for buffer I/O completion, returning error status.
1502 */
1503 int
1504 biowait(register struct buf * bp)
1505 {
1506 int s;
1507
1508 s = splbio();
1509 while ((bp->b_flags & B_DONE) == 0)
1510 tsleep(bp, PRIBIO, "biowait", 0);
1511 splx(s);
1512 if (bp->b_flags & B_EINTR) {
1513 bp->b_flags &= ~B_EINTR;
1514 return (EINTR);
1515 }
1516 if (bp->b_flags & B_ERROR) {
1517 return (bp->b_error ? bp->b_error : EIO);
1518 } else {
1519 return (0);
1520 }
1521 }
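/*
 * Sketch of the low-level dispatch that bread()/bwrite() wrap: busy the
 * pages, hand the buffer to the driver, then sleep until biodone() sets
 * B_DONE (assumes bp came from getblk() without B_CACHE):
 */
#if 0
	bp->b_flags |= B_READ;
	bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
	vfs_busy_pages(bp, 0);
	VOP_STRATEGY(bp);
	error = biowait(bp);
#endif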
1522
1523 /*
1524 * Finish I/O on a buffer, calling an optional function.
1525 * This is usually called from interrupt level, so process blocking
1526 * is not *a good idea*.
1527 */
1528 void
1529 biodone(register struct buf * bp)
1530 {
1531 int s;
1532
1533 s = splbio();
1534 if (!(bp->b_flags & B_BUSY))
1535 panic("biodone: buffer not busy");
1536
1537 if (bp->b_flags & B_DONE) {
1538 splx(s);
1539 printf("biodone: buffer already done\n");
1540 return;
1541 }
1542 bp->b_flags |= B_DONE;
1543
1544 if ((bp->b_flags & B_READ) == 0) {
1545 vwakeup(bp);
1546 }
1547 #ifdef BOUNCE_BUFFERS
1548 if (bp->b_flags & B_BOUNCE)
1549 vm_bounce_free(bp);
1550 #endif
1551
1552 /* call optional completion function if requested */
1553 if (bp->b_flags & B_CALL) {
1554 bp->b_flags &= ~B_CALL;
1555 (*bp->b_iodone) (bp);
1556 splx(s);
1557 return;
1558 }
1559 if (bp->b_flags & B_VMIO) {
1560 int i, resid;
1561 vm_ooffset_t foff;
1562 vm_page_t m;
1563 vm_object_t obj;
1564 int iosize;
1565 struct vnode *vp = bp->b_vp;
1566
1567 if (vp->v_type == VBLK)
1568 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1569 else
1570 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1571 obj = vp->v_object;
1572 if (!obj) {
1573 panic("biodone: no object");
1574 }
1575 #if defined(VFS_BIO_DEBUG)
1576 if (obj->paging_in_progress < bp->b_npages) {
1577 printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
1578 obj->paging_in_progress, bp->b_npages);
1579 }
1580 #endif
1581 iosize = bp->b_bufsize;
1582 for (i = 0; i < bp->b_npages; i++) {
1583 int bogusflag = 0;
1584 m = bp->b_pages[i];
1585 if (m == bogus_page) {
1586 bogusflag = 1;
1587 m = vm_page_lookup(obj, OFF_TO_IDX(foff));
1588 if (!m) {
1589 #if defined(VFS_BIO_DEBUG)
1590 printf("biodone: page disappeared\n");
1591 #endif
1592 --obj->paging_in_progress;
1593 continue;
1594 }
1595 bp->b_pages[i] = m;
1596 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1597 }
1598 #if defined(VFS_BIO_DEBUG)
1599 if (OFF_TO_IDX(foff) != m->pindex) {
1600 printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
1601 }
1602 #endif
1603 resid = IDX_TO_OFF(m->pindex + 1) - foff;
1604 if (resid > iosize)
1605 resid = iosize;
1606 /*
1607 * In the write case, the valid and clean bits are
1608 * already changed correctly, so we only need to do this
1609 * here in the read case.
1610 */
1611 if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
1612 vfs_page_set_valid(bp, foff, i, m);
1613 }
1614
1615 /*
1616 * when debugging new filesystems or buffer I/O methods, this
1617 * is the most common error that pops up. if you see this, you
1618 * have not set the page busy flag correctly!!!
1619 */
1620 if (m->busy == 0) {
1621 printf("biodone: page busy < 0, "
1622 "pindex: %d, foff: 0x(%x,%x), "
1623 "resid: %d, index: %d\n",
1624 (int) m->pindex, (int)(foff >> 32),
1625 (int) foff & 0xffffffff, resid, i);
1626 if (vp->v_type != VBLK)
1627 printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
1628 bp->b_vp->v_mount->mnt_stat.f_iosize,
1629 (int) bp->b_lblkno,
1630 bp->b_flags, bp->b_npages);
1631 else
1632 printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
1633 (int) bp->b_lblkno,
1634 bp->b_flags, bp->b_npages);
1635 printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
1636 m->valid, m->dirty, m->wire_count);
1637 panic("biodone: page busy < 0\n");
1638 }
1639 --m->busy;
1640 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1641 m->flags &= ~PG_WANTED;
1642 wakeup(m);
1643 }
1644 --obj->paging_in_progress;
1645 foff += resid;
1646 iosize -= resid;
1647 }
1648 if (obj && obj->paging_in_progress == 0 &&
1649 (obj->flags & OBJ_PIPWNT)) {
1650 obj->flags &= ~OBJ_PIPWNT;
1651 wakeup(obj);
1652 }
1653 }
1654 /*
1655 * For asynchronous completions, release the buffer now. The brelse
1656 * checks for B_WANTED and will do the wakeup there if necessary - so
1657 * no need to do a wakeup here in the async case.
1658 */
1659
1660 if (bp->b_flags & B_ASYNC) {
1661 if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
1662 brelse(bp);
1663 else
1664 bqrelse(bp);
1665 } else {
1666 bp->b_flags &= ~B_WANTED;
1667 wakeup(bp);
1668 }
1669 splx(s);
1670 }
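/*
 * Sketch of both halves of an async completion. The issuer may request
 * a B_CALL callback (exdev_iodone is hypothetical), which then owns the
 * buffer and must release it itself:
 */
#if 0
	/* issuer */
	bp->b_flags |= B_CALL | B_ASYNC;
	bp->b_iodone = exdev_iodone;
	VOP_STRATEGY(bp);

	/* driver, once the transfer finishes */
	bp->b_resid = 0;
	if (hw_error) {
		bp->b_error = EIO;
		bp->b_flags |= B_ERROR;
	}
	biodone(bp);
#endif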
1671
1672 int
1673 count_lock_queue()
1674 {
1675 int count;
1676 struct buf *bp;
1677
1678 count = 0;
1679 for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
1680 bp != NULL;
1681 bp = TAILQ_NEXT(bp, b_freelist))
1682 count++;
1683 return (count);
1684 }
1685
1686 int vfs_update_interval = 30;
1687
1688 static void
1689 vfs_update()
1690 {
1691 while (1) {
1692 tsleep(&vfs_update_wakeup, PUSER, "update",
1693 hz * vfs_update_interval);
1694 vfs_update_wakeup = 0;
1695 sync(curproc, NULL, NULL);
1696 }
1697 }
1698
1699 static int
1700 sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
1701 {
1702 int error = sysctl_handle_int(oidp,
1703 oidp->oid_arg1, oidp->oid_arg2, req);
1704 if (!error)
1705 wakeup(&vfs_update_wakeup);
1706 return error;
1707 }
1708
1709 SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
1710 &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
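/*
 * The interval is tunable at run time; assuming the knob surfaces as
 * "kern.update", e.g.:
 *
 *	sysctl -w kern.update=10
 *
 * The handler wakes the daemon immediately, so a shortened interval
 * takes effect without waiting out the old one.
 */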
1711
1712
1713 /*
1714 * This routine is called in lieu of biodone in the case of
1715 * incomplete I/O. This keeps the busy status for pages
1716 * consistent.
1717 */
1718 void
1719 vfs_unbusy_pages(struct buf * bp)
1720 {
1721 int i;
1722
1723 if (bp->b_flags & B_VMIO) {
1724 struct vnode *vp = bp->b_vp;
1725 vm_object_t obj = vp->v_object;
1726 vm_ooffset_t foff;
1727
1728 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1729
1730 for (i = 0; i < bp->b_npages; i++) {
1731 vm_page_t m = bp->b_pages[i];
1732
1733 if (m == bogus_page) {
1734 m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
1735 if (!m) {
1736 panic("vfs_unbusy_pages: page missing\n");
1737 }
1738 bp->b_pages[i] = m;
1739 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1740 }
1741 --obj->paging_in_progress;
1742 --m->busy;
1743 if ((m->busy == 0) && (m->flags & PG_WANTED)) {
1744 m->flags &= ~PG_WANTED;
1745 wakeup(m);
1746 }
1747 }
1748 if (obj->paging_in_progress == 0 &&
1749 (obj->flags & OBJ_PIPWNT)) {
1750 obj->flags &= ~OBJ_PIPWNT;
1751 wakeup(obj);
1752 }
1753 }
1754 }
1755
1756 /*
1757 * Set NFS' b_validoff and b_validend fields from the valid bits
1758 * of a page. If the consumer is not NFS, and the page is not
1759 * valid for the entire range, clear the B_CACHE flag to force
1760 * the consumer to re-read the page.
1761 */
1762 static void
1763 vfs_buf_set_valid(struct buf *bp,
1764 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
1765 vm_page_t m)
1766 {
1767 if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) {
1768 vm_offset_t svalid, evalid;
1769 int validbits = m->valid;
1770
1771 /*
1772 * This only bothers with the first valid range in the
1773 * page.
1774 */
1775 svalid = off;
1776 while (validbits && !(validbits & 1)) {
1777 svalid += DEV_BSIZE;
1778 validbits >>= 1;
1779 }
1780 evalid = svalid;
1781 while (validbits & 1) {
1782 evalid += DEV_BSIZE;
1783 validbits >>= 1;
1784 }
1785 /*
1786 * Make sure this range is contiguous with the range
1787 * built up from previous pages. If not, then we will
1788 * just use the range from the previous pages.
1789 */
1790 if (svalid == bp->b_validend) {
1791 bp->b_validoff = min(bp->b_validoff, svalid);
1792 bp->b_validend = max(bp->b_validend, evalid);
1793 }
1794 } else if (!vm_page_is_valid(m,
1795 (vm_offset_t) ((foff + off) & PAGE_MASK),
1796 size)) {
1797 bp->b_flags &= ~B_CACHE;
1798 }
1799 }
1800
1801 /*
1802 * Set the valid bits in a page, taking care of the b_validoff,
1803 * b_validend fields which NFS uses to optimise small reads. Off is
1804 * the offset within the file and pageno is the page index within the buf.
1805 */
1806 static void
1807 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m)
1808 {
1809 struct vnode *vp = bp->b_vp;
1810 vm_ooffset_t soff, eoff;
1811
1812 soff = off;
1813 eoff = off + min(PAGE_SIZE, bp->b_bufsize);
1814 vm_page_set_invalid(m,
1815 (vm_offset_t) (soff & PAGE_MASK),
1816 (vm_offset_t) (eoff - soff));
1817 if (vp->v_tag == VT_NFS && vp->v_type != VBLK) {
1818 vm_ooffset_t sv, ev;
1819 off = off - pageno * PAGE_SIZE;
1820 sv = off + ((bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
1821 ev = off + ((bp->b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1));
1822 soff = max(sv, soff);
1823 eoff = min(ev, eoff);
1824 }
1825 if (eoff > soff)
1826 vm_page_set_validclean(m,
1827 (vm_offset_t) (soff & PAGE_MASK),
1828 (vm_offset_t) (eoff - soff));
1829 }
1830
1831 /*
1832 * This routine is called before a device strategy routine.
1833 * It is used to tell the VM system that paging I/O is in
1834 * progress, and treat the pages associated with the buffer
1835 * almost as being PG_BUSY. Also the object paging_in_progress
1836 * count is maintained to make sure that the object doesn't become
1837 * inconsistent.
1838 */
1839 void
1840 vfs_busy_pages(struct buf * bp, int clear_modify)
1841 {
1842 int i;
1843
1844 if (bp->b_flags & B_VMIO) {
1845 struct vnode *vp = bp->b_vp;
1846 vm_object_t obj = vp->v_object;
1847 vm_ooffset_t foff;
1848
1849 if (vp->v_type == VBLK)
1850 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1851 else
1852 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1853 vfs_setdirty(bp);
1854 for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
1855 vm_page_t m = bp->b_pages[i];
1856
1857 if ((bp->b_flags & B_CLUSTER) == 0) {
1858 obj->paging_in_progress++;
1859 m->busy++;
1860 }
1861 vm_page_protect(m, VM_PROT_NONE);
1862 if (clear_modify)
1863 vfs_page_set_valid(bp, foff, i, m);
1864 else if (bp->b_bcount >= PAGE_SIZE) {
1865 if (m->valid && (bp->b_flags & B_CACHE) == 0) {
1866 bp->b_pages[i] = bogus_page;
1867 pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
1868 }
1869 }
1870 }
1871 }
1872 }
1873
1874 /*
1875 * Tell the VM system that the pages associated with this buffer
1876 * are clean. This is used for delayed writes where the data is
1877 * going to go to disk eventually without additional VM intervention.
1878 */
1879 void
1880 vfs_clean_pages(struct buf * bp)
1881 {
1882 int i;
1883
1884 if (bp->b_flags & B_VMIO) {
1885 struct vnode *vp = bp->b_vp;
1886 vm_object_t obj = vp->v_object;
1887 vm_ooffset_t foff;
1888
1889 if (vp->v_type == VBLK)
1890 foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
1891 else
1892 foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
1893 for (i = 0; i < bp->b_npages; i++, foff += PAGE_SIZE) {
1894 vm_page_t m = bp->b_pages[i];
1895
1896 vfs_page_set_valid(bp, foff, i, m);
1897 }
1898 }
1899 }
1900
1901 void
1902 vfs_bio_clrbuf(struct buf *bp) {
1903 int i;
1904 if( bp->b_flags & B_VMIO) {
1905 if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
1906 int mask;
1907 mask = 0;
1908 for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE)
1909 mask |= (1 << (i/DEV_BSIZE));
1910 if( bp->b_pages[0]->valid != mask) {
1911 bzero(bp->b_data, bp->b_bufsize);
1912 }
1913 bp->b_pages[0]->valid = mask;
1914 bp->b_resid = 0;
1915 return;
1916 }
1917 for(i=0;i<bp->b_npages;i++) {
1918 if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
1919 continue;
1920 if( bp->b_pages[i]->valid == 0) {
1921 if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
1922 bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
1923 }
1924 } else {
1925 int j;
1926 for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) {
1927 if( (bp->b_pages[i]->valid & (1<<j)) == 0)
1928 bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
1929 }
1930 }
1931 /* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
1932 }
1933 bp->b_resid = 0;
1934 } else {
1935 clrbuf(bp);
1936 }
1937 }
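/*
 * Example of the partial-page valid mask above: with b_bufsize = 2048
 * and DEV_BSIZE = 512 the loop sets bits 0-3, giving mask = 0x0f, so
 * only the first 2K of the page is marked valid.
 */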
1938
1939 /*
1940 * vm_hold_load_pages and vm_hold_free_pages move pages into and out
1941 * of a buffer's address space. The pages are anonymous and are
1942 * not associated with a file object.
1943 */
1944 void
1945 vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1946 {
1947 vm_offset_t pg;
1948 vm_page_t p;
1949 int index;
1950
1951 to = round_page(to);
1952 from = round_page(from);
1953 index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1954
1955 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1956
1957 tryagain:
1958
1959 p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
1960 VM_ALLOC_NORMAL);
1961 if (!p) {
1962 VM_WAIT;
1963 goto tryagain;
1964 }
1965 vm_page_wire(p);
1966 pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
1967 bp->b_pages[index] = p;
1968 PAGE_WAKEUP(p);
1969 }
1970 bp->b_npages = to >> PAGE_SHIFT;
1971 }
1972
1973 void
1974 vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
1975 {
1976 vm_offset_t pg;
1977 vm_page_t p;
1978 int index;
1979
1980 from = round_page(from);
1981 to = round_page(to);
1982 index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
1983
1984 for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
1985 p = bp->b_pages[index];
1986 if (p && (index < bp->b_npages)) {
1987 if (p->busy) {
1988 printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
1989 bp->b_blkno, bp->b_lblkno);
1990 }
1991 bp->b_pages[index] = NULL;
1992 pmap_kremove(pg);
1993 vm_page_unwire(p);
1994 vm_page_free(p);
1995 }
1996 }
1997 bp->b_npages = from >> PAGE_SHIFT;
1998 }