FreeBSD/Linux Kernel Cross Reference
sys/vm/vnode_pager.c
1 /*
2 * Copyright (c) 1990 University of Utah.
3 * Copyright (c) 1991 The Regents of the University of California.
4 * All rights reserved.
5 * Copyright (c) 1993, 1994 John S. Dyson
6 * Copyright (c) 1995, David Greenman
7 *
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
41 * $FreeBSD: releng/5.0/sys/vm/vnode_pager.c 107347 2002-11-27 19:51:48Z alc $
42 */
43
44 /*
45 * Page to/from files (vnodes).
46 */
47
48 /*
49 * TODO:
50 * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
51 * greatly simplify the vnode_pager.
52 */
53
54 #include <sys/param.h>
55 #include <sys/systm.h>
56 #include <sys/proc.h>
57 #include <sys/vnode.h>
58 #include <sys/mount.h>
59 #include <sys/bio.h>
60 #include <sys/buf.h>
61 #include <sys/vmmeter.h>
62 #include <sys/conf.h>
63 #include <sys/stdint.h>
64
65 #include <vm/vm.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vm_map.h>
70 #include <vm/vnode_pager.h>
71 #include <vm/vm_extern.h>
72
73 static void vnode_pager_init(void);
74 static vm_offset_t vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
75 int *run);
76 static void vnode_pager_iodone(struct buf *bp);
77 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
78 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
79 static void vnode_pager_dealloc(vm_object_t);
80 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
81 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
82 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
83
84 struct pagerops vnodepagerops = {
85 vnode_pager_init,
86 vnode_pager_alloc,
87 vnode_pager_dealloc,
88 vnode_pager_getpages,
89 vnode_pager_putpages,
90 vnode_pager_haspage,
91 NULL
92 };
93
94 int vnode_pbuf_freecnt;
95
96 static void
97 vnode_pager_init(void)
98 {
99
100 vnode_pbuf_freecnt = nswbuf / 2 + 1;
101 }
102
103 /*
104 * Allocate (or lookup) pager for a vnode.
105 * Handle is a vnode pointer.
106 *
107 * MPSAFE
108 */
109 vm_object_t
110 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
111 vm_ooffset_t offset)
112 {
113 vm_object_t object;
114 struct vnode *vp;
115
116 /*
117 * Pageout to vnode, no can do yet.
118 */
119 if (handle == NULL)
120 return (NULL);
121
122 vp = (struct vnode *) handle;
123
124 ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc");
125
126 mtx_lock(&Giant);
127 /*
128 * Prevent race condition when allocating the object. This
129 * can happen with NFS vnodes since the nfsnode isn't locked.
130 */
131 VI_LOCK(vp);
132 while (vp->v_iflag & VI_OLOCK) {
133 vp->v_iflag |= VI_OWANT;
134 msleep(vp, VI_MTX(vp), PVM, "vnpobj", 0);
135 }
136 vp->v_iflag |= VI_OLOCK;
137 VI_UNLOCK(vp);
138
139 /*
140 * If the object is being terminated, wait for it to
141 * go away.
142 */
143 while (((object = vp->v_object) != NULL) &&
144 (object->flags & OBJ_DEAD)) {
145 tsleep(object, PVM, "vadead", 0);
146 }
147
148 if (vp->v_usecount == 0)
149 panic("vnode_pager_alloc: no vnode reference");
150
151 if (object == NULL) {
152 /*
153 * Add an object of the appropriate size
154 */
155 object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
156
157 object->un_pager.vnp.vnp_size = size;
158
159 object->handle = handle;
160 vp->v_object = object;
161 } else {
162 object->ref_count++;
163 }
164 VI_LOCK(vp);
165 vp->v_usecount++;
166 vp->v_iflag &= ~VI_OLOCK;
167 if (vp->v_iflag & VI_OWANT) {
168 vp->v_iflag &= ~VI_OWANT;
169 wakeup(vp);
170 }
171 VI_UNLOCK(vp);
172 mtx_unlock(&Giant);
173 return (object);
174 }
175
176 static void
177 vnode_pager_dealloc(object)
178 vm_object_t object;
179 {
180 struct vnode *vp = object->handle;
181
182 GIANT_REQUIRED;
183 if (vp == NULL)
184 panic("vnode_pager_dealloc: pager already dealloced");
185
186 vm_object_pip_wait(object, "vnpdea");
187
188 object->handle = NULL;
189 object->type = OBJT_DEAD;
190 ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc");
191 vp->v_object = NULL;
192 vp->v_vflag &= ~(VV_TEXT | VV_OBJBUF);
193 }
194
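/*
 * Illustration of how vnode_pager_haspage() below rescales VOP_BMAP()
 * results (the sizes here are assumptions for the example, not
 * requirements): with 4K pages and a 16K filesystem block,
 * pagesperblock = 4, so pindex 10 maps to reqblock 2 with poff 2.  If
 * VOP_BMAP() reports one contiguous block before and one after the
 * request, the rescaled answers are *before = 1*4 + 2 = 6 and
 * *after = 1*4 + (4 - (2+1)) = 5, i.e. pages 4..15 are contiguous on
 * disk.
 */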
195 static boolean_t
196 vnode_pager_haspage(object, pindex, before, after)
197 vm_object_t object;
198 vm_pindex_t pindex;
199 int *before;
200 int *after;
201 {
202 struct vnode *vp = object->handle;
203 daddr_t bn;
204 int err;
205 daddr_t reqblock;
206 int poff;
207 int bsize;
208 int pagesperblock, blocksperpage;
209
210 GIANT_REQUIRED;
211 /*
212 * If no vp or vp is doomed or marked transparent to VM, we do not
213 * have the page.
214 */
215 if (vp == NULL)
216 return FALSE;
217
218 VI_LOCK(vp);
219 if (vp->v_iflag & VI_DOOMED) {
220 VI_UNLOCK(vp);
221 return FALSE;
222 }
223 VI_UNLOCK(vp);
224 /*
225 * If filesystem no longer mounted or offset beyond end of file we do
226 * not have the page.
227 */
228 if ((vp->v_mount == NULL) ||
229 (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size))
230 return FALSE;
231
232 bsize = vp->v_mount->mnt_stat.f_iosize;
233 pagesperblock = bsize / PAGE_SIZE;
234 blocksperpage = 0;
235 if (pagesperblock > 0) {
236 reqblock = pindex / pagesperblock;
237 } else {
238 blocksperpage = (PAGE_SIZE / bsize);
239 reqblock = pindex * blocksperpage;
240 }
241 err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn,
242 after, before);
243 if (err)
244 return TRUE;
245 if (bn == -1)
246 return FALSE;
247 if (pagesperblock > 0) {
248 poff = pindex - (reqblock * pagesperblock);
249 if (before) {
250 *before *= pagesperblock;
251 *before += poff;
252 }
253 if (after) {
254 int numafter;
255 *after *= pagesperblock;
256 numafter = pagesperblock - (poff + 1);
257 if (IDX_TO_OFF(pindex + numafter) >
258 object->un_pager.vnp.vnp_size) {
259 numafter =
260 OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
261 pindex;
262 }
263 *after += numafter;
264 }
265 } else {
266 if (before) {
267 *before /= blocksperpage;
268 }
269
270 if (after) {
271 *after /= blocksperpage;
272 }
273 }
274 return TRUE;
275 }
276
277 /*
278 * Lets the VM system know about a change in size for a file.
279 * We adjust our own internal size and flush any cached pages in
280 * the associated object that are affected by the size change.
281 *
282 * Note: this routine may be invoked as a result of a pager put
283 * operation (possibly at object termination time), so we must be careful.
284 */
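/*
 * For instance (hypothetical sizes): truncating a 20000 byte file to
 * 10000 bytes with 4K pages shrinks the object from 5 pages to
 * nobjsize = OFF_TO_IDX(10000 + PAGE_MASK) = 3 pages; pages 3 and 4
 * are removed, and in page 2 the tail from byte offset 1808
 * (10000 & PAGE_MASK) through 4095 is zeroed and marked clean.
 */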
285 void
286 vnode_pager_setsize(vp, nsize)
287 struct vnode *vp;
288 vm_ooffset_t nsize;
289 {
290 vm_pindex_t nobjsize;
291 vm_object_t object = vp->v_object;
292
293 GIANT_REQUIRED;
294
295 if (object == NULL)
296 return;
297
298 /*
299 * Hasn't changed size
300 */
301 if (nsize == object->un_pager.vnp.vnp_size)
302 return;
303
304 nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
305
306 /*
307 * File has shrunk. Toss any cached pages beyond the new EOF.
308 */
309 if (nsize < object->un_pager.vnp.vnp_size) {
310 #ifdef ENABLE_VFS_IOOPT
311 vm_freeze_copyopts(object, OFF_TO_IDX(nsize), object->size);
312 #endif
313 if (nobjsize < object->size) {
314 vm_object_page_remove(object, nobjsize, object->size,
315 FALSE);
316 }
317 /*
318 * this gets rid of garbage at the end of a page that is now
319 * only partially backed by the vnode.
320 *
321 * XXX for some reason (I don't know yet), if we take a
322 * completely invalid page and mark it partially valid
323 * it can screw up NFS reads, so we don't allow the case.
324 */
325 if (nsize & PAGE_MASK) {
326 vm_page_t m;
327
328 m = vm_page_lookup(object, OFF_TO_IDX(nsize));
329 if (m && m->valid) {
330 int base = (int)nsize & PAGE_MASK;
331 int size = PAGE_SIZE - base;
332
333 /*
334 * Clear out partial-page garbage in case
335 * the page has been mapped.
336 */
337 pmap_zero_page_area(m, base, size);
338
339 vm_page_lock_queues();
340 /*
341 * XXX work around SMP data integrity race
342 * by unmapping the page from user processes.
343 * The garbage we just cleared may be mapped
344 * to a user process running on another cpu
345 * and this code is not running through normal
346 * I/O channels which handle SMP issues for
347 * us, so unmap page to synchronize all cpus.
348 *
349 * XXX should vm_pager_unmap_page() have
350 * dealt with this?
351 */
352 pmap_remove_all(m);
353
354 /*
355 * Clear out partial-page dirty bits. This
356 * has the side effect of setting the valid
357 * bits, but that is ok. There are a bunch
358 * of places in the VM system where we expected
359 * m->dirty == VM_PAGE_BITS_ALL. The file EOF
360 * case is one of them. If the page is still
361 * partially dirty, make it fully dirty.
362 *
363 * note that we do not clear out the valid
364 * bits. This would prevent bogus_page
365 * replacement from working properly.
366 */
367 vm_page_set_validclean(m, base, size);
368 if (m->dirty != 0)
369 m->dirty = VM_PAGE_BITS_ALL;
370 vm_page_unlock_queues();
371 }
372 }
373 }
374 object->un_pager.vnp.vnp_size = nsize;
375 object->size = nobjsize;
376 }
377
378 /*
379 * calculate the disk block address (in DEV_BSIZE units) that backs
380 * the specified byte offset in the file
381 */
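/*
 * For example (assumed sizes): with an 8K filesystem block and a 512
 * byte DEV_BSIZE, a file offset of 20480 gives vblock 2 and voffset
 * 4096, so the result is the block returned by VOP_BMAP() plus
 * 4096/512 = 8 sectors.  A VOP_BMAP() contiguity hint of 3 extra
 * blocks becomes (3+1) * (8192/4096) - 4096/4096 = 7 pages of run.
 */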
382 static vm_offset_t
383 vnode_pager_addr(vp, address, run)
384 struct vnode *vp;
385 vm_ooffset_t address;
386 int *run;
387 {
388 int rtaddress;
389 int bsize;
390 daddr_t block;
391 struct vnode *rtvp;
392 int err;
393 daddr_t vblock;
394 int voffset;
395
396 GIANT_REQUIRED;
397 if ((int) address < 0)
398 return -1;
399
400 if (vp->v_mount == NULL)
401 return -1;
402
403 bsize = vp->v_mount->mnt_stat.f_iosize;
404 vblock = address / bsize;
405 voffset = address % bsize;
406
407 err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL);
408
409 if (err || (block == -1))
410 rtaddress = -1;
411 else {
412 rtaddress = block + voffset / DEV_BSIZE;
413 if (run) {
414 *run += 1;
415 *run *= bsize/PAGE_SIZE;
416 *run -= voffset/PAGE_SIZE;
417 }
418 }
419
420 return rtaddress;
421 }
422
423 /*
424 * interrupt routine for I/O completion
425 */
426 static void
427 vnode_pager_iodone(bp)
428 struct buf *bp;
429 {
430 bp->b_flags |= B_DONE;
431 wakeup(bp);
432 }
433
434 /*
435 * small block filesystem vnode pager input
436 */
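/*
 * Sketch of the loop below (assuming 4K pages on a 1K block
 * filesystem): up to four 1K reads are issued at kva + 0, 1024, 2048
 * and 3072; sub-blocks already marked valid in m->valid are skipped,
 * and sub-blocks beyond vnp_size are zeroed instead of read.
 */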
437 static int
438 vnode_pager_input_smlfs(object, m)
439 vm_object_t object;
440 vm_page_t m;
441 {
442 int i;
443 int s;
444 struct vnode *dp, *vp;
445 struct buf *bp;
446 vm_offset_t kva;
447 int fileaddr;
448 vm_offset_t bsize;
449 int error = 0;
450
451 GIANT_REQUIRED;
452
453 vp = object->handle;
454 if (vp->v_mount == NULL)
455 return VM_PAGER_BAD;
456
457 bsize = vp->v_mount->mnt_stat.f_iosize;
458
459 VOP_BMAP(vp, 0, &dp, 0, NULL, NULL);
460
461 kva = vm_pager_map_page(m);
462
463 for (i = 0; i < PAGE_SIZE / bsize; i++) {
464 vm_ooffset_t address;
465
466 if (vm_page_bits(i * bsize, bsize) & m->valid)
467 continue;
468
469 address = IDX_TO_OFF(m->pindex) + i * bsize;
470 if (address >= object->un_pager.vnp.vnp_size) {
471 fileaddr = -1;
472 } else {
473 fileaddr = vnode_pager_addr(vp, address, NULL);
474 }
475 if (fileaddr != -1) {
476 bp = getpbuf(&vnode_pbuf_freecnt);
477
478 /* build a minimal buffer header */
479 bp->b_iocmd = BIO_READ;
480 bp->b_iodone = vnode_pager_iodone;
481 KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
482 KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
483 bp->b_rcred = crhold(curthread->td_ucred);
484 bp->b_wcred = crhold(curthread->td_ucred);
485 bp->b_data = (caddr_t) kva + i * bsize;
486 bp->b_blkno = fileaddr;
487 pbgetvp(dp, bp);
488 bp->b_bcount = bsize;
489 bp->b_bufsize = bsize;
490 bp->b_runningbufspace = bp->b_bufsize;
491 runningbufspace += bp->b_runningbufspace;
492
493 /* do the input */
494 BUF_STRATEGY(bp);
495
496 /* we definitely need to be at splvm here */
497
498 s = splvm();
499 while ((bp->b_flags & B_DONE) == 0) {
500 tsleep(bp, PVM, "vnsrd", 0);
501 }
502 splx(s);
503 if ((bp->b_ioflags & BIO_ERROR) != 0)
504 error = EIO;
505
506 /*
507 * free the buffer header back to the swap buffer pool
508 */
509 relpbuf(bp, &vnode_pbuf_freecnt);
510 if (error)
511 break;
512
513 vm_page_lock_queues();
514 vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
515 vm_page_unlock_queues();
516 } else {
517 vm_page_lock_queues();
518 vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
519 vm_page_unlock_queues();
520 bzero((caddr_t) kva + i * bsize, bsize);
521 }
522 }
523 vm_pager_unmap_page(kva);
524 vm_page_lock_queues();
525 pmap_clear_modify(m);
526 vm_page_flag_clear(m, PG_ZERO);
527 vm_page_unlock_queues();
528 if (error) {
529 return VM_PAGER_ERROR;
530 }
531 return VM_PAGER_OK;
532
533 }
534
535
536 /*
537 * old style vnode pager input routine
538 */
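/*
 * For example (assumed sizes): reading the last page of a 9000 byte
 * file with 4K pages asks VOP_READ() for size = 9000 - 8192 = 808
 * bytes; if they all arrive, the remaining 4096 - 808 = 3288 bytes of
 * the page are bzero()'d before the page is marked fully valid.
 */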
539 static int
540 vnode_pager_input_old(object, m)
541 vm_object_t object;
542 vm_page_t m;
543 {
544 struct uio auio;
545 struct iovec aiov;
546 int error;
547 int size;
548 vm_offset_t kva;
549 struct vnode *vp;
550
551 GIANT_REQUIRED;
552 error = 0;
553
554 /*
555 * Return failure if beyond current EOF
556 */
557 if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
558 return VM_PAGER_BAD;
559 } else {
560 size = PAGE_SIZE;
561 if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
562 size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
563
564 /*
565 * Allocate a kernel virtual address and initialize so that
566 * we can use VOP_READ/WRITE routines.
567 */
568 kva = vm_pager_map_page(m);
569
570 vp = object->handle;
571 aiov.iov_base = (caddr_t) kva;
572 aiov.iov_len = size;
573 auio.uio_iov = &aiov;
574 auio.uio_iovcnt = 1;
575 auio.uio_offset = IDX_TO_OFF(m->pindex);
576 auio.uio_segflg = UIO_SYSSPACE;
577 auio.uio_rw = UIO_READ;
578 auio.uio_resid = size;
579 auio.uio_td = curthread;
580
581 error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
582 if (!error) {
583 int count = size - auio.uio_resid;
584
585 if (count == 0)
586 error = EINVAL;
587 else if (count != PAGE_SIZE)
588 bzero((caddr_t) kva + count, PAGE_SIZE - count);
589 }
590 vm_pager_unmap_page(kva);
591 }
592 vm_page_lock_queues();
593 pmap_clear_modify(m);
594 vm_page_undirty(m);
595 vm_page_flag_clear(m, PG_ZERO);
596 if (!error)
597 m->valid = VM_PAGE_BITS_ALL;
598 vm_page_unlock_queues();
599 return error ? VM_PAGER_ERROR : VM_PAGER_OK;
600 }
601
602 /*
603 * generic vnode pager input routine
604 */
605
606 /*
607 * Local media VFS's that do not implement their own VOP_GETPAGES
608 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
609 * to implement the previous behaviour.
610 *
611 * All other FS's should use the bypass to get to the local media
612 * backing vp's VOP_GETPAGES.
613 */
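/*
 * A local media filesystem without its own implementation typically
 * routes its VOP_GETPAGES here with a thin wrapper along these lines
 * (a sketch only; the function name is illustrative):
 *
 *	static int
 *	example_getpages(struct vop_getpages_args *ap)
 *	{
 *
 *		return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 *		    ap->a_count, ap->a_reqpage));
 *	}
 */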
614 static int
615 vnode_pager_getpages(object, m, count, reqpage)
616 vm_object_t object;
617 vm_page_t *m;
618 int count;
619 int reqpage;
620 {
621 int rtval;
622 struct vnode *vp;
623 int bytes = count * PAGE_SIZE;
624
625 GIANT_REQUIRED;
626 vp = object->handle;
627 rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
628 KASSERT(rtval != EOPNOTSUPP,
629 ("vnode_pager: FS getpages not implemented\n"));
630 return rtval;
631 }
632
633 /*
634 * This is now called from local media FS's to operate against their
635 * own vnodes if they fail to implement VOP_GETPAGES.
636 */
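/*
 * Sketch of the run calculation below (hypothetical layout): for a
 * count = 8, reqpage = 3 request where pages 0 and 1 have no backing
 * disk block and page 2 starts a 5 page contiguous run, pages 0, 1
 * and 7 are freed, the array is shifted so old page 2 becomes m[0]
 * (reqpage becomes 1), and a single 5 page read is issued.
 */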
637 int
638 vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
639 struct vnode *vp;
640 vm_page_t *m;
641 int bytecount;
642 int reqpage;
643 {
644 vm_object_t object;
645 vm_offset_t kva;
646 off_t foff, tfoff, nextoff;
647 int i, j, size, bsize, first, firstaddr;
648 struct vnode *dp;
649 int runpg;
650 int runend;
651 struct buf *bp;
652 int s;
653 int count;
654 int error = 0;
655
656 GIANT_REQUIRED;
657 object = vp->v_object;
658 count = bytecount / PAGE_SIZE;
659
660 if (vp->v_mount == NULL)
661 return VM_PAGER_BAD;
662
663 bsize = vp->v_mount->mnt_stat.f_iosize;
664
665 /* get the UNDERLYING device for the file with VOP_BMAP() */
666
667 /*
668 * originally, we did not check for an error return value -- assuming
669 * an fs always has a bmap entry point -- that assumption is wrong!!!
670 */
671 foff = IDX_TO_OFF(m[reqpage]->pindex);
672
673 /*
674 * if we can't bmap, use old VOP code
675 */
676 if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) {
677 vm_page_lock_queues();
678 for (i = 0; i < count; i++)
679 if (i != reqpage)
680 vm_page_free(m[i]);
681 vm_page_unlock_queues();
682 cnt.v_vnodein++;
683 cnt.v_vnodepgsin++;
684 return vnode_pager_input_old(object, m[reqpage]);
685
686 /*
687 * if the blocksize is smaller than a page size, then use
688 * special small filesystem code. NFS sometimes has a small
689 * blocksize, but it can handle large reads itself.
690 */
691 } else if ((PAGE_SIZE / bsize) > 1 &&
692 (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
693 vm_page_lock_queues();
694 for (i = 0; i < count; i++)
695 if (i != reqpage)
696 vm_page_free(m[i]);
697 vm_page_unlock_queues();
698 cnt.v_vnodein++;
699 cnt.v_vnodepgsin++;
700 return vnode_pager_input_smlfs(object, m[reqpage]);
701 }
702
703 /*
704 * If we have a completely valid page available to us, we can
705 * clean up and return. Otherwise we have to re-read the
706 * media.
707 */
708 if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
709 vm_page_lock_queues();
710 for (i = 0; i < count; i++)
711 if (i != reqpage)
712 vm_page_free(m[i]);
713 vm_page_unlock_queues();
714 return VM_PAGER_OK;
715 }
716 m[reqpage]->valid = 0;
717
718 /*
719 * here on direct device I/O
720 */
721 firstaddr = -1;
722
723 /*
724 * calculate the run that includes the required page
725 */
726 for (first = 0, i = 0; i < count; i = runend) {
727 firstaddr = vnode_pager_addr(vp,
728 IDX_TO_OFF(m[i]->pindex), &runpg);
729 if (firstaddr == -1) {
730 if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
731 panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
732 firstaddr, (uintmax_t)(foff >> 32),
733 (uintmax_t)foff,
734 (uintmax_t)
735 (object->un_pager.vnp.vnp_size >> 32),
736 (uintmax_t)object->un_pager.vnp.vnp_size);
737 }
738 vm_page_lock_queues();
739 vm_page_free(m[i]);
740 vm_page_unlock_queues();
741 runend = i + 1;
742 first = runend;
743 continue;
744 }
745 runend = i + runpg;
746 if (runend <= reqpage) {
747 vm_page_lock_queues();
748 for (j = i; j < runend; j++)
749 vm_page_free(m[j]);
750 vm_page_unlock_queues();
751 } else {
752 if (runpg < (count - first)) {
753 vm_page_lock_queues();
754 for (i = first + runpg; i < count; i++)
755 vm_page_free(m[i]);
756 vm_page_unlock_queues();
757 count = first + runpg;
758 }
759 break;
760 }
761 first = runend;
762 }
763
764 /*
765 * the first and last page have been calculated now, move input pages
766 * to be zero based...
767 */
768 if (first != 0) {
769 for (i = first; i < count; i++) {
770 m[i - first] = m[i];
771 }
772 count -= first;
773 reqpage -= first;
774 }
775
776 /*
777 * calculate the file virtual address for the transfer
778 */
779 foff = IDX_TO_OFF(m[0]->pindex);
780
781 /*
782 * calculate the size of the transfer
783 */
784 size = count * PAGE_SIZE;
785 if ((foff + size) > object->un_pager.vnp.vnp_size)
786 size = object->un_pager.vnp.vnp_size - foff;
787
788 /*
789 * round up physical size for real devices.
790 */
791 if (dp->v_type == VBLK || dp->v_type == VCHR) {
792 int secmask = dp->v_rdev->si_bsize_phys - 1;
793 KASSERT(secmask < PAGE_SIZE, ("vnode_pager_generic_getpages: sector size %d too large\n", secmask + 1));
794 size = (size + secmask) & ~secmask;
795 }
796
797 bp = getpbuf(&vnode_pbuf_freecnt);
798 kva = (vm_offset_t) bp->b_data;
799
800 /*
801 * and map the pages to be read into the kva
802 */
803 pmap_qenter(kva, m, count);
804
805 /* build a minimal buffer header */
806 bp->b_iocmd = BIO_READ;
807 bp->b_iodone = vnode_pager_iodone;
808 /* B_PHYS is not set, but it is nice to fill this in */
809 KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
810 KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
811 bp->b_rcred = crhold(curthread->td_ucred);
812 bp->b_wcred = crhold(curthread->td_ucred);
813 bp->b_blkno = firstaddr;
814 pbgetvp(dp, bp);
815 bp->b_bcount = size;
816 bp->b_bufsize = size;
817 bp->b_runningbufspace = bp->b_bufsize;
818 runningbufspace += bp->b_runningbufspace;
819
820 cnt.v_vnodein++;
821 cnt.v_vnodepgsin += count;
822
823 /* do the input */
824 BUF_STRATEGY(bp);
825
826 s = splvm();
827 /* we definitely need to be at splvm here */
828
829 while ((bp->b_flags & B_DONE) == 0) {
830 tsleep(bp, PVM, "vnread", 0);
831 }
832 splx(s);
833 if ((bp->b_ioflags & BIO_ERROR) != 0)
834 error = EIO;
835
836 if (!error) {
837 if (size != count * PAGE_SIZE)
838 bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
839 }
840 pmap_qremove(kva, count);
841
842 /*
843 * free the buffer header back to the swap buffer pool
844 */
845 relpbuf(bp, &vnode_pbuf_freecnt);
846
847 vm_page_lock_queues();
848 for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
849 vm_page_t mt;
850
851 nextoff = tfoff + PAGE_SIZE;
852 mt = m[i];
853
854 if (nextoff <= object->un_pager.vnp.vnp_size) {
855 /*
856 * Read filled up entire page.
857 */
858 mt->valid = VM_PAGE_BITS_ALL;
859 vm_page_undirty(mt); /* should be an assert? XXX */
860 pmap_clear_modify(mt);
861 } else {
862 /*
863 * Read did not fill up entire page. Since this
864 * is getpages, the page may be mapped, so we have
865 * to zero the invalid portions of the page even
866 * though we aren't setting them valid.
867 *
868 * Currently we do not set the entire page valid,
869 * we just try to clear the piece that we couldn't
870 * read.
871 */
872 vm_page_set_validclean(mt, 0,
873 object->un_pager.vnp.vnp_size - tfoff);
874 /* handled by vm_fault now */
875 /* vm_page_zero_invalid(mt, FALSE); */
876 }
877
878 vm_page_flag_clear(mt, PG_ZERO);
879 if (i != reqpage) {
880
881 /*
882 * whether or not to leave the page activated is up in
883 * the air, but we should put the page on a page queue
884 * somewhere. (it already is in the object). Result:
885 * It appears that empirical results show that
886 * deactivating pages is best.
887 */
888
889 /*
890 * just in case someone was asking for this page we
891 * now tell them that it is ok to use
892 */
893 if (!error) {
894 if (mt->flags & PG_WANTED)
895 vm_page_activate(mt);
896 else
897 vm_page_deactivate(mt);
898 vm_page_wakeup(mt);
899 } else {
900 vm_page_free(mt);
901 }
902 }
903 }
904 vm_page_unlock_queues();
905 if (error) {
906 printf("vnode_pager_getpages: I/O read error\n");
907 }
908 return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
909 }
910
911 /*
912 * EOPNOTSUPP is no longer legal. For local media VFS's that do not
913 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
914 * vnode_pager_generic_putpages() to implement the previous behaviour.
915 *
916 * All other FS's should use the bypass to get to the local media
917 * backing vp's VOP_PUTPAGES.
918 */
919 static void
920 vnode_pager_putpages(object, m, count, sync, rtvals)
921 vm_object_t object;
922 vm_page_t *m;
923 int count;
924 boolean_t sync;
925 int *rtvals;
926 {
927 int rtval;
928 struct vnode *vp;
929 struct mount *mp;
930 int bytes = count * PAGE_SIZE;
931
932 GIANT_REQUIRED;
933 /*
934 * Force synchronous operation if we are extremely low on memory
935 * to prevent a low-memory deadlock. VOP operations often need to
936 * allocate more memory to initiate the I/O ( i.e. do a BMAP
937 * operation ). The swapper handles the case by limiting the amount
938 * of asynchronous I/O, but that sort of solution doesn't scale well
939 * for the vnode pager without a lot of work.
940 *
941 * Also, the backing vnode's iodone routine may not wake the pageout
942 * daemon up. This should probably be addressed. XXX
943 */
944
945 if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
946 sync |= OBJPC_SYNC;
947
948 /*
949 * Call device-specific putpages function
950 */
951 vp = object->handle;
952 if (vp->v_type != VREG)
953 mp = NULL;
954 (void)vn_start_write(vp, &mp, V_WAIT);
955 rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
956 KASSERT(rtval != EOPNOTSUPP,
957 ("vnode_pager: stale FS putpages\n"));
958 vn_finished_write(mp);
959 }
960
961
962 /*
963 * This is now called from local media FS's to operate against their
964 * own vnodes if they fail to implement VOP_PUTPAGES.
965 *
966 * This is typically called indirectly via the pageout daemon and
967 * clustering has typically already occurred, so in general we ask the
968 * underlying filesystem to write the data out asynchronously rather
969 * than delayed.
970 */
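/*
 * For example (assumed sizes): writing two pages starting at pindex 4
 * of a 20000 byte file gives poffset = 16384 and an initial maxsize of
 * 8192; the EOF clamp below reduces that to maxsize = 3616 and
 * ncount = 1, clears the dirty bits for the 480 bytes past EOF in the
 * last valid page, marks the second page VM_PAGER_BAD, and issues a
 * 3616 byte UIO_NOCOPY write through VOP_WRITE().
 */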
971 int
972 vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
973 struct vnode *vp;
974 vm_page_t *m;
975 int bytecount;
976 int flags;
977 int *rtvals;
978 {
979 int i;
980 vm_object_t object;
981 int count;
982
983 int maxsize, ncount;
984 vm_ooffset_t poffset;
985 struct uio auio;
986 struct iovec aiov;
987 int error;
988 int ioflags;
989
990 GIANT_REQUIRED;
991 object = vp->v_object;
992 count = bytecount / PAGE_SIZE;
993
994 for (i = 0; i < count; i++)
995 rtvals[i] = VM_PAGER_AGAIN;
996
997 if ((int) m[0]->pindex < 0) {
998 printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%x)\n",
999 (long)m[0]->pindex, m[0]->dirty);
1000 rtvals[0] = VM_PAGER_BAD;
1001 return VM_PAGER_BAD;
1002 }
1003
1004 maxsize = count * PAGE_SIZE;
1005 ncount = count;
1006
1007 poffset = IDX_TO_OFF(m[0]->pindex);
1008
1009 /*
1010 * If the page-aligned write is larger than the actual file we
1011 * have to invalidate pages occurring beyond the file EOF. However,
1012 * there is an edge case where a file may not be page-aligned where
1013 * the last page is partially invalid. In this case the filesystem
1014 * may not properly clear the dirty bits for the entire page (which
1015 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
1016 * With the page locked we are free to fix-up the dirty bits here.
1017 *
1018 * We do not under any circumstances truncate the valid bits, as
1019 * this will screw up bogus page replacement.
1020 */
1021 if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
1022 if (object->un_pager.vnp.vnp_size > poffset) {
1023 int pgoff;
1024
1025 maxsize = object->un_pager.vnp.vnp_size - poffset;
1026 ncount = btoc(maxsize);
1027 if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
1028 vm_page_clear_dirty(m[ncount - 1], pgoff,
1029 PAGE_SIZE - pgoff);
1030 }
1031 } else {
1032 maxsize = 0;
1033 ncount = 0;
1034 }
1035 if (ncount < count) {
1036 for (i = ncount; i < count; i++) {
1037 rtvals[i] = VM_PAGER_BAD;
1038 }
1039 }
1040 }
1041
1042 /*
1043 * pageouts are already clustered, use IO_ASYNC to force a bawrite()
1044 * rather than a bdwrite() to prevent paging I/O from saturating
1045 * the buffer cache.
1046 */
1047 ioflags = IO_VMIO;
1048 ioflags |= (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ? IO_SYNC: IO_ASYNC;
1049 ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
1050
1051 aiov.iov_base = (caddr_t) 0;
1052 aiov.iov_len = maxsize;
1053 auio.uio_iov = &aiov;
1054 auio.uio_iovcnt = 1;
1055 auio.uio_offset = poffset;
1056 auio.uio_segflg = UIO_NOCOPY;
1057 auio.uio_rw = UIO_WRITE;
1058 auio.uio_resid = maxsize;
1059 auio.uio_td = (struct thread *) 0;
1060 error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
1061 cnt.v_vnodeout++;
1062 cnt.v_vnodepgsout += ncount;
1063
1064 if (error) {
1065 printf("vnode_pager_putpages: I/O error %d\n", error);
1066 }
1067 if (auio.uio_resid) {
1068 printf("vnode_pager_putpages: residual I/O %d at %lu\n",
1069 auio.uio_resid, (u_long)m[0]->pindex);
1070 }
1071 for (i = 0; i < ncount; i++) {
1072 rtvals[i] = VM_PAGER_OK;
1073 }
1074 return rtvals[0];
1075 }
1076
1077 struct vnode *
1078 vnode_pager_lock(object)
1079 vm_object_t object;
1080 {
1081 struct thread *td = curthread; /* XXX */
1082
1083 GIANT_REQUIRED;
1084
1085 for (; object != NULL; object = object->backing_object) {
1086 if (object->type != OBJT_VNODE)
1087 continue;
1088 if (object->flags & OBJ_DEAD) {
1089 return NULL;
1090 }
1091
1092 /* XXX; If object->handle can change, we need to cache it. */
1093 while (vget(object->handle,
1094 LK_NOPAUSE | LK_SHARED | LK_RETRY | LK_CANRECURSE, td)){
1095 if ((object->flags & OBJ_DEAD) || (object->type != OBJT_VNODE))
1096 return NULL;
1097 printf("vnode_pager_lock: retrying\n");
1098 }
1099 return object->handle;
1100 }
1101 return NULL;
1102 }