FreeBSD/Linux Kernel Cross Reference
sys/vm/vnode_pager.c
/*-
 * Copyright (c) 1990 University of Utah.
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1993, 1994 John S. Dyson
 * Copyright (c) 1995, David Greenman
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91
 */

/*
 * Page to/from files (vnodes).
 */

/*
 * TODO:
 *      Implement VOP_GETPAGES/PUTPAGES interface for filesystems.  Will
 *      greatly re-simplify the vnode_pager.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: releng/6.4/sys/vm/vnode_pager.c 181128 2008-08-01 19:50:07Z jhb $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vmmeter.h>
#include <sys/limits.h>
#include <sys/conf.h>
#include <sys/sf_buf.h>

#include <machine/atomic.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <vm/vm_extern.h>

static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
    daddr_t *rtaddress, int *run);
static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
static void vnode_pager_dealloc(vm_object_t);
static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int);
static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t);

struct pagerops vnodepagerops = {
        .pgo_alloc =    vnode_pager_alloc,
        .pgo_dealloc =  vnode_pager_dealloc,
        .pgo_getpages = vnode_pager_getpages,
        .pgo_putpages = vnode_pager_putpages,
        .pgo_haspage =  vnode_pager_haspage,
};

int vnode_pbuf_freecnt;

/*
 * Compatibility function for RELENG_6, in which vnode_create_vobject()
 * takes file size as size_t due to an oversight.  The type may not just
 * change to off_t because the ABI to 3rd party modules must be preserved
 * for RELENG_6 lifetime.
 */
int
vnode_create_vobject(struct vnode *vp, size_t isize __unused, struct thread *td)
{

        /*
         * Size of 0 will indicate to vnode_create_vobject_off()
         * VOP_GETATTR() is to be called to get the actual size.
         */
        return (vnode_create_vobject_off(vp, 0, td));
}

/*
 * Create the VM system backing object for this vnode -- for RELENG_6 only.
 * In HEAD, vnode_create_vobject() has been fixed to take file size as off_t
 * and so it can be used as is.
 */
int
vnode_create_vobject_off(struct vnode *vp, off_t isize, struct thread *td)
{
        vm_object_t object;
        vm_ooffset_t size = isize;
        struct vattr va;

        if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
                return (0);

        while ((object = vp->v_object) != NULL) {
                VM_OBJECT_LOCK(object);
                if (!(object->flags & OBJ_DEAD)) {
                        VM_OBJECT_UNLOCK(object);
                        return (0);
                }
                VOP_UNLOCK(vp, 0, td);
                vm_object_set_flag(object, OBJ_DISCONNECTWNT);
                msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0);
                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
        }

        if (size == 0) {
                if (vn_isdisk(vp, NULL)) {
                        size = IDX_TO_OFF(INT_MAX);
                } else {
                        if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0)
                                return (0);
                        size = va.va_size;
                }
        }

        object = vnode_pager_alloc(vp, size, 0, 0);
        /*
         * Dereference the reference we just created.  This assumes
         * that the object is associated with the vp.
         */
        VM_OBJECT_LOCK(object);
        object->ref_count--;
        VM_OBJECT_UNLOCK(object);
        vrele(vp);

        KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));

        return (0);
}

void
vnode_destroy_vobject(struct vnode *vp)
{
        struct vm_object *obj;

        obj = vp->v_object;
        if (obj == NULL)
                return;
        ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
        VM_OBJECT_LOCK(obj);
        if (obj->ref_count == 0) {
                /*
                 * vclean() may be called twice.  The first time
                 * removes the primary reference to the object,
                 * the second time goes one further and is a
                 * special-case to terminate the object.
                 *
                 * don't double-terminate the object
                 */
                if ((obj->flags & OBJ_DEAD) == 0)
                        vm_object_terminate(obj);
                else
                        VM_OBJECT_UNLOCK(obj);
        } else {
                /*
                 * Woe to the process that tries to page now :-).
                 */
                vm_pager_deallocate(obj);
                VM_OBJECT_UNLOCK(obj);
        }
        vp->v_object = NULL;
}


/*
 * Allocate (or lookup) pager for a vnode.
 * Handle is a vnode pointer.
 *
 * MPSAFE
 */
vm_object_t
vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t offset)
{
        vm_object_t object;
        struct vnode *vp;

        /*
         * Pageout to vnode, no can do yet.
         */
        if (handle == NULL)
                return (NULL);

        vp = (struct vnode *) handle;

        /*
         * If the object is being terminated, wait for it to
         * go away.
         */
retry:
        while ((object = vp->v_object) != NULL) {
                VM_OBJECT_LOCK(object);
                if ((object->flags & OBJ_DEAD) == 0)
                        break;
                vm_object_set_flag(object, OBJ_DISCONNECTWNT);
                msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0);
        }

        if (vp->v_usecount == 0)
                panic("vnode_pager_alloc: no vnode reference");

        if (object == NULL) {
                /*
                 * Add an object of the appropriate size
                 */
                object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));

                object->un_pager.vnp.vnp_size = size;

                object->handle = handle;
                if (VFS_NEEDSGIANT(vp->v_mount))
                        vm_object_set_flag(object, OBJ_NEEDGIANT);
                VI_LOCK(vp);
                if (vp->v_object != NULL) {
                        /*
                         * Object has been created while we were sleeping
                         */
                        VI_UNLOCK(vp);
                        vm_object_destroy(object);
                        goto retry;
                }
                vp->v_object = object;
                VI_UNLOCK(vp);
        } else {
                object->ref_count++;
                VM_OBJECT_UNLOCK(object);
        }
        vref(vp);
        return (object);
}

/*
 * The object must be locked.
 */
static void
vnode_pager_dealloc(object)
        vm_object_t object;
{
        struct vnode *vp = object->handle;

        if (vp == NULL)
                panic("vnode_pager_dealloc: pager already dealloced");

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        vm_object_pip_wait(object, "vnpdea");

        object->handle = NULL;
        object->type = OBJT_DEAD;
        if (object->flags & OBJ_DISCONNECTWNT) {
                vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
                wakeup(object);
        }
        ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
        vp->v_object = NULL;
        vp->v_vflag &= ~VV_TEXT;
}
static boolean_t
vnode_pager_haspage(object, pindex, before, after)
        vm_object_t object;
        vm_pindex_t pindex;
        int *before;
        int *after;
{
        struct vnode *vp = object->handle;
        daddr_t bn;
        int err;
        daddr_t reqblock;
        int poff;
        int bsize;
        int pagesperblock, blocksperpage;
        int vfslocked;

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        /*
         * If no vp or vp is doomed or marked transparent to VM, we do not
         * have the page.
         */
        if (vp == NULL || vp->v_iflag & VI_DOOMED)
                return FALSE;
        /*
         * If the offset is beyond end of file we do
         * not have the page.
         */
        if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
                return FALSE;

        bsize = vp->v_mount->mnt_stat.f_iosize;
        pagesperblock = bsize / PAGE_SIZE;
        blocksperpage = 0;
        if (pagesperblock > 0) {
                reqblock = pindex / pagesperblock;
        } else {
                blocksperpage = (PAGE_SIZE / bsize);
                reqblock = pindex * blocksperpage;
        }
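        /*
         * Worked example, assuming 4K pages: with an 8K-block filesystem,
         * pagesperblock = 2 and page index 5 lives in fs block 5 / 2 = 2;
         * with 1K blocks, blocksperpage = 4 and page index 5 begins at
         * fs block 5 * 4 = 20.
         */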
        VM_OBJECT_UNLOCK(object);
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
        VFS_UNLOCK_GIANT(vfslocked);
        VM_OBJECT_LOCK(object);
        if (err)
                return TRUE;
        if (bn == -1)
                return FALSE;
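        /*
         * VOP_BMAP() reported the before/after contiguity in fs blocks;
         * rescale those counts into pages and account for the requested
         * page's offset within its block.
         */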
        if (pagesperblock > 0) {
                poff = pindex - (reqblock * pagesperblock);
                if (before) {
                        *before *= pagesperblock;
                        *before += poff;
                }
                if (after) {
                        int numafter;
                        *after *= pagesperblock;
                        numafter = pagesperblock - (poff + 1);
                        if (IDX_TO_OFF(pindex + numafter) >
                            object->un_pager.vnp.vnp_size) {
                                numafter =
                                    OFF_TO_IDX(object->un_pager.vnp.vnp_size) -
                                    pindex;
                        }
                        *after += numafter;
                }
        } else {
                if (before) {
                        *before /= blocksperpage;
                }

                if (after) {
                        *after /= blocksperpage;
                }
        }
        return TRUE;
}

/*
 * Lets the VM system know about a change in size for a file.
 * We adjust our own internal size and flush any cached pages in
 * the associated object that are affected by the size change.
 *
 * Note: this routine may be invoked as a result of a pager put
 * operation (possibly at object termination time), so we must be careful.
 */
void
vnode_pager_setsize(vp, nsize)
        struct vnode *vp;
        vm_ooffset_t nsize;
{
        vm_object_t object;
        vm_page_t m;
        vm_pindex_t nobjsize;

        if ((object = vp->v_object) == NULL)
                return;
        VM_OBJECT_LOCK(object);
        if (nsize == object->un_pager.vnp.vnp_size) {
                /*
                 * Hasn't changed size
                 */
                VM_OBJECT_UNLOCK(object);
                return;
        }
        nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
        if (nsize < object->un_pager.vnp.vnp_size) {
                /*
                 * File has shrunk.  Toss any cached pages beyond the new EOF.
                 */
                if (nobjsize < object->size)
                        vm_object_page_remove(object, nobjsize, object->size,
                            FALSE);
                /*
                 * this gets rid of garbage at the end of a page that is now
                 * only partially backed by the vnode.
                 *
                 * XXX for some reason (I don't know yet), if we take a
                 * completely invalid page and mark it partially valid
                 * it can screw up NFS reads, so we don't allow the case.
                 */
                if ((nsize & PAGE_MASK) &&
                    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
                    m->valid != 0) {
                        int base = (int)nsize & PAGE_MASK;
                        int size = PAGE_SIZE - base;

                        /*
                         * Clear out partial-page garbage in case
                         * the page has been mapped.
                         */
                        pmap_zero_page_area(m, base, size);

                        /*
                         * XXX work around SMP data integrity race
                         * by unmapping the page from user processes.
                         * The garbage we just cleared may be mapped
                         * to a user process running on another cpu
                         * and this code is not running through normal
                         * I/O channels which handle SMP issues for
                         * us, so unmap page to synchronize all cpus.
                         *
                         * XXX should vm_pager_unmap_page() have
                         * dealt with this?
                         */
                        vm_page_lock_queues();
                        pmap_remove_all(m);

                        /*
                         * Clear out partial-page dirty bits.  This
                         * has the side effect of setting the valid
                         * bits, but that is ok.  There are a bunch
                         * of places in the VM system where we expected
                         * m->dirty == VM_PAGE_BITS_ALL.  The file EOF
                         * case is one of them.  If the page is still
                         * partially dirty, make it fully dirty.
                         *
                         * note that we do not clear out the valid
                         * bits.  This would prevent bogus_page
                         * replacement from working properly.
                         */
                        vm_page_set_validclean(m, base, size);
                        if (m->dirty != 0)
                                m->dirty = VM_PAGE_BITS_ALL;
                        vm_page_unlock_queues();
                }
        }
        object->un_pager.vnp.vnp_size = nsize;
        object->size = nobjsize;
        VM_OBJECT_UNLOCK(object);
}

/*
 * calculate the linear (byte) disk address of specified virtual
 * file address
 */
static int
vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
    int *run)
{
        int bsize;
        int err;
        daddr_t vblock;
        daddr_t voffset;

        if (address < 0)
                return -1;

        if (vp->v_iflag & VI_DOOMED)
                return -1;

        bsize = vp->v_mount->mnt_stat.f_iosize;
        vblock = address / bsize;
        voffset = address % bsize;

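        /*
         * VOP_BMAP() maps the logical fs block to a physical block number
         * in DEV_BSIZE (512-byte) units, so the byte offset within the
         * block is added back as voffset / DEV_BSIZE.  The optional run
         * count comes back in fs blocks beyond the mapped one; it is
         * rescaled to pages below: +1 counts the mapped block itself,
         * *bsize/PAGE_SIZE converts blocks to pages, and -voffset/PAGE_SIZE
         * discards the pages that precede the requested address.
         */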
        err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
        if (err == 0) {
                if (*rtaddress != -1)
                        *rtaddress += voffset / DEV_BSIZE;
                if (run) {
                        *run += 1;
                        *run *= bsize/PAGE_SIZE;
                        *run -= voffset/PAGE_SIZE;
                }
        }

        return (err);
}

/*
 * small block filesystem vnode pager input
 */
static int
vnode_pager_input_smlfs(object, m)
        vm_object_t object;
        vm_page_t m;
{
        int i;
        struct vnode *vp;
        struct bufobj *bo;
        struct buf *bp;
        struct sf_buf *sf;
        daddr_t fileaddr;
        vm_offset_t bsize;
        int error = 0;

        vp = object->handle;
        if (vp->v_iflag & VI_DOOMED)
                return VM_PAGER_BAD;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);

        sf = sf_buf_alloc(m, 0);

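        /*
         * Fill the page one fs block at a time: pieces that are already
         * valid are skipped, holes (fileaddr == -1) are zero-filled, and
         * everything else is read synchronously through a pbuf.
         */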
        for (i = 0; i < PAGE_SIZE / bsize; i++) {
                vm_ooffset_t address;

                if (vm_page_bits(i * bsize, bsize) & m->valid)
                        continue;

                address = IDX_TO_OFF(m->pindex) + i * bsize;
                if (address >= object->un_pager.vnp.vnp_size) {
                        fileaddr = -1;
                } else {
                        error = vnode_pager_addr(vp, address, &fileaddr, NULL);
                        if (error)
                                break;
                }
                if (fileaddr != -1) {
                        bp = getpbuf(&vnode_pbuf_freecnt);

                        /* build a minimal buffer header */
                        bp->b_iocmd = BIO_READ;
                        bp->b_iodone = bdone;
                        KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
                        KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
                        bp->b_rcred = crhold(curthread->td_ucred);
                        bp->b_wcred = crhold(curthread->td_ucred);
                        bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
                        bp->b_blkno = fileaddr;
                        pbgetbo(bo, bp);
                        bp->b_bcount = bsize;
                        bp->b_bufsize = bsize;
                        bp->b_runningbufspace = bp->b_bufsize;
                        atomic_add_int(&runningbufspace, bp->b_runningbufspace);

                        /* do the input */
                        bp->b_iooffset = dbtob(bp->b_blkno);
                        bstrategy(bp);

                        bwait(bp, PVM, "vnsrd");

                        if ((bp->b_ioflags & BIO_ERROR) != 0)
                                error = EIO;

                        /*
                         * free the buffer header back to the swap buffer pool
                         */
                        pbrelbo(bp);
                        relpbuf(bp, &vnode_pbuf_freecnt);
                        if (error)
                                break;

                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                } else {
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
                }
        }
        sf_buf_free(sf);
        vm_page_lock_queues();
        pmap_clear_modify(m);
        vm_page_unlock_queues();
        if (error) {
                return VM_PAGER_ERROR;
        }
        return VM_PAGER_OK;

}


/*
 * old style vnode pager input routine
 */
static int
vnode_pager_input_old(object, m)
        vm_object_t object;
        vm_page_t m;
{
        struct uio auio;
        struct iovec aiov;
        int error;
        int size;
        struct sf_buf *sf;
        struct vnode *vp;

        VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
        error = 0;

        /*
         * Return failure if beyond current EOF
         */
        if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) {
                return VM_PAGER_BAD;
        } else {
                size = PAGE_SIZE;
                if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
                        size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
                vp = object->handle;
                VM_OBJECT_UNLOCK(object);

                /*
                 * Allocate a kernel virtual address and initialize so that
                 * we can use VOP_READ/WRITE routines.
                 */
                sf = sf_buf_alloc(m, 0);

                aiov.iov_base = (caddr_t)sf_buf_kva(sf);
                aiov.iov_len = size;
                auio.uio_iov = &aiov;
                auio.uio_iovcnt = 1;
                auio.uio_offset = IDX_TO_OFF(m->pindex);
                auio.uio_segflg = UIO_SYSSPACE;
                auio.uio_rw = UIO_READ;
                auio.uio_resid = size;
                auio.uio_td = curthread;

                error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
                if (!error) {
                        int count = size - auio.uio_resid;

                        if (count == 0)
                                error = EINVAL;
                        else if (count != PAGE_SIZE)
                                bzero((caddr_t)sf_buf_kva(sf) + count,
                                    PAGE_SIZE - count);
                }
                sf_buf_free(sf);

                VM_OBJECT_LOCK(object);
        }
        vm_page_lock_queues();
        pmap_clear_modify(m);
        vm_page_undirty(m);
        vm_page_unlock_queues();
        if (!error)
                m->valid = VM_PAGE_BITS_ALL;
        return error ? VM_PAGER_ERROR : VM_PAGER_OK;
}

/*
 * generic vnode pager input routine
 */

/*
 * Local media VFS's that do not implement their own VOP_GETPAGES
 * should have their VOP_GETPAGES call vnode_pager_generic_getpages()
 * to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_GETPAGES.
 */
static int
vnode_pager_getpages(object, m, count, reqpage)
        vm_object_t object;
        vm_page_t *m;
        int count;
        int reqpage;
{
        int rtval;
        struct vnode *vp;
        int bytes = count * PAGE_SIZE;
        int vfslocked;

        vp = object->handle;
        VM_OBJECT_UNLOCK(object);
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0);
        KASSERT(rtval != EOPNOTSUPP,
            ("vnode_pager: FS getpages not implemented\n"));
        VFS_UNLOCK_GIANT(vfslocked);
        VM_OBJECT_LOCK(object);
        return rtval;
}

/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_GETPAGES.
 */
int
vnode_pager_generic_getpages(vp, m, bytecount, reqpage)
        struct vnode *vp;
        vm_page_t *m;
        int bytecount;
        int reqpage;
{
        vm_object_t object;
        vm_offset_t kva;
        off_t foff, tfoff, nextoff;
        int i, j, size, bsize, first;
        daddr_t firstaddr, reqblock;
        struct bufobj *bo;
        int runpg;
        int runend;
        struct buf *bp;
        int count;
        int error = 0;

        object = vp->v_object;
        count = bytecount / PAGE_SIZE;

        KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
            ("vnode_pager_generic_getpages does not support devices"));
        if (vp->v_iflag & VI_DOOMED)
                return VM_PAGER_BAD;

        bsize = vp->v_mount->mnt_stat.f_iosize;

        /* get the UNDERLYING device for the file with VOP_BMAP() */

        /*
         * originally, we did not check for an error return value -- assuming
         * an fs always has a bmap entry point -- that assumption is wrong!!!
         */
        foff = IDX_TO_OFF(m[reqpage]->pindex);

        /*
         * if we can't bmap, use old VOP code
         */
        if (VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL)) {
                VM_OBJECT_LOCK(object);
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                cnt.v_vnodein++;
                cnt.v_vnodepgsin++;
                error = vnode_pager_input_old(object, m[reqpage]);
                VM_OBJECT_UNLOCK(object);
                return (error);

        /*
         * if the blocksize is smaller than a page size, then use
         * special small filesystem code.  NFS sometimes has a small
         * blocksize, but it can handle large reads itself.
         */
        } else if ((PAGE_SIZE / bsize) > 1 &&
            (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) {
                VM_OBJECT_LOCK(object);
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                cnt.v_vnodein++;
                cnt.v_vnodepgsin++;
                return vnode_pager_input_smlfs(object, m[reqpage]);
        }

        /*
         * If we have a completely valid page available to us, we can
         * clean up and return.  Otherwise we have to re-read the
         * media.
         */
        VM_OBJECT_LOCK(object);
        if (m[reqpage]->valid == VM_PAGE_BITS_ALL) {
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                return VM_PAGER_OK;
        } else if (reqblock == -1) {
                pmap_zero_page(m[reqpage]);
                vm_page_undirty(m[reqpage]);
                m[reqpage]->valid = VM_PAGE_BITS_ALL;
                vm_page_lock_queues();
                for (i = 0; i < count; i++)
                        if (i != reqpage)
                                vm_page_free(m[i]);
                vm_page_unlock_queues();
                VM_OBJECT_UNLOCK(object);
                return (VM_PAGER_OK);
        }
        m[reqpage]->valid = 0;
        VM_OBJECT_UNLOCK(object);

        /*
         * here on direct device I/O
         */
        firstaddr = -1;

        /*
         * calculate the run that includes the required page
         */
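        /*
         * Each pass maps m[i] with vnode_pager_addr() and learns how many
         * contiguous pages (runpg) the disk run covers.  Runs that end at
         * or before reqpage are freed and skipped; the run that contains
         * reqpage becomes the transfer, trimmed so it does not extend past
         * the pages supplied in m[], and the loop exits.
         */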
        for (first = 0, i = 0; i < count; i = runend) {
                if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr,
                    &runpg) != 0) {
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        for (; i < count; i++)
                                if (i != reqpage)
                                        vm_page_free(m[i]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        return (VM_PAGER_ERROR);
                }
                if (firstaddr == -1) {
                        VM_OBJECT_LOCK(object);
                        if (i == reqpage && foff < object->un_pager.vnp.vnp_size) {
                                panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx",
                                    (intmax_t)firstaddr, (uintmax_t)(foff >> 32),
                                    (uintmax_t)foff,
                                    (uintmax_t)
                                    (object->un_pager.vnp.vnp_size >> 32),
                                    (uintmax_t)object->un_pager.vnp.vnp_size);
                        }
                        vm_page_lock_queues();
                        vm_page_free(m[i]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                        runend = i + 1;
                        first = runend;
                        continue;
                }
                runend = i + runpg;
                if (runend <= reqpage) {
                        VM_OBJECT_LOCK(object);
                        vm_page_lock_queues();
                        for (j = i; j < runend; j++)
                                vm_page_free(m[j]);
                        vm_page_unlock_queues();
                        VM_OBJECT_UNLOCK(object);
                } else {
                        if (runpg < (count - first)) {
                                VM_OBJECT_LOCK(object);
                                vm_page_lock_queues();
                                for (i = first + runpg; i < count; i++)
                                        vm_page_free(m[i]);
                                vm_page_unlock_queues();
                                VM_OBJECT_UNLOCK(object);
                                count = first + runpg;
                        }
                        break;
                }
                first = runend;
        }

        /*
         * the first and last page have been calculated now, move input pages
         * to be zero based...
         */
        if (first != 0) {
                m += first;
                count -= first;
                reqpage -= first;
        }

        /*
         * calculate the file virtual address for the transfer
         */
        foff = IDX_TO_OFF(m[0]->pindex);

        /*
         * calculate the size of the transfer
         */
        size = count * PAGE_SIZE;
        KASSERT(count > 0, ("zero count"));
        if ((foff + size) > object->un_pager.vnp.vnp_size)
                size = object->un_pager.vnp.vnp_size - foff;
        KASSERT(size > 0, ("zero size"));

        /*
         * round up physical size for real devices.
         */
        if (1) {
                int secmask = bo->bo_bsize - 1;
                KASSERT(secmask < PAGE_SIZE && secmask > 0,
                    ("vnode_pager_generic_getpages: sector size %d too large",
                    secmask + 1));
                size = (size + secmask) & ~secmask;
        }
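        /*
         * E.g., a 5000-byte transfer on a device with 512-byte sectors
         * gives secmask = 511 and rounds up to 5120 bytes; the remainder
         * of the final page beyond the rounded size is zeroed after the
         * read completes.
         */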

        bp = getpbuf(&vnode_pbuf_freecnt);
        kva = (vm_offset_t) bp->b_data;

        /*
         * and map the pages to be read into the kva
         */
        pmap_qenter(kva, m, count);
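        /*
         * pmap_qenter() maps the physical pages into the pbuf's contiguous
         * KVA window so the whole run can be read with a single buffer
         * I/O; the mapping is torn down with pmap_qremove() below.
         */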

        /* build a minimal buffer header */
        bp->b_iocmd = BIO_READ;
        bp->b_iodone = bdone;
        KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
        KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
        bp->b_rcred = crhold(curthread->td_ucred);
        bp->b_wcred = crhold(curthread->td_ucred);
        bp->b_blkno = firstaddr;
        pbgetbo(bo, bp);
        bp->b_bcount = size;
        bp->b_bufsize = size;
        bp->b_runningbufspace = bp->b_bufsize;
        atomic_add_int(&runningbufspace, bp->b_runningbufspace);

        cnt.v_vnodein++;
        cnt.v_vnodepgsin += count;

        /* do the input */
        bp->b_iooffset = dbtob(bp->b_blkno);
        bstrategy(bp);

        bwait(bp, PVM, "vnread");

        if ((bp->b_ioflags & BIO_ERROR) != 0)
                error = EIO;

        if (!error) {
                if (size != count * PAGE_SIZE)
                        bzero((caddr_t) kva + size, PAGE_SIZE * count - size);
        }
        pmap_qremove(kva, count);

        /*
         * free the buffer header back to the swap buffer pool
         */
        pbrelbo(bp);
        relpbuf(bp, &vnode_pbuf_freecnt);

        VM_OBJECT_LOCK(object);
        vm_page_lock_queues();
        for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) {
                vm_page_t mt;

                nextoff = tfoff + PAGE_SIZE;
                mt = m[i];

                if (nextoff <= object->un_pager.vnp.vnp_size) {
                        /*
                         * Read filled up entire page.
                         */
                        mt->valid = VM_PAGE_BITS_ALL;
                        vm_page_undirty(mt);    /* should be an assert? XXX */
                        pmap_clear_modify(mt);
                } else {
                        /*
                         * Read did not fill up entire page.  Since this
                         * is getpages, the page may be mapped, so we have
                         * to zero the invalid portions of the page even
                         * though we aren't setting them valid.
                         *
                         * Currently we do not set the entire page valid,
                         * we just try to clear the piece that we couldn't
                         * read.
                         */
                        vm_page_set_validclean(mt, 0,
                            object->un_pager.vnp.vnp_size - tfoff);
                        /* handled by vm_fault now */
                        /* vm_page_zero_invalid(mt, FALSE); */
                }

                if (i != reqpage) {

                        /*
                         * whether or not to leave the page activated is up in
                         * the air, but we should put the page on a page queue
                         * somewhere. (it already is in the object).  Result:
                         * It appears that empirical results show that
                         * deactivating pages is best.
                         */

                        /*
                         * just in case someone was asking for this page we
                         * now tell them that it is ok to use
                         */
                        if (!error) {
                                if (mt->flags & PG_WANTED)
                                        vm_page_activate(mt);
                                else
                                        vm_page_deactivate(mt);
                                vm_page_wakeup(mt);
                        } else {
                                vm_page_free(mt);
                        }
                }
        }
        vm_page_unlock_queues();
        VM_OBJECT_UNLOCK(object);
        if (error) {
                printf("vnode_pager_getpages: I/O read error\n");
        }
        return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
}

/*
 * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
 * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call
 * vnode_pager_generic_putpages() to implement the previous behaviour.
 *
 * All other FS's should use the bypass to get to the local media
 * backing vp's VOP_PUTPAGES.
 */
static void
vnode_pager_putpages(object, m, count, sync, rtvals)
        vm_object_t object;
        vm_page_t *m;
        int count;
        boolean_t sync;
        int *rtvals;
{
        int rtval;
        struct vnode *vp;
        struct mount *mp;
        int bytes = count * PAGE_SIZE;

        /*
         * Force synchronous operation if we are extremely low on memory
         * to prevent a low-memory deadlock.  VOP operations often need to
         * allocate more memory to initiate the I/O ( i.e. do a BMAP
         * operation ).  The swapper handles the case by limiting the amount
         * of asynchronous I/O, but that sort of solution doesn't scale well
         * for the vnode pager without a lot of work.
         *
         * Also, the backing vnode's iodone routine may not wake the pageout
         * daemon up.  This should probably be addressed XXX.
         */

        if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)
                sync |= OBJPC_SYNC;

        /*
         * Call device-specific putpages function
         */
        vp = object->handle;
        VM_OBJECT_UNLOCK(object);
        if (vp->v_type != VREG)
                mp = NULL;
        rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0);
        KASSERT(rtval != EOPNOTSUPP,
            ("vnode_pager: stale FS putpages\n"));
        VM_OBJECT_LOCK(object);
}


/*
 * This is now called from local media FS's to operate against their
 * own vnodes if they fail to implement VOP_PUTPAGES.
 *
 * This is typically called indirectly via the pageout daemon and
 * clustering has already typically occurred, so in general we ask the
 * underlying filesystem to write the data out asynchronously rather
 * than delayed.
 */
int
vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals)
        struct vnode *vp;
        vm_page_t *m;
        int bytecount;
        int flags;
        int *rtvals;
{
        int i;
        vm_object_t object;
        int count;

        int maxsize, ncount;
        vm_ooffset_t poffset;
        struct uio auio;
        struct iovec aiov;
        int error;
        int ioflags;
        int ppscheck = 0;
        static struct timeval lastfail;
        static int curfail;

        object = vp->v_object;
        count = bytecount / PAGE_SIZE;

        for (i = 0; i < count; i++)
                rtvals[i] = VM_PAGER_AGAIN;

        if ((int64_t)m[0]->pindex < 0) {
                printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
                    (long)m[0]->pindex, (u_long)m[0]->dirty);
                rtvals[0] = VM_PAGER_BAD;
                return VM_PAGER_BAD;
        }

        maxsize = count * PAGE_SIZE;
        ncount = count;

        poffset = IDX_TO_OFF(m[0]->pindex);

        /*
         * If the page-aligned write is larger than the actual file we
         * have to invalidate pages occurring beyond the file EOF.  However,
         * there is an edge case where a file may not be page-aligned where
         * the last page is partially invalid.  In this case the filesystem
         * may not properly clear the dirty bits for the entire page (which
         * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
         * With the page locked we are free to fix-up the dirty bits here.
         *
         * We do not under any circumstances truncate the valid bits, as
         * this will screw up bogus page replacement.
         */
        if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
                if (object->un_pager.vnp.vnp_size > poffset) {
                        int pgoff;

                        maxsize = object->un_pager.vnp.vnp_size - poffset;
                        ncount = btoc(maxsize);
                        if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
                                vm_page_lock_queues();
                                vm_page_clear_dirty(m[ncount - 1], pgoff,
                                    PAGE_SIZE - pgoff);
                                vm_page_unlock_queues();
                        }
                } else {
                        maxsize = 0;
                        ncount = 0;
                }
                if (ncount < count) {
                        for (i = ncount; i < count; i++) {
                                rtvals[i] = VM_PAGER_BAD;
                        }
                }
        }

        /*
         * pageouts are already clustered, use IO_ASYNC to force a bawrite()
         * rather than a bdwrite() to prevent paging I/O from saturating
         * the buffer cache.  Dummy-up the sequential heuristic to cause
         * large ranges to cluster.  If neither IO_SYNC nor IO_ASYNC is set,
         * the system decides how to cluster.
         */
        ioflags = IO_VMIO;
        if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL))
                ioflags |= IO_SYNC;
        else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
                ioflags |= IO_ASYNC;
        ioflags |= (flags & VM_PAGER_PUT_INVAL) ? IO_INVAL: 0;
        ioflags |= IO_SEQMAX << IO_SEQSHIFT;

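        /*
         * UIO_NOCOPY tells VOP_WRITE to take the data directly from the
         * backing VM pages rather than copying from a user or kernel
         * buffer, which is why iov_base can be left NULL here.
         */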
        aiov.iov_base = (caddr_t) 0;
        aiov.iov_len = maxsize;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_offset = poffset;
        auio.uio_segflg = UIO_NOCOPY;
        auio.uio_rw = UIO_WRITE;
        auio.uio_resid = maxsize;
        auio.uio_td = (struct thread *) 0;
        error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
        cnt.v_vnodeout++;
        cnt.v_vnodepgsout += ncount;

        if (error) {
                if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1)))
                        printf("vnode_pager_putpages: I/O error %d\n", error);
        }
        if (auio.uio_resid) {
                if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
                        printf("vnode_pager_putpages: residual I/O %d at %lu\n",
                            auio.uio_resid, (u_long)m[0]->pindex);
        }
        for (i = 0; i < ncount; i++) {
                rtvals[i] = VM_PAGER_OK;
        }
        return rtvals[0];
}

struct vnode *
vnode_pager_lock(vm_object_t first_object)
{
        struct vnode *vp;
        vm_object_t backing_object, object;

        VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
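        /*
         * Walk down the backing-object (shadow) chain with hand-over-hand
         * locking until a vnode-backed object is found, then vget() its
         * vnode; if vget() fails, the object is revalidated and the
         * attempt is retried.
         */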
        for (object = first_object; object != NULL; object = backing_object) {
                if (object->type != OBJT_VNODE) {
                        if ((backing_object = object->backing_object) != NULL)
                                VM_OBJECT_LOCK(backing_object);
                        if (object != first_object)
                                VM_OBJECT_UNLOCK(object);
                        continue;
                }
        retry:
                if (object->flags & OBJ_DEAD) {
                        if (object != first_object)
                                VM_OBJECT_UNLOCK(object);
                        return NULL;
                }
                vp = object->handle;
                VI_LOCK(vp);
                VM_OBJECT_UNLOCK(object);
                if (first_object != object)
                        VM_OBJECT_UNLOCK(first_object);
                VFS_ASSERT_GIANT(vp->v_mount);
                if (vget(vp, LK_CANRECURSE | LK_INTERLOCK |
                    LK_RETRY | LK_SHARED, curthread)) {
                        VM_OBJECT_LOCK(first_object);
                        if (object != first_object)
                                VM_OBJECT_LOCK(object);
                        if (object->type != OBJT_VNODE) {
                                if (object != first_object)
                                        VM_OBJECT_UNLOCK(object);
                                return NULL;
                        }
                        printf("vnode_pager_lock: retrying\n");
                        goto retry;
                }
                VM_OBJECT_LOCK(first_object);
                return (vp);
        }
        return NULL;
}