The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_readahead.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: uvm_readahead.c,v 1.13 2020/05/19 21:45:35 ad Exp $    */
    2 
    3 /*-
    4  * Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 /*
   30  * uvm_object read-ahead
   31  *
   32  * TODO:
   33  *      - tune.
   34  *      - handle multiple streams.
   35  *      - find a better way to deal with PGO_LOCKED pager requests.
   36  *        (currently just ignored)
   37  *      - consider the amount of memory in the system.
   38  *      - consider the speed of the underlying device.
   39  *      - consider filesystem block size / block layout.
   40  */
   41 
   42 #include <sys/cdefs.h>
   43 __KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.13 2020/05/19 21:45:35 ad Exp $");
   44 
   45 #include <sys/param.h>
   46 #include <sys/pool.h>
   47 
   48 #include <uvm/uvm.h>
   49 #include <uvm/uvm_readahead.h>
   50 
   51 #if defined(READAHEAD_DEBUG)
   52 #define DPRINTF(a)      printf a
   53 #else /* defined(READAHEAD_DEBUG) */
   54 #define DPRINTF(a)      /* nothing */
   55 #endif /* defined(READAHEAD_DEBUG) */
   56 
   57 /*
   58  * uvm_ractx: read-ahead context.
   59  */
   60 
   61 struct uvm_ractx {
   62         int ra_flags;
   63 #define RA_VALID        1
   64         off_t ra_winstart;      /* window start offset */
   65         size_t ra_winsize;      /* window size */
   66         off_t ra_next;          /* next offset to read-ahead */
   67 };
   68 
   69 #if defined(sun2) || defined(sun3)
   70 /* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */
   71 #undef MAXPHYS  
   72 #define MAXPHYS         0x8000  /* XXX */
   73 #endif
   74 
   75 #define RA_WINSIZE_INIT MAXPHYS                 /* initial window size */
   76 #define RA_WINSIZE_MAX  (MAXPHYS * 16)          /* max window size */
   77 #define RA_WINSIZE_SEQENTIAL    RA_WINSIZE_MAX  /* fixed window size used for
   78                                                    SEQUENTIAL hint */
   79 #define RA_MINSIZE      (MAXPHYS * 2)           /* min size to start i/o */
   80 #define RA_IOCHUNK      MAXPHYS                 /* read-ahead i/o chunk size */
   81 
   82 static off_t ra_startio(struct uvm_object *, off_t, size_t);
   83 static struct uvm_ractx *ra_allocctx(void);
   84 static void ra_freectx(struct uvm_ractx *);
   85 
   86 static struct pool_cache ractx_cache;
   87 
   88 /*
   89  * uvm_ra_init: initialize readahead module.
   90  */
   91 
   92 void
   93 uvm_ra_init(void)
   94 {
   95 
   96         pool_cache_bootstrap(&ractx_cache, sizeof(struct uvm_ractx), 0, 0, 0,
   97             "ractx", NULL, IPL_NONE, NULL, NULL, NULL);
   98 }
   99 
  100 static struct uvm_ractx *
  101 ra_allocctx(void)
  102 {
  103 
  104         return pool_cache_get(&ractx_cache, PR_NOWAIT);
  105 }
  106 
  107 static void
  108 ra_freectx(struct uvm_ractx *ra)
  109 {
  110 
  111         pool_cache_put(&ractx_cache, ra);
  112 }
  113 
  114 /*
  115  * ra_startio: start i/o for read-ahead.
  116  *
  117  * => start i/o for each RA_IOCHUNK sized chunk.
  118  * => return offset to which we started i/o.
  119  */
  120 
  121 static off_t
  122 ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
  123 {
  124         const off_t endoff = off + sz;
  125 
  126         DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
  127             __func__, uobj, off, endoff));
  128 
  129         KASSERT(rw_write_held(uobj->vmobjlock));
  130 
  131         /*
  132          * Don't issue read-ahead if the last page of the range is already cached.
  133          * The assumption is that since the access is sequential, the intermediate
  134          * pages would have similar LRU stats, and hence likely to be still in cache
  135          * too. This speeds up I/O using cache, since it avoids lookups and temporary
  136          * allocations done by full pgo_get.
  137          */
  138         struct vm_page *pg = uvm_pagelookup(uobj, trunc_page(endoff - 1));
  139         if (pg != NULL) {
  140                 DPRINTF(("%s:  off=%" PRIu64 ", sz=%zu already cached\n",
  141                     __func__, off, sz));
  142                 return endoff;
  143         }
  144 
  145         off = trunc_page(off);
  146         while (off < endoff) {
  147                 const size_t chunksize = RA_IOCHUNK;
  148                 int error;
  149                 size_t donebytes;
  150                 int npages;
  151                 int orignpages;
  152                 size_t bytelen;
  153 
  154                 KASSERT((chunksize & (chunksize - 1)) == 0);
  155                 KASSERT((off & PAGE_MASK) == 0);
  156                 bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
  157                 KASSERT((bytelen & PAGE_MASK) == 0);
  158                 npages = orignpages = bytelen >> PAGE_SHIFT;
  159                 KASSERT(npages != 0);
  160 
  161                 /*
  162                  * use UVM_ADV_RANDOM to avoid recursion.
  163                  */
  164 
  165                 error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
  166                     &npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, PGO_NOTIMESTAMP);
  167                 rw_enter(uobj->vmobjlock, RW_WRITER);
  168                 DPRINTF(("%s:  off=%" PRIu64 ", bytelen=%zu -> %d\n",
  169                     __func__, off, bytelen, error));
  170                 if (error != 0 && error != EBUSY) {
  171                         if (error != EINVAL) { /* maybe past EOF */
  172                                 DPRINTF(("%s: error=%d\n", __func__, error));
  173                         }
  174                         break;
  175                 }
  176                 KASSERT(orignpages == npages);
  177                 donebytes = orignpages << PAGE_SHIFT;
  178                 off += donebytes;
  179         }
  180 
  181         return off;
  182 }
  183 
  184 /* ------------------------------------------------------------ */
  185 
  186 /*
  187  * uvm_ra_allocctx: allocate a context.
  188  */
  189 
  190 struct uvm_ractx *
  191 uvm_ra_allocctx(void)
  192 {
  193         struct uvm_ractx *ra;
  194 
  195         ra = ra_allocctx();
  196         if (ra != NULL) {
  197                 ra->ra_flags = 0;
  198         }
  199 
  200         return ra;
  201 }
  202 
  203 /*
  204  * uvm_ra_freectx: free a context.
  205  */
  206 
  207 void
  208 uvm_ra_freectx(struct uvm_ractx *ra)
  209 {
  210 
  211         KASSERT(ra != NULL);
  212         ra_freectx(ra);
  213 }
  214 
  215 /*
  216  * uvm_ra_request: update a read-ahead context and start i/o if appropriate.
  217  *
  218  * => called when [reqoff, reqoff+reqsize) is requested.
  219  * => object must be locked by caller, will return locked.
  220  */
  221 
  222 void
  223 uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
  224     off_t reqoff, size_t reqsize)
  225 {
  226 
  227         KASSERT(rw_write_held(uobj->vmobjlock));
  228 
  229         if (ra == NULL || advice == UVM_ADV_RANDOM) {
  230                 return;
  231         }
  232 
  233         if (advice == UVM_ADV_SEQUENTIAL) {
  234 
  235                 /*
  236                  * always do read-ahead with a large window.
  237                  */
  238 
  239                 if ((ra->ra_flags & RA_VALID) == 0) {
  240                         ra->ra_winstart = ra->ra_next = 0;
  241                         ra->ra_flags |= RA_VALID;
  242                 }
  243                 if (reqoff < ra->ra_winstart) {
  244                         ra->ra_next = reqoff;
  245                 }
  246                 ra->ra_winsize = RA_WINSIZE_SEQENTIAL;
  247                 goto do_readahead;
  248         }
  249 
  250         /*
  251          * a request with UVM_ADV_NORMAL hint.  (ie. no hint)
  252          *
  253          * we keep a sliding window in order to determine:
  254          *      - if the previous read-ahead was successful or not.
  255          *      - how many bytes to read-ahead.
  256          */
  257 
  258         /*
  259          * if it's the first request for this context,
  260          * initialize context and return.
  261          */
  262 
  263         if ((ra->ra_flags & RA_VALID) == 0) {
  264 initialize:
  265                 ra->ra_winstart = ra->ra_next = reqoff + reqsize;
  266                 ra->ra_winsize = RA_WINSIZE_INIT;
  267                 ra->ra_flags |= RA_VALID;
  268                 goto done;
  269         }
  270 
  271         /*
  272          * if it isn't in our window,
  273          * initialize context and return.
  274          * (read-ahead miss)
  275          */
  276 
  277         if (reqoff < ra->ra_winstart ||
  278             ra->ra_winstart + ra->ra_winsize < reqoff) {
  279 
  280                 /*
  281                  * ... unless we seem to be reading the same chunk repeatedly.
  282                  *
  283                  * XXX should have some margin?
  284                  */
  285 
  286                 if (reqoff + reqsize == ra->ra_winstart) {
  287                         DPRINTF(("%s: %p: same block: off=%" PRIu64
  288                             ", size=%zd, winstart=%" PRIu64 "\n",
  289                             __func__, ra, reqoff, reqsize, ra->ra_winstart));
  290                         goto done;
  291                 }
  292                 goto initialize;
  293         }
  294 
  295         /*
  296          * it's in our window. (read-ahead hit)
  297          *      - start read-ahead i/o if appropriate.
  298          *      - advance and enlarge window.
  299          */
  300 
  301 do_readahead:
  302 
  303         /*
  304          * don't bother to read-ahead behind current request.
  305          */
  306 
  307         if (reqoff > ra->ra_next) {
  308                 ra->ra_next = reqoff;
  309         }
  310 
  311         /*
  312          * try to make [reqoff, reqoff+ra_winsize) in-core.
  313          * note that [reqoff, ra_next) is considered already done.
  314          */
  315 
  316         if (reqoff + ra->ra_winsize > ra->ra_next) {
  317                 off_t raoff = MAX(reqoff, ra->ra_next);
  318                 size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;
  319 
  320 #if defined(DIAGNOSTIC)
  321                 if (rasize > RA_WINSIZE_MAX) {
  322                         printf("%s: corrupted context", __func__);
  323                         rasize = RA_WINSIZE_MAX;
  324                 }
  325 #endif /* defined(DIAGNOSTIC) */
  326 
  327                 /*
  328                  * issue read-ahead only if we can start big enough i/o.
  329                  * otherwise we end up with a stream of small i/o.
  330                  */
  331 
  332                 if (rasize >= RA_MINSIZE) {
  333                         off_t next;
  334 
  335                         next = ra_startio(uobj, raoff, rasize);
  336                         ra->ra_next = next;
  337                 }
  338         }
  339 
  340         /*
  341          * update window.
  342          *
  343          * enlarge window by reqsize, so that it grows in a predictable manner
  344          * regardless of the size of each read(2).
  345          */
  346 
  347         ra->ra_winstart = reqoff + reqsize;
  348         ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);
  349 
  350 done:;
  351 }
  352 
  353 int
  354 uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
  355 {
  356 
  357         /*
  358          * don't allow too much read-ahead.
  359          */
  360         if (size > RA_WINSIZE_MAX) {
  361                 size = RA_WINSIZE_MAX;
  362         }
  363         rw_enter(uobj->vmobjlock, RW_WRITER);
  364         ra_startio(uobj, off, size);
  365         rw_exit(uobj->vmobjlock);
  366         return 0;
  367 }

Cache object: 3a9000c4b686315f472c5d0b29c0af56


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.