FreeBSD/Linux Kernel Cross Reference
sys/mm/filemap.c

    1 /*
    2  *      linux/mm/filemap.c
    3  *
    4  * Copyright (C) 1994-1999  Linus Torvalds
    5  */
    6 
    7 /*
    8  * This file handles the generic file mmap semantics used by
    9  * most "normal" filesystems (but you don't /have/ to use this:
   10  * the NFS filesystem used to do this differently, for example)
   11  */
   12 #include <linux/module.h>
   13 #include <linux/slab.h>
   14 #include <linux/shm.h>
   15 #include <linux/mman.h>
   16 #include <linux/locks.h>
   17 #include <linux/pagemap.h>
   18 #include <linux/swap.h>
   19 #include <linux/smp_lock.h>
   20 #include <linux/blkdev.h>
   21 #include <linux/file.h>
   22 #include <linux/swapctl.h>
   23 #include <linux/init.h>
   24 #include <linux/mm.h>
   25 #include <linux/iobuf.h>
   26 
   27 #include <asm/pgalloc.h>
   28 #include <asm/uaccess.h>
   29 #include <asm/mman.h>
   30 
   31 #include <linux/highmem.h>
   32 
   33 /*
   34  * Shared mappings implemented 30.11.1994. It's not fully working yet,
   35  * though.
   36  *
   37  * Shared mappings now work. 15.8.1995  Bruno.
   38  *
   39  * finished 'unifying' the page and buffer cache and SMP-threaded the
   40  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
   41  *
   42  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
   43  */
   44 
   45 atomic_t page_cache_size = ATOMIC_INIT(0);
   46 unsigned int page_hash_bits;
   47 struct page **page_hash_table;
   48 
   49 int vm_max_readahead = 31;
   50 int vm_min_readahead = 3;
   51 EXPORT_SYMBOL(vm_max_readahead);
   52 EXPORT_SYMBOL(vm_min_readahead);
   53 
   54 
   55 spinlock_cacheline_t pagecache_lock_cacheline  = {SPIN_LOCK_UNLOCKED};
   56 /*
   57  * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock 
   58  *      with the pagecache_lock held.
   59  *
   60  * Ordering:
   61  *      swap_lock ->
   62  *              pagemap_lru_lock ->
   63  *                      pagecache_lock
   64  */
   65 spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
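
/*
 * Editorial note (not part of the original source): a minimal sketch of the
 * ordering rule above.  Any path that needs both locks must nest them the
 * way invalidate_inode_pages() does later in this file:
 *
 *      spin_lock(&pagemap_lru_lock);
 *      spin_lock(&pagecache_lock);
 *      ... work on both the LRU and the page-cache lists ...
 *      spin_unlock(&pagecache_lock);
 *      spin_unlock(&pagemap_lru_lock);
 *
 * Acquiring pagemap_lru_lock while already holding pagecache_lock could
 * deadlock against this path.
 */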
   66 
   67 #define CLUSTER_PAGES           (1 << page_cluster)
   68 #define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
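
/*
 * Editorial example (not part of the original source): assuming 4K pages and
 * a page_cluster value of 4, CLUSTER_PAGES is 16 pages (the 64k-aligned
 * block described at read_cluster_nonblocking() below) and CLUSTER_OFFSET()
 * rounds an index down to a 16-page boundary:
 *
 *      CLUSTER_OFFSET(37) == (37 >> 4) << 4 == 32
 *
 * so a read of index 37 would start its cluster at index 32.
 */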
   69 
   70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
   71 static void add_page_to_hash_queue(struct page * page, struct page **p)
   72 {
   73         struct page *next = *p;
   74 
   75         *p = page;
   76         page->next_hash = next;
   77         page->pprev_hash = p;
   78         if (next)
   79                 next->pprev_hash = &page->next_hash;
   80         if (page->buffers)
   81                 PAGE_BUG(page);
   82         atomic_inc(&page_cache_size);
   83 }
   84 
   85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
   86 {
   87         struct list_head *head = &mapping->clean_pages;
   88 
   89         mapping->nrpages++;
   90         list_add(&page->list, head);
   91         page->mapping = mapping;
   92 }
   93 
   94 static inline void remove_page_from_inode_queue(struct page * page)
   95 {
   96         struct address_space * mapping = page->mapping;
   97 
   98         if (mapping->a_ops->removepage)
   99                 mapping->a_ops->removepage(page);
  100         
  101         list_del(&page->list);
  102         page->mapping = NULL;
  103         wmb();
  104         mapping->nrpages--;
  105 }
  106 
  107 static inline void remove_page_from_hash_queue(struct page * page)
  108 {
  109         struct page *next = page->next_hash;
  110         struct page **pprev = page->pprev_hash;
  111 
  112         if (next)
  113                 next->pprev_hash = pprev;
  114         *pprev = next;
  115         page->pprev_hash = NULL;
  116         atomic_dec(&page_cache_size);
  117 }
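
/*
 * Editorial note (not part of the original source): next_hash/pprev_hash form
 * the usual "pointer to the previous next pointer" singly linked list, so a
 * page can be unhashed without walking its bucket.  For a bucket
 *
 *      *p -> A -> B -> C
 *
 * B->pprev_hash points at A->next_hash (or at the bucket slot itself when
 * B is first), which is why "*pprev = next" above unlinks B in O(1).
 */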
  118 
  119 /*
  120  * Remove a page from the page cache and free it. Caller has to make
  121  * sure the page is locked and that nobody else uses it - or that usage
  122  * is safe.
  123  */
  124 void __remove_inode_page(struct page *page)
  125 {
  126         remove_page_from_inode_queue(page);
  127         remove_page_from_hash_queue(page);
  128 }
  129 
  130 void remove_inode_page(struct page *page)
  131 {
  132         if (!PageLocked(page))
  133                 PAGE_BUG(page);
  134 
  135         spin_lock(&pagecache_lock);
  136         __remove_inode_page(page);
  137         spin_unlock(&pagecache_lock);
  138 }
  139 
  140 static inline int sync_page(struct page *page)
  141 {
  142         struct address_space *mapping = page->mapping;
  143 
  144         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
  145                 return mapping->a_ops->sync_page(page);
  146         return 0;
  147 }
  148 
  149 /*
  150  * Add a page to the dirty page list.
  151  */
  152 void set_page_dirty(struct page *page)
  153 {
  154         if (!test_and_set_bit(PG_dirty, &page->flags)) {
  155                 struct address_space *mapping = page->mapping;
  156 
  157                 if (mapping) {
  158                         spin_lock(&pagecache_lock);
  159                         mapping = page->mapping;
  160                         if (mapping) {  /* may have been truncated */
  161                                 list_del(&page->list);
  162                                 list_add(&page->list, &mapping->dirty_pages);
  163                         }
  164                         spin_unlock(&pagecache_lock);
  165 
  166                         if (mapping && mapping->host)
  167                                 mark_inode_dirty_pages(mapping->host);
  168                 }
  169         }
  170 }
  171 
  172 /**
  173  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
  174  * @inode: the inode whose pages we want to invalidate
  175  *
  176  * This function only removes the unlocked pages; if you want to
  177  * remove all the pages of one inode, you must call truncate_inode_pages.
  178  */
  179 
  180 void invalidate_inode_pages(struct inode * inode)
  181 {
  182         struct list_head *head, *curr;
  183         struct page * page;
  184 
  185         head = &inode->i_mapping->clean_pages;
  186 
  187         spin_lock(&pagemap_lru_lock);
  188         spin_lock(&pagecache_lock);
  189         curr = head->next;
  190 
  191         while (curr != head) {
  192                 page = list_entry(curr, struct page, list);
  193                 curr = curr->next;
  194 
  195                 /* We cannot invalidate something that is dirty.. */
  196                 if (PageDirty(page))
  197                         continue;
  198 
  199                 /* ..or locked */
  200                 if (TryLockPage(page))
  201                         continue;
  202 
  203                 if (page->buffers && !try_to_free_buffers(page, 0))
  204                         goto unlock;
  205 
  206                 if (page_count(page) != 1)
  207                         goto unlock;
  208 
  209                 __lru_cache_del(page);
  210                 __remove_inode_page(page);
  211                 UnlockPage(page);
  212                 page_cache_release(page);
  213                 continue;
  214 unlock:
  215                 UnlockPage(page);
  216                 continue;
  217         }
  218 
  219         spin_unlock(&pagecache_lock);
  220         spin_unlock(&pagemap_lru_lock);
  221 }
  222 
  223 static int do_flushpage(struct page *page, unsigned long offset)
  224 {
  225         int (*flushpage) (struct page *, unsigned long);
  226         flushpage = page->mapping->a_ops->flushpage;
  227         if (flushpage)
  228                 return (*flushpage)(page, offset);
  229         return block_flushpage(page, offset);
  230 }
  231 
  232 static inline void truncate_partial_page(struct page *page, unsigned partial)
  233 {
  234         memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
  235         if (page->buffers)
  236                 do_flushpage(page, partial);
  237 }
  238 
  239 static void truncate_complete_page(struct page *page)
  240 {
  241         /* Leave it on the LRU if it gets converted into anonymous buffers */
  242         if (!page->buffers || do_flushpage(page, 0))
  243                 lru_cache_del(page);
  244 
  245         /*
  246          * We remove the page from the page cache _after_ we have
  247          * destroyed all buffer-cache references to it. Otherwise some
  248          * other process might think this inode page is not in the
  249          * page cache and create a buffer-cache alias to it, causing
  250          * all sorts of fun problems ...  
  251          */
  252         ClearPageDirty(page);
  253         ClearPageUptodate(page);
  254         remove_inode_page(page);
  255         page_cache_release(page);
  256 }
  257 
  258 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
  259 static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
  260 {
  261         struct list_head *curr;
  262         struct page * page;
  263         int unlocked = 0;
  264 
  265  restart:
  266         curr = head->prev;
  267         while (curr != head) {
  268                 unsigned long offset;
  269 
  270                 page = list_entry(curr, struct page, list);
  271                 offset = page->index;
  272 
  273                 /* Is this one of the pages to truncate? */
  274                 if ((offset >= start) || (*partial && (offset + 1) == start)) {
  275                         int failed;
  276 
  277                         page_cache_get(page);
  278                         failed = TryLockPage(page);
  279 
  280                         list_del(head);
  281                         if (!failed)
  282                                 /* Restart after this page */
  283                                 list_add_tail(head, curr);
  284                         else
  285                                 /* Restart on this page */
  286                                 list_add(head, curr);
  287 
  288                         spin_unlock(&pagecache_lock);
  289                         unlocked = 1;
  290 
  291                         if (!failed) {
  292                                 if (*partial && (offset + 1) == start) {
  293                                         truncate_partial_page(page, *partial);
  294                                         *partial = 0;
  295                                 } else 
  296                                         truncate_complete_page(page);
  297 
  298                                 UnlockPage(page);
  299                         } else
  300                                 wait_on_page(page);
  301 
  302                         page_cache_release(page);
  303 
  304                         if (current->need_resched) {
  305                                 __set_current_state(TASK_RUNNING);
  306                                 schedule();
  307                         }
  308 
  309                         spin_lock(&pagecache_lock);
  310                         goto restart;
  311                 }
  312                 curr = curr->prev;
  313         }
  314         return unlocked;
  315 }
  316 
  317 
  318 /**
  319  * truncate_inode_pages - truncate *all* the pages from an offset
  320  * @mapping: mapping to truncate
  321  * @lstart: offset from which to truncate
  322  *
  323  * Truncate the page cache at a set offset, removing the pages
  324  * that are beyond that offset (and zeroing out partial pages).
  325  * If any page is locked we wait for it to become unlocked.
  326  */
  327 void truncate_inode_pages(struct address_space * mapping, loff_t lstart) 
  328 {
  329         unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  330         unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
  331         int unlocked;
  332 
  333         spin_lock(&pagecache_lock);
  334         do {
  335                 unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
  336                 unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
  337                 unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
  338         } while (unlocked);
  339         /* Traversed all three lists without dropping the lock */
  340         spin_unlock(&pagecache_lock);
  341 }
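
/*
 * Editorial example (not part of the original source): assuming 4K pages,
 * truncating a file to lstart == 10000 bytes gives
 *
 *      start   = (10000 + 4095) >> 12 = 3     (first page index removed outright)
 *      partial = 10000 & 4095         = 1808  (bytes kept in page index 2)
 *
 * so truncate_list_pages() frees every page with index >= 3 and has
 * truncate_partial_page() zero page index 2 from byte 1808 onward.
 */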
  342 
  343 static inline int invalidate_this_page2(struct page * page,
  344                                         struct list_head * curr,
  345                                         struct list_head * head)
  346 {
  347         int unlocked = 1;
  348 
  349         /*
  350          * The page is locked and we hold the pagecache_lock as well
  351          * so both page_count(page) and page->buffers stay constant here.
  352          */
  353         if (page_count(page) == 1 + !!page->buffers) {
  354                 /* Restart after this page */
  355                 list_del(head);
  356                 list_add_tail(head, curr);
  357 
  358                 page_cache_get(page);
  359                 spin_unlock(&pagecache_lock);
  360                 truncate_complete_page(page);
  361         } else {
  362                 if (page->buffers) {
  363                         /* Restart after this page */
  364                         list_del(head);
  365                         list_add_tail(head, curr);
  366 
  367                         page_cache_get(page);
  368                         spin_unlock(&pagecache_lock);
  369                         block_invalidate_page(page);
  370                 } else
  371                         unlocked = 0;
  372 
  373                 ClearPageDirty(page);
  374                 ClearPageUptodate(page);
  375         }
  376 
  377         return unlocked;
  378 }
  379 
  380 static int FASTCALL(invalidate_list_pages2(struct list_head *));
  381 static int invalidate_list_pages2(struct list_head *head)
  382 {
  383         struct list_head *curr;
  384         struct page * page;
  385         int unlocked = 0;
  386 
  387  restart:
  388         curr = head->prev;
  389         while (curr != head) {
  390                 page = list_entry(curr, struct page, list);
  391 
  392                 if (!TryLockPage(page)) {
  393                         int __unlocked;
  394 
  395                         __unlocked = invalidate_this_page2(page, curr, head);
  396                         UnlockPage(page);
  397                         unlocked |= __unlocked;
  398                         if (!__unlocked) {
  399                                 curr = curr->prev;
  400                                 continue;
  401                         }
  402                 } else {
  403                         /* Restart on this page */
  404                         list_del(head);
  405                         list_add(head, curr);
  406 
  407                         page_cache_get(page);
  408                         spin_unlock(&pagecache_lock);
  409                         unlocked = 1;
  410                         wait_on_page(page);
  411                 }
  412 
  413                 page_cache_release(page);
  414                 if (current->need_resched) {
  415                         __set_current_state(TASK_RUNNING);
  416                         schedule();
  417                 }
  418 
  419                 spin_lock(&pagecache_lock);
  420                 goto restart;
  421         }
  422         return unlocked;
  423 }
  424 
  425 /**
  426  * invalidate_inode_pages2 - Invalidate all the pages of one address_space;
  427  * where a page cannot be freed because it is mapped, just clear its dirty bit.
  428  * @mapping: the address_space whose pages we want to invalidate
  429  */
  430 void invalidate_inode_pages2(struct address_space * mapping)
  431 {
  432         int unlocked;
  433 
  434         spin_lock(&pagecache_lock);
  435         do {
  436                 unlocked = invalidate_list_pages2(&mapping->clean_pages);
  437                 unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
  438                 unlocked |= invalidate_list_pages2(&mapping->locked_pages);
  439         } while (unlocked);
  440         spin_unlock(&pagecache_lock);
  441 }
  442 
  443 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
  444 {
  445         goto inside;
  446 
  447         for (;;) {
  448                 page = page->next_hash;
  449 inside:
  450                 if (!page)
  451                         goto not_found;
  452                 if (page->mapping != mapping)
  453                         continue;
  454                 if (page->index == offset)
  455                         break;
  456         }
  457 
  458 not_found:
  459         return page;
  460 }
  461 
  462 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
  463 {
  464         struct list_head *curr;
  465         struct page *page;
  466         int retval = 0;
  467 
  468         spin_lock(&pagecache_lock);
  469         curr = head->next;
  470         while (curr != head) {
  471                 page = list_entry(curr, struct page, list);
  472                 curr = curr->next;
  473                 if (!page->buffers)
  474                         continue;
  475                 if (page->index >= end)
  476                         continue;
  477                 if (page->index < start)
  478                         continue;
  479 
  480                 page_cache_get(page);
  481                 spin_unlock(&pagecache_lock);
  482                 lock_page(page);
  483 
  484                 /* The buffers could have been free'd while we waited for the page lock */
  485                 if (page->buffers)
  486                         retval |= fn(page);
  487 
  488                 UnlockPage(page);
  489                 spin_lock(&pagecache_lock);
  490                 curr = page->list.next;
  491                 page_cache_release(page);
  492         }
  493         spin_unlock(&pagecache_lock);
  494 
  495         return retval;
  496 }
  497 
  498 /*
  499  * Two-stage data sync: first start the IO, then go back and
  500  * collect the information..
  501  */
  502 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
  503 {
  504         int retval;
  505 
  506         /* writeout dirty buffers on pages from both clean and dirty lists */
  507         retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
  508         retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
  509         retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
  510 
  511         /* now wait for locked buffers on pages from both clean and dirty lists */
  512         retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
  513         retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
  514         retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
  515 
  516         return retval;
  517 }
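
/*
 * Editorial sketch (not part of the original source): a hypothetical caller
 * syncing bytes [pos, pos + count) would convert to page indices first,
 * since start_idx is inclusive and end_idx is exclusive:
 *
 *      unsigned long start_idx = pos >> PAGE_CACHE_SHIFT;
 *      unsigned long end_idx =
 *              (pos + count + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 *      int err = generic_buffer_fdatasync(inode, start_idx, end_idx);
 *
 * Queueing all the writeouts before waiting lets the block layer merge and
 * sort them instead of handling one synchronous page at a time.
 */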
  518 
  519 /*
  520  * In-memory filesystems have to fail their
  521  * writepage function - and this has to be
  522  * worked around in the VM layer..
  523  *
  524  * We
  525  *  - mark the page dirty again (but do NOT
  526  *    add it back to the inode dirty list, as
  527  *    that would livelock in fdatasync)
  528  *  - activate the page so that the page stealer
  529  *    doesn't try to write it out over and over
  530  *    again.
  531  */
  532 int fail_writepage(struct page *page)
  533 {
  534         /* Only activate on memory-pressure, not fsync.. */
  535         if (PageLaunder(page)) {
  536                 activate_page(page);
  537                 SetPageReferenced(page);
  538         }
  539 
  540         /* Set the page dirty again, unlock */
  541         SetPageDirty(page);
  542         UnlockPage(page);
  543         return 0;
  544 }
  545 
  546 EXPORT_SYMBOL(fail_writepage);
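
/*
 * Editorial sketch (not part of the original source): an in-memory filesystem
 * with no backing store would typically wire fail_writepage into its
 * address_space_operations, e.g. (hypothetical names):
 *
 *      static struct address_space_operations example_aops = {
 *              readpage:       example_readpage,
 *              writepage:      fail_writepage,
 *      };
 *
 * so that under memory pressure the page is simply re-dirtied and
 * re-activated rather than written out to a device that does not exist.
 */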
  547 
  548 /**
  549  *      filemap_fdatasync - walk the list of dirty pages of the given address space
  550  *      and writepage() all of them.
  551  * 
  552  *      @mapping: address space structure to write
  553  *
  554  */
  555 int filemap_fdatasync(struct address_space * mapping)
  556 {
  557         int ret = 0;
  558         int (*writepage)(struct page *) = mapping->a_ops->writepage;
  559 
  560         spin_lock(&pagecache_lock);
  561 
  562         while (!list_empty(&mapping->dirty_pages)) {
  563                 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
  564 
  565                 list_del(&page->list);
  566                 list_add(&page->list, &mapping->locked_pages);
  567 
  568                 if (!PageDirty(page))
  569                         continue;
  570 
  571                 page_cache_get(page);
  572                 spin_unlock(&pagecache_lock);
  573 
  574                 lock_page(page);
  575 
  576                 if (PageDirty(page)) {
  577                         int err;
  578                         ClearPageDirty(page);
  579                         err = writepage(page);
  580                         if (err && !ret)
  581                                 ret = err;
  582                 } else
  583                         UnlockPage(page);
  584 
  585                 page_cache_release(page);
  586                 spin_lock(&pagecache_lock);
  587         }
  588         spin_unlock(&pagecache_lock);
  589         return ret;
  590 }
  591 
  592 /**
  593  *      filemap_fdatawait - walk the list of locked pages of the given address space
  594  *      and wait for all of them.
  595  * 
  596  *      @mapping: address space structure to wait for
  597  *
  598  */
  599 int filemap_fdatawait(struct address_space * mapping)
  600 {
  601         int ret = 0;
  602 
  603         spin_lock(&pagecache_lock);
  604 
  605         while (!list_empty(&mapping->locked_pages)) {
  606                 struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
  607 
  608                 list_del(&page->list);
  609                 list_add(&page->list, &mapping->clean_pages);
  610 
  611                 if (!PageLocked(page))
  612                         continue;
  613 
  614                 page_cache_get(page);
  615                 spin_unlock(&pagecache_lock);
  616 
  617                 ___wait_on_page(page);
  618                 if (PageError(page))
  619                         ret = -EIO;
  620 
  621                 page_cache_release(page);
  622                 spin_lock(&pagecache_lock);
  623         }
  624         spin_unlock(&pagecache_lock);
  625         return ret;
  626 }
  627 
  628 /*
  629  * Add a page to the inode page cache.
  630  *
  631  * The caller must have locked the page and 
  632  * set all the page flags correctly..
  633  */
  634 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
  635 {
  636         if (!PageLocked(page))
  637                 BUG();
  638 
  639         page->index = index;
  640         page_cache_get(page);
  641         spin_lock(&pagecache_lock);
  642         add_page_to_inode_queue(mapping, page);
  643         add_page_to_hash_queue(page, page_hash(mapping, index));
  644         spin_unlock(&pagecache_lock);
  645 
  646         lru_cache_add(page);
  647 }
  648 
  649 /*
  650  * This adds a page to the page cache, starting out as locked,
  651  * owned by us, but unreferenced, not uptodate and with no errors.
  652  */
  653 static inline void __add_to_page_cache(struct page * page,
  654         struct address_space *mapping, unsigned long offset,
  655         struct page **hash)
  656 {
  657         unsigned long flags;
  658 
  659         flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
  660         page->flags = flags | (1 << PG_locked);
  661         page_cache_get(page);
  662         page->index = offset;
  663         add_page_to_inode_queue(mapping, page);
  664         add_page_to_hash_queue(page, hash);
  665 }
  666 
  667 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
  668 {
  669         spin_lock(&pagecache_lock);
  670         __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
  671         spin_unlock(&pagecache_lock);
  672         lru_cache_add(page);
  673 }
  674 
  675 int add_to_page_cache_unique(struct page * page,
  676         struct address_space *mapping, unsigned long offset,
  677         struct page **hash)
  678 {
  679         int err;
  680         struct page *alias;
  681 
  682         spin_lock(&pagecache_lock);
  683         alias = __find_page_nolock(mapping, offset, *hash);
  684 
  685         err = 1;
  686         if (!alias) {
  687                 __add_to_page_cache(page,mapping,offset,hash);
  688                 err = 0;
  689         }
  690 
  691         spin_unlock(&pagecache_lock);
  692         if (!err)
  693                 lru_cache_add(page);
  694         return err;
  695 }
  696 
  697 /*
  698  * This adds the requested page to the page cache if it isn't already there,
  699  * and schedules an I/O to read in its contents from disk.
  700  */
  701 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
  702 static int page_cache_read(struct file * file, unsigned long offset)
  703 {
  704         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
  705         struct page **hash = page_hash(mapping, offset);
  706         struct page *page; 
  707 
  708         spin_lock(&pagecache_lock);
  709         page = __find_page_nolock(mapping, offset, *hash);
  710         spin_unlock(&pagecache_lock);
  711         if (page)
  712                 return 0;
  713 
  714         page = page_cache_alloc(mapping);
  715         if (!page)
  716                 return -ENOMEM;
  717 
  718         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
  719                 int error = mapping->a_ops->readpage(file, page);
  720                 page_cache_release(page);
  721                 return error;
  722         }
  723         /*
  724          * We arrive here in the unlikely event that someone 
  725          * raced with us and added our page to the cache first.
  726          */
  727         page_cache_release(page);
  728         return 0;
  729 }
  730 
  731 /*
  732  * Read in an entire cluster at once.  A cluster is usually a 64k-
  733  * aligned block that includes the page requested in "offset."
  734  */
  735 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
  736                                              unsigned long filesize));
  737 static int read_cluster_nonblocking(struct file * file, unsigned long offset,
  738         unsigned long filesize)
  739 {
  740         unsigned long pages = CLUSTER_PAGES;
  741 
  742         offset = CLUSTER_OFFSET(offset);
  743         while ((pages-- > 0) && (offset < filesize)) {
  744                 int error = page_cache_read(file, offset);
  745                 if (error < 0)
  746                         return error;
  747                 offset ++;
  748         }
  749 
  750         return 0;
  751 }
  752 
  753 /*
  754  * Knuth recommends primes in approximately golden ratio to the maximum
  755  * integer representable by a machine word for multiplicative hashing.
  756  * Chuck Lever verified the effectiveness of this technique:
  757  * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
  758  *
  759  * These primes are chosen to be bit-sparse, that is operations on
  760  * them can use shifts and additions instead of multiplications for
  761  * machines where multiplications are slow.
  762  */
  763 #if BITS_PER_LONG == 32
  764 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
  765 #define GOLDEN_RATIO_PRIME 0x9e370001UL
  766 #elif BITS_PER_LONG == 64
  767 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
  768 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
  769 #else
  770 #error Define GOLDEN_RATIO_PRIME for your wordsize.
  771 #endif
  772 
  773 /*
  774  * In order to wait for pages to become available there must be
  775  * waitqueues associated with pages. By using a hash table of
  776  * waitqueues where the bucket discipline is to maintain all
  777  * waiters on the same queue and wake all when any of the pages
  778  * become available, and for the woken contexts to check to be
  779  * sure the appropriate page became available, this saves space
  780  * at a cost of "thundering herd" phenomena during rare hash
  781  * collisions.
  782  */
  783 static inline wait_queue_head_t *page_waitqueue(struct page *page)
  784 {
  785         const zone_t *zone = page_zone(page);
  786         wait_queue_head_t *wait = zone->wait_table;
  787         unsigned long hash = (unsigned long)page;
  788 
  789 #if BITS_PER_LONG == 64
  790         /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
  791         unsigned long n = hash;
  792         n <<= 18;
  793         hash -= n;
  794         n <<= 33;
  795         hash -= n;
  796         n <<= 3;
  797         hash += n;
  798         n <<= 3;
  799         hash -= n;
  800         n <<= 4;
  801         hash += n;
  802         n <<= 2;
  803         hash += n;
  804 #else
  805         /* On some cpus multiply is faster, on others gcc will do shifts */
  806         hash *= GOLDEN_RATIO_PRIME;
  807 #endif
  808         hash >>= zone->wait_table_shift;
  809 
  810         return &wait[hash];
  811 }
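
/*
 * Editorial example (not part of the original source): assuming a 32-bit
 * machine and a hypothetical 1024-entry wait table (so that
 * zone->wait_table_shift is 32 - 10 = 22), a page at address 0xc1a2b3c0
 * maps to bucket
 *
 *      (0xc1a2b3c0 * 0x9e370001UL) >> 22
 *
 * i.e. the high-order bits of the multiplicative hash select one of the
 * shared wait queues.  Unrelated pages landing in the same bucket are the
 * rare "thundering herd" collisions described above.
 */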
  812 
  813 /*
  814  * This must be called after every submit_bh with end_io
  815  * callbacks that would result in the blkdev layer waking
  816  * up the page after a queue unplug.
  817  */
  818 void wakeup_page_waiters(struct page * page)
  819 {
  820         wait_queue_head_t * head;
  821 
  822         head = page_waitqueue(page);
  823         if (waitqueue_active(head))
  824                 wake_up(head);
  825 }
  826 
  827 /* 
  828  * Wait for a page to get unlocked.
  829  *
  830  * This must be called with the caller "holding" the page,
  831  * ie with increased "page->count" so that the page won't
  832  * go away during the wait..
  833  *
  834  * The waiting strategy is to get on a waitqueue determined
  835  * by hashing. Waiters will then collide, and the newly woken
  836  * task must then determine whether it was woken for the page
  837  * it really wanted, and go back to sleep on the waitqueue if
  838  * that wasn't it. With the waitqueue semantics, it never leaves
  839  * the waitqueue until it is done waiting, so the loop moves forward one
  840  * iteration every time there is
  841  * (1) a collision 
  842  * and
  843  * (2) one of the colliding pages is woken
  844  *
  845  * This is the thundering herd problem, but it is expected to
  846  * be very rare due to the few pages that are actually being
  847  * waited on at any given time and the quality of the hash function.
  848  */
  849 void ___wait_on_page(struct page *page)
  850 {
  851         wait_queue_head_t *waitqueue = page_waitqueue(page);
  852         struct task_struct *tsk = current;
  853         DECLARE_WAITQUEUE(wait, tsk);
  854 
  855         add_wait_queue(waitqueue, &wait);
  856         do {
  857                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  858                 if (!PageLocked(page))
  859                         break;
  860                 sync_page(page);
  861                 schedule();
  862         } while (PageLocked(page));
  863         __set_task_state(tsk, TASK_RUNNING);
  864         remove_wait_queue(waitqueue, &wait);
  865 }
  866 
  867 /*
  868  * unlock_page() is the other half of the story just above
  869  * __wait_on_page(). Here a couple of quick checks are done
  870  * and a couple of flags are set on the page, and then all
  871  * of the waiters for all of the pages in the appropriate
  872  * wait queue are woken.
  873  */
  874 void unlock_page(struct page *page)
  875 {
  876         wait_queue_head_t *waitqueue = page_waitqueue(page);
  877         ClearPageLaunder(page);
  878         smp_mb__before_clear_bit();
  879         if (!test_and_clear_bit(PG_locked, &(page)->flags))
  880                 BUG();
  881         smp_mb__after_clear_bit(); 
  882 
  883         /*
  884          * Although the default semantics of wake_up() are
  885          * to wake all, here the specific function is used
  886          * to make it even more explicit that a number of
  887          * pages are being waited on.
  888          */
  889         if (waitqueue_active(waitqueue))
  890                 wake_up_all(waitqueue);
  891 }
  892 
  893 /*
  894  * Get a lock on the page, assuming we need to sleep
  895  * to get it..
  896  */
  897 static void __lock_page(struct page *page)
  898 {
  899         wait_queue_head_t *waitqueue = page_waitqueue(page);
  900         struct task_struct *tsk = current;
  901         DECLARE_WAITQUEUE(wait, tsk);
  902 
  903         add_wait_queue_exclusive(waitqueue, &wait);
  904         for (;;) {
  905                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
  906                 if (PageLocked(page)) {
  907                         sync_page(page);
  908                         schedule();
  909                 }
  910                 if (!TryLockPage(page))
  911                         break;
  912         }
  913         __set_task_state(tsk, TASK_RUNNING);
  914         remove_wait_queue(waitqueue, &wait);
  915 }
  916 
  917 /*
  918  * Get an exclusive lock on the page, optimistically
  919  * assuming it's not locked..
  920  */
  921 void lock_page(struct page *page)
  922 {
  923         if (TryLockPage(page))
  924                 __lock_page(page);
  925 }
  926 
  927 /*
  928  * a rather lightweight function, finding and getting a reference to a
  929  * hashed page atomically.
  930  */
  931 struct page * __find_get_page(struct address_space *mapping,
  932                               unsigned long offset, struct page **hash)
  933 {
  934         struct page *page;
  935 
  936         /*
  937          * We scan the hash list read-only. Addition to and removal from
  938          * the hash-list needs a held write-lock.
  939          */
  940         spin_lock(&pagecache_lock);
  941         page = __find_page_nolock(mapping, offset, *hash);
  942         if (page)
  943                 page_cache_get(page);
  944         spin_unlock(&pagecache_lock);
  945         return page;
  946 }
  947 
  948 /*
  949  * Same as above, but trylock it instead of incrementing the count.
  950  */
  951 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
  952 {
  953         struct page *page;
  954         struct page **hash = page_hash(mapping, offset);
  955 
  956         spin_lock(&pagecache_lock);
  957         page = __find_page_nolock(mapping, offset, *hash);
  958         if (page) {
  959                 if (TryLockPage(page))
  960                         page = NULL;
  961         }
  962         spin_unlock(&pagecache_lock);
  963         return page;
  964 }
  965 
  966 /*
  967  * Must be called with the pagecache lock held,
  968  * will return with it held (but it may be dropped
  969  * during blocking operations).
  970  */
  971 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
  972 static struct page * __find_lock_page_helper(struct address_space *mapping,
  973                                         unsigned long offset, struct page *hash)
  974 {
  975         struct page *page;
  976 
  977         /*
  978          * We scan the hash list read-only. Addition to and removal from
  979          * the hash-list needs a held write-lock.
  980          */
  981 repeat:
  982         page = __find_page_nolock(mapping, offset, hash);
  983         if (page) {
  984                 page_cache_get(page);
  985                 if (TryLockPage(page)) {
  986                         spin_unlock(&pagecache_lock);
  987                         lock_page(page);
  988                         spin_lock(&pagecache_lock);
  989 
  990                         /* Has the page been re-allocated while we slept? */
  991                         if (page->mapping != mapping || page->index != offset) {
  992                                 UnlockPage(page);
  993                                 page_cache_release(page);
  994                                 goto repeat;
  995                         }
  996                 }
  997         }
  998         return page;
  999 }
 1000 
 1001 /*
 1002  * Same as the above, but lock the page too, verifying that
 1003  * it's still valid once we own it.
 1004  */
 1005 struct page * __find_lock_page (struct address_space *mapping,
 1006                                 unsigned long offset, struct page **hash)
 1007 {
 1008         struct page *page;
 1009 
 1010         spin_lock(&pagecache_lock);
 1011         page = __find_lock_page_helper(mapping, offset, *hash);
 1012         spin_unlock(&pagecache_lock);
 1013         return page;
 1014 }
 1015 
 1016 /*
 1017  * Same as above, but create the page if required..
 1018  */
 1019 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
 1020 {
 1021         struct page *page;
 1022         struct page **hash = page_hash(mapping, index);
 1023 
 1024         spin_lock(&pagecache_lock);
 1025         page = __find_lock_page_helper(mapping, index, *hash);
 1026         spin_unlock(&pagecache_lock);
 1027         if (!page) {
 1028                 struct page *newpage = alloc_page(gfp_mask);
 1029                 if (newpage) {
 1030                         spin_lock(&pagecache_lock);
 1031                         page = __find_lock_page_helper(mapping, index, *hash);
 1032                         if (likely(!page)) {
 1033                                 page = newpage;
 1034                                 __add_to_page_cache(page, mapping, index, hash);
 1035                                 newpage = NULL;
 1036                         }
 1037                         spin_unlock(&pagecache_lock);
 1038                         if (newpage == NULL)
 1039                                 lru_cache_add(page);
 1040                         else 
 1041                                 page_cache_release(newpage);
 1042                 }
 1043         }
 1044         return page;    
 1045 }
 1046 
 1047 /*
 1048  * Same as grab_cache_page, but do not wait if the page is unavailable.
 1049  * This is intended for speculative data generators, where the data can
 1050  * be regenerated if the page couldn't be grabbed.  This routine should
 1051  * be safe to call while holding the lock for another page.
 1052  */
 1053 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
 1054 {
 1055         struct page *page, **hash;
 1056 
 1057         hash = page_hash(mapping, index);
 1058         page = __find_get_page(mapping, index, hash);
 1059 
 1060         if ( page ) {
 1061                 if ( !TryLockPage(page) ) {
 1062                         /* Page found and locked */
 1063                         /* This test is overly paranoid, but what the heck... */
 1064                         if ( unlikely(page->mapping != mapping || page->index != index) ) {
 1065                                 /* Someone reallocated this page under us. */
 1066                                 UnlockPage(page);
 1067                                 page_cache_release(page);
 1068                                 return NULL;
 1069                         } else {
 1070                                 return page;
 1071                         }
 1072                 } else {
 1073                         /* Page locked by someone else */
 1074                         page_cache_release(page);
 1075                         return NULL;
 1076                 }
 1077         }
 1078 
 1079         page = page_cache_alloc(mapping);
 1080         if ( unlikely(!page) )
 1081                 return NULL;    /* Failed to allocate a page */
 1082 
 1083         if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
 1084                 /* Someone else grabbed the page already. */
 1085                 page_cache_release(page);
 1086                 return NULL;
 1087         }
 1088 
 1089         return page;
 1090 }
 1091 
 1092 #if 0
 1093 #define PROFILE_READAHEAD
 1094 #define DEBUG_READAHEAD
 1095 #endif
 1096 
 1097 /*
 1098  * Read-ahead profiling information
 1099  * --------------------------------
 1100  * Every PROFILE_MAXREADCOUNT read-ahead operations, the following
 1101  * information is written to the syslog:
 1102  *   Percentage of asynchronous read-ahead.
 1103  *   Average values of the read-ahead context fields.
 1104  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 1105  * to the syslog.
 1106  */
 1107 
 1108 #ifdef PROFILE_READAHEAD
 1109 
 1110 #define PROFILE_MAXREADCOUNT 1000
 1111 
 1112 static unsigned long total_reada;
 1113 static unsigned long total_async;
 1114 static unsigned long total_ramax;
 1115 static unsigned long total_ralen;
 1116 static unsigned long total_rawin;
 1117 
 1118 static void profile_readahead(int async, struct file *filp)
 1119 {
 1120         unsigned long flags;
 1121 
 1122         ++total_reada;
 1123         if (async)
 1124                 ++total_async;
 1125 
 1126         total_ramax     += filp->f_ramax;
 1127         total_ralen     += filp->f_ralen;
 1128         total_rawin     += filp->f_rawin;
 1129 
 1130         if (total_reada > PROFILE_MAXREADCOUNT) {
 1131                 save_flags(flags);
 1132                 cli();
 1133                 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
 1134                         restore_flags(flags);
 1135                         return;
 1136                 }
 1137 
 1138                 printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
 1139                         total_ramax/total_reada,
 1140                         total_ralen/total_reada,
 1141                         total_rawin/total_reada,
 1142                         (total_async*100)/total_reada);
 1143 #ifdef DEBUG_READAHEAD
 1144                 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
 1145                         filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
 1146 #endif
 1147 
 1148                 total_reada     = 0;
 1149                 total_async     = 0;
 1150                 total_ramax     = 0;
 1151                 total_ralen     = 0;
 1152                 total_rawin     = 0;
 1153 
 1154                 restore_flags(flags);
 1155         }
 1156 }
 1157 #endif  /* defined PROFILE_READAHEAD */
 1158 
 1159 /*
 1160  * Read-ahead context:
 1161  * -------------------
 1162  * The read ahead context fields of the "struct file" are the following:
 1163  * - f_raend : position of the first byte after the last page we tried to
 1164  *             read ahead.
 1165  * - f_ramax : current read-ahead maximum size.
 1166  * - f_ralen : length of the current IO read block we tried to read-ahead.
 1167  * - f_rawin : length of the current read-ahead window.
 1168  *              if last read-ahead was synchronous then
 1169  *                      f_rawin = f_ralen
 1170  *              otherwise (was asynchronous)
 1171  *                      f_rawin = previous value of f_ralen + f_ralen
 1172  *
 1173  * Read-ahead limits:
 1174  * ------------------
 1175  * MIN_READAHEAD   : minimum read-ahead size when reading ahead.
 1176  * MAX_READAHEAD   : maximum read-ahead size when reading ahead.
 1177  *
 1178  * Synchronous read-ahead benefits:
 1179  * --------------------------------
 1180  * Using a reasonable IO transfer length for peripheral devices increases
 1181  * system performance.
 1182  * Reasonable means, in this context, not too large but not too small.
 1183  * The actual maximum value is:
 1184  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined,
 1185  *      and 32K if defined (4K page size assumed).
 1186  *
 1187  * Asynchronous read-ahead benefits:
 1188  * ---------------------------------
 1189  * Overlapping the next read request with user process execution increases
 1190  * system performance.
 1191  *
 1192  * Read-ahead risks:
 1193  * -----------------
 1194  * We have to guess which further data are needed by the user process.
 1195  * If these data are often not really needed, it's bad for system 
 1196  * performance.
 1197  * However, we know that files are often accessed sequentially by 
 1198  * application programs, so it seems possible to guess reasonably well
 1199  * with a simple strategy.
 1200  * We only try to read ahead files that seem to be read sequentially.
 1201  *
 1202  * Asynchronous read-ahead risks:
 1203  * ------------------------------
 1204  * In order to maximize overlapping, we must start some asynchronous read 
 1205  * request from the device, as soon as possible.
 1206  * We must be very careful about:
 1207  * - The number of effective pending IO read requests.
 1208  *   ONE seems to be the only reasonable value.
 1209  * - The total memory pool usage for the file access stream.
 1210  *   This maximum memory usage is implicitly 2 IO read chunks:
 1211  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
 1212  *   64k if defined (4K page size assumed).
 1213  */
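
/*
 * Editorial example (not part of the original source), instantiating the
 * f_rawin formula above: suppose a synchronous read-ahead of 4 pages ends
 * just before index 12, and an asynchronous one then reads 8 more:
 *
 *      after the synchronous pass:   f_raend = 12, f_ralen = 4, f_rawin = 4
 *      after the asynchronous pass:  f_raend = 20, f_ralen = 8, f_rawin = 12
 *
 * The window f_rawin therefore spans both chunks, which is what the
 * "index + filp->f_rawin < filp->f_raend" test in do_generic_file_read()
 * uses to decide whether an access still falls inside the read-ahead window.
 */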
 1214 
 1215 static inline int get_max_readahead(struct inode * inode)
 1216 {
 1217         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
 1218                 return vm_max_readahead;
 1219         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 1220 }
 1221 
 1222 static void generic_file_readahead(int reada_ok,
 1223         struct file * filp, struct inode * inode,
 1224         struct page * page)
 1225 {
 1226         unsigned long end_index;
 1227         unsigned long index = page->index;
 1228         unsigned long max_ahead, ahead;
 1229         unsigned long raend;
 1230         int max_readahead = get_max_readahead(inode);
 1231 
 1232         end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 1233 
 1234         raend = filp->f_raend;
 1235         max_ahead = 0;
 1236 
 1237 /*
 1238  * The current page is locked.
 1239  * If the current position is inside the previous read IO request, do not
 1240  * try to reread previously read ahead pages.
 1241  * Otherwise, decide whether or not to read ahead some pages synchronously.
 1242  * If we are not going to read ahead, set the read ahead context for this 
 1243  * page only.
 1244  */
 1245         if (PageLocked(page)) {
 1246                 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
 1247                         raend = index;
 1248                         if (raend < end_index)
 1249                                 max_ahead = filp->f_ramax;
 1250                         filp->f_rawin = 0;
 1251                         filp->f_ralen = 1;
 1252                         if (!max_ahead) {
 1253                                 filp->f_raend  = index + filp->f_ralen;
 1254                                 filp->f_rawin += filp->f_ralen;
 1255                         }
 1256                 }
 1257         }
 1258 /*
 1259  * The current page is not locked.
 1260  * If we were reading ahead and,
 1261  * if the current max read ahead size is not zero and,
 1262  * if the current position is inside the last read-ahead IO request,
 1263  *   it is the moment to try to read ahead asynchronously.
 1264  * We will later force unplug device in order to force asynchronous read IO.
 1265  */
 1266         else if (reada_ok && filp->f_ramax && raend >= 1 &&
 1267                  index <= raend && index + filp->f_ralen >= raend) {
 1268 /*
 1269  * Add ONE page to max_ahead in order to try to have about the same IO max size
 1270  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
 1271  * Compute the position of the last page we have tried to read in order to 
 1272  * begin to read ahead just at the next page.
 1273  */
 1274                 raend -= 1;
 1275                 if (raend < end_index)
 1276                         max_ahead = filp->f_ramax + 1;
 1277 
 1278                 if (max_ahead) {
 1279                         filp->f_rawin = filp->f_ralen;
 1280                         filp->f_ralen = 0;
 1281                         reada_ok      = 2;
 1282                 }
 1283         }
 1284 /*
 1285  * Try to read ahead pages.
 1286  * We hope that ll_rw_blk() plug/unplug, coalescing, request sorting and the
 1287  * scheduler will do well enough for us to avoid overly bad actual IO requests.
 1288  */
 1289         ahead = 0;
 1290         while (ahead < max_ahead) {
 1291                 ahead ++;
 1292                 if ((raend + ahead) >= end_index)
 1293                         break;
 1294                 if (page_cache_read(filp, raend + ahead) < 0)
 1295                         break;
 1296         }
 1297 /*
 1298  * If we tried to read ahead some pages,
 1299  * If we tried to read ahead asynchronously,
 1300  *   Try to force unplug of the device in order to start an asynchronous
 1301  *   read IO request.
 1302  * Update the read-ahead context.
 1303  * Store the length of the current read-ahead window.
 1304  * Double the current max read ahead size.
 1305  *   That heuristic avoids doing large IO for files that are not really
 1306  *   accessed sequentially.
 1307  */
 1308         if (ahead) {
 1309                 filp->f_ralen += ahead;
 1310                 filp->f_rawin += filp->f_ralen;
 1311                 filp->f_raend = raend + ahead + 1;
 1312 
 1313                 filp->f_ramax += filp->f_ramax;
 1314 
 1315                 if (filp->f_ramax > max_readahead)
 1316                         filp->f_ramax = max_readahead;
 1317 
 1318 #ifdef PROFILE_READAHEAD
 1319                 profile_readahead((reada_ok == 2), filp);
 1320 #endif
 1321         }
 1322 
 1323         return;
 1324 }
 1325 
 1326 /*
 1327  * Mark a page as having seen activity.
 1328  *
 1329  * If it was already so marked, move it to the active queue and drop
 1330  * the referenced bit.  Otherwise, just mark it for future action..
 1331  */
 1332 void mark_page_accessed(struct page *page)
 1333 {
 1334         if (!PageActive(page) && PageReferenced(page)) {
 1335                 activate_page(page);
 1336                 ClearPageReferenced(page);
 1337         } else
 1338                 SetPageReferenced(page);
 1339 }
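
/*
 * Editorial note (not part of the original source): this gives a page a
 * two-touch promotion path:
 *
 *      1st call: not referenced          -> SetPageReferenced()
 *      2nd call: referenced, not active  -> activate_page(), clear referenced
 *
 * so a page generally has to be touched twice before it reaches the
 * active list.
 */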
 1340 
 1341 /*
 1342  * This is a generic file read routine, and uses the
 1343  * inode->i_op->readpage() function for the actual low-level
 1344  * stuff.
 1345  *
 1346  * This is really ugly. But the goto's actually try to clarify some
 1347  * of the logic when it comes to error handling etc.
 1348  */
 1349 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 1350 {
 1351         struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
 1352         struct inode *inode = mapping->host;
 1353         unsigned long index, offset;
 1354         struct page *cached_page;
 1355         int reada_ok;
 1356         int error;
 1357         int max_readahead = get_max_readahead(inode);
 1358 
 1359         cached_page = NULL;
 1360         index = *ppos >> PAGE_CACHE_SHIFT;
 1361         offset = *ppos & ~PAGE_CACHE_MASK;
 1362 
 1363 /*
 1364  * If the current position is outside the previous read-ahead window, 
 1365  * we reset the current read-ahead context and set read ahead max to zero
 1366  * (will be set to just needed value later),
 1367  * otherwise, we assume that the file accesses are sequential enough to
 1368  * continue read-ahead.
 1369  */
 1370         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
 1371                 reada_ok = 0;
 1372                 filp->f_raend = 0;
 1373                 filp->f_ralen = 0;
 1374                 filp->f_ramax = 0;
 1375                 filp->f_rawin = 0;
 1376         } else {
 1377                 reada_ok = 1;
 1378         }
 1379 /*
 1380  * Adjust the current value of read-ahead max.
 1381  * If the read operation stays within the first half page, force no readahead.
 1382  * Otherwise try to increase read ahead max just enough to do the read request.
 1383  * Then, at least MIN_READAHEAD if read ahead is ok,
 1384  * and at most MAX_READAHEAD in all cases.
 1385  */
 1386         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
 1387                 filp->f_ramax = 0;
 1388         } else {
 1389                 unsigned long needed;
 1390 
 1391                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
 1392 
 1393                 if (filp->f_ramax < needed)
 1394                         filp->f_ramax = needed;
 1395 
 1396                 if (reada_ok && filp->f_ramax < vm_min_readahead)
 1397                                 filp->f_ramax = vm_min_readahead;
 1398                 if (filp->f_ramax > max_readahead)
 1399                         filp->f_ramax = max_readahead;
 1400         }
 1401 
 1402         for (;;) {
 1403                 struct page *page, **hash;
 1404                 unsigned long end_index, nr, ret;
 1405 
 1406                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 1407                         
 1408                 if (index > end_index)
 1409                         break;
 1410                 nr = PAGE_CACHE_SIZE;
 1411                 if (index == end_index) {
 1412                         nr = inode->i_size & ~PAGE_CACHE_MASK;
 1413                         if (nr <= offset)
 1414                                 break;
 1415                 }
 1416 
 1417                 nr = nr - offset;
 1418 
 1419                 /*
 1420                  * Try to find the data in the page cache..
 1421                  */
 1422                 hash = page_hash(mapping, index);
 1423 
 1424                 spin_lock(&pagecache_lock);
 1425                 page = __find_page_nolock(mapping, index, *hash);
 1426                 if (!page)
 1427                         goto no_cached_page;
 1428 found_page:
 1429                 page_cache_get(page);
 1430                 spin_unlock(&pagecache_lock);
 1431 
 1432                 if (!Page_Uptodate(page))
 1433                         goto page_not_up_to_date;
 1434                 generic_file_readahead(reada_ok, filp, inode, page);
 1435 page_ok:
 1436                 /* If users can be writing to this page using arbitrary
 1437                  * virtual addresses, take care about potential aliasing
 1438                  * before reading the page on the kernel side.
 1439                  */
 1440                 if (mapping->i_mmap_shared != NULL)
 1441                         flush_dcache_page(page);
 1442 
 1443                 /*
 1444                  * Mark the page accessed if we read the
 1445                  * beginning or we just did an lseek.
 1446                  */
 1447                 if (!offset || !filp->f_reada)
 1448                         mark_page_accessed(page);
 1449 
 1450                 /*
 1451                  * Ok, we have the page, and it's up-to-date, so
 1452                  * now we can copy it to user space...
 1453                  *
 1454                  * The actor routine returns how many bytes were actually used..
 1455                  * NOTE! This may not be the same as how much of a user buffer
 1456                  * we filled up (we may be padding etc), so we can only update
 1457                  * "pos" here (the actor routine has to update the user buffer
 1458                  * pointers and the remaining count).
 1459                  */
 1460                 ret = actor(desc, page, offset, nr);
 1461                 offset += ret;
 1462                 index += offset >> PAGE_CACHE_SHIFT;
 1463                 offset &= ~PAGE_CACHE_MASK;
 1464 
 1465                 page_cache_release(page);
 1466                 if (ret == nr && desc->count)
 1467                         continue;
 1468                 break;
 1469 
 1470 /*
 1471  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 1472  */
 1473 page_not_up_to_date:
 1474                 generic_file_readahead(reada_ok, filp, inode, page);
 1475 
 1476                 if (Page_Uptodate(page))
 1477                         goto page_ok;
 1478 
 1479                 /* Get exclusive access to the page ... */
 1480                 lock_page(page);
 1481 
 1482                 /* Did it get unhashed before we got the lock? */
 1483                 if (!page->mapping) {
 1484                         UnlockPage(page);
 1485                         page_cache_release(page);
 1486                         continue;
 1487                 }
 1488 
 1489                 /* Did somebody else fill it already? */
 1490                 if (Page_Uptodate(page)) {
 1491                         UnlockPage(page);
 1492                         goto page_ok;
 1493                 }
 1494 
 1495 readpage:
 1496                 /* ... and start the actual read. The read will unlock the page. */
 1497                 error = mapping->a_ops->readpage(filp, page);
 1498 
 1499                 if (!error) {
 1500                         if (Page_Uptodate(page))
 1501                                 goto page_ok;
 1502 
 1503                         /* Again, try some read-ahead while waiting for the page to finish.. */
 1504                         generic_file_readahead(reada_ok, filp, inode, page);
 1505                         wait_on_page(page);
 1506                         if (Page_Uptodate(page))
 1507                                 goto page_ok;
 1508                         error = -EIO;
 1509                 }
 1510 
 1511                 /* UHHUH! A synchronous read error occurred. Report it */
 1512                 desc->error = error;
 1513                 page_cache_release(page);
 1514                 break;
 1515 
 1516 no_cached_page:
 1517                 /*
 1518                  * Ok, it wasn't cached, so we need to create a new
 1519                  * page..
 1520                  *
 1521                  * We get here with the page cache lock held.
 1522                  */
 1523                 if (!cached_page) {
 1524                         spin_unlock(&pagecache_lock);
 1525                         cached_page = page_cache_alloc(mapping);
 1526                         if (!cached_page) {
 1527                                 desc->error = -ENOMEM;
 1528                                 break;
 1529                         }
 1530 
 1531                         /*
 1532                          * Somebody may have added the page while we
 1533                          * dropped the page cache lock. Check for that.
 1534                          */
 1535                         spin_lock(&pagecache_lock);
 1536                         page = __find_page_nolock(mapping, index, *hash);
 1537                         if (page)
 1538                                 goto found_page;
 1539                 }
 1540 
 1541                 /*
 1542                  * Ok, add the new page to the hash-queues...
 1543                  */
 1544                 page = cached_page;
 1545                 __add_to_page_cache(page, mapping, index, hash);
 1546                 spin_unlock(&pagecache_lock);
 1547                 lru_cache_add(page);            
 1548                 cached_page = NULL;
 1549 
 1550                 goto readpage;
 1551         }
 1552 
 1553         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
 1554         filp->f_reada = 1;
 1555         if (cached_page)
 1556                 page_cache_release(cached_page);
 1557         UPDATE_ATIME(inode);
 1558 }
 1559 
 1560 static inline int have_mapping_directIO(struct address_space * mapping)
 1561 {
 1562         return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
 1563 }
 1564 
 1565 /* Switch between old and new directIO formats */
 1566 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
 1567 {
 1568         struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
 1569 
 1570         if (mapping->a_ops->direct_fileIO)
 1571                 return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
 1572         return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
 1573 }
 1574 
 1575 /*
 1576  * i_sem and i_alloc_sem should be held already.  i_sem may be dropped
 1577  * later once we've mapped the new IO.  i_alloc_sem is kept until the IO
 1578  * completes.
 1579  */
 1580 
 1581 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
 1582 {
 1583         ssize_t retval;
 1584         int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
 1585         struct kiobuf * iobuf;
 1586         struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
 1587         struct inode * inode = mapping->host;
 1588         loff_t size = inode->i_size;
 1589 
 1590         new_iobuf = 0;
 1591         iobuf = filp->f_iobuf;
 1592         if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
 1593                 /*
 1594                  * A parallel read/write is using the preallocated iobuf
 1595          * so take the slow path and allocate a new one.
 1596                  */
 1597                 retval = alloc_kiovec(1, &iobuf);
 1598                 if (retval)
 1599                         goto out;
 1600                 new_iobuf = 1;
 1601         }
 1602 
 1603         blocksize = 1 << inode->i_blkbits;
 1604         blocksize_bits = inode->i_blkbits;
 1605         blocksize_mask = blocksize - 1;
 1606         chunk_size = KIO_MAX_ATOMIC_IO << 10;
 1607 
 1608         retval = -EINVAL;
 1609         if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
 1610                 goto out_free;
 1611         if (!have_mapping_directIO(mapping))
 1612                 goto out_free;
 1613 
 1614         if ((rw == READ) && (offset + count > size))
 1615                 count = size - offset;
 1616 
 1617         /*
 1618          * Flush only the _data_ to disk; metadata must remain
 1619          * completely asynchronous or performance will go to /dev/null.
 1620          */
 1621         retval = filemap_fdatasync(mapping);
 1622         if (retval == 0)
 1623                 retval = fsync_inode_data_buffers(inode);
 1624         if (retval == 0)
 1625                 retval = filemap_fdatawait(mapping);
 1626         if (retval < 0)
 1627                 goto out_free;
 1628 
 1629         progress = retval = 0;
 1630         while (count > 0) {
 1631                 iosize = count;
 1632                 if (iosize > chunk_size)
 1633                         iosize = chunk_size;
 1634 
 1635                 retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
 1636                 if (retval)
 1637                         break;
 1638 
 1639                 retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
 1640 
 1641                 if (rw == READ && retval > 0)
 1642                         mark_dirty_kiobuf(iobuf, retval);
 1643                 
 1644                 if (retval >= 0) {
 1645                         count -= retval;
 1646                         buf += retval;
 1647                         /* warning: weird semantics here, we're reporting a read beyond the end of the file */
 1648                         progress += retval;
 1649                 }
 1650 
 1651                 unmap_kiobuf(iobuf);
 1652 
 1653                 if (retval != iosize)
 1654                         break;
 1655         }
 1656 
 1657         if (progress)
 1658                 retval = progress;
 1659 
 1660  out_free:
 1661         if (!new_iobuf)
 1662                 clear_bit(0, &filp->f_iobuf_lock);
 1663         else
 1664                 free_kiovec(1, &iobuf);
 1665  out:   
 1666         return retval;
 1667 }
 1668 
 1669 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 1670 {
 1671         char *kaddr;
 1672         unsigned long left, count = desc->count;
 1673 
 1674         if (size > count)
 1675                 size = count;
 1676 
 1677         kaddr = kmap(page);
 1678         left = __copy_to_user(desc->buf, kaddr + offset, size);
 1679         kunmap(page);
 1680         
 1681         if (left) {
 1682                 size -= left;
 1683                 desc->error = -EFAULT;
 1684         }
 1685         desc->count = count - size;
 1686         desc->written += size;
 1687         desc->buf += size;
 1688         return size;
 1689 }
 1690 
 1691 /*
 1692  * This is the "read()" routine for all filesystems
 1693  * that can use the page cache directly.
 1694  */
 1695 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
 1696 {
 1697         ssize_t retval;
 1698 
 1699         if ((ssize_t) count < 0)
 1700                 return -EINVAL;
 1701 
 1702         if (filp->f_flags & O_DIRECT)
 1703                 goto o_direct;
 1704 
 1705         retval = -EFAULT;
 1706         if (access_ok(VERIFY_WRITE, buf, count)) {
 1707                 retval = 0;
 1708 
 1709                 if (count) {
 1710                         read_descriptor_t desc;
 1711 
 1712                         desc.written = 0;
 1713                         desc.count = count;
 1714                         desc.buf = buf;
 1715                         desc.error = 0;
 1716                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
 1717 
 1718                         retval = desc.written;
 1719                         if (!retval)
 1720                                 retval = desc.error;
 1721                 }
 1722         }
 1723  out:
 1724         return retval;
 1725 
 1726  o_direct:
 1727         {
 1728                 loff_t pos = *ppos, size;
 1729                 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
 1730                 struct inode *inode = mapping->host;
 1731 
 1732                 retval = 0;
 1733                 if (!count)
 1734                         goto out; /* skip atime */
 1735                 down_read(&inode->i_alloc_sem);
 1736                 down(&inode->i_sem);
 1737                 size = inode->i_size;
 1738                 if (pos < size) {
 1739                         retval = generic_file_direct_IO(READ, filp, buf, count, pos);
 1740                         if (retval > 0)
 1741                                 *ppos = pos + retval;
 1742                 }
 1743                 up(&inode->i_sem);
 1744                 up_read(&inode->i_alloc_sem);
 1745                 UPDATE_ATIME(filp->f_dentry->d_inode);
 1746                 goto out;
 1747         }
 1748 }
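
/*
 * For illustration only: a minimal user-space sketch of an O_DIRECT read
 * that satisfies the alignment checks made in generic_file_direct_IO()
 * above (file offset, transfer length and user buffer must all be
 * multiples of the filesystem block size).  The path "/tmp/data" and the
 * 4096-byte block size are assumptions made for the example.
 */
#if 0	/* user-space usage sketch, not kernel code */
#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static int read_one_block_direct(void)
{
	void *buf;
	ssize_t n;
	int fd;

	fd = open("/tmp/data", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return -1;
	/* Buffer address, length and file offset all block aligned. */
	if (posix_memalign(&buf, 4096, 4096)) {
		close(fd);
		return -1;
	}
	n = pread(fd, buf, 4096, 0);
	free(buf);
	close(fd);
	return n < 0 ? -1 : 0;
}
#endif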
 1749 
 1750 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
 1751 {
 1752         ssize_t written;
 1753         unsigned long count = desc->count;
 1754         struct file *file = (struct file *) desc->buf;
 1755 
 1756         if (size > count)
 1757                 size = count;
 1758 
 1759         if (file->f_op->sendpage) {
 1760                 written = file->f_op->sendpage(file, page, offset,
 1761                                                size, &file->f_pos, size<count);
 1762         } else {
 1763                 char *kaddr;
 1764                 mm_segment_t old_fs;
 1765 
 1766                 old_fs = get_fs();
 1767                 set_fs(KERNEL_DS);
 1768 
 1769                 kaddr = kmap(page);
 1770                 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
 1771                 kunmap(page);
 1772 
 1773                 set_fs(old_fs);
 1774         }
 1775         if (written < 0) {
 1776                 desc->error = written;
 1777                 written = 0;
 1778         }
 1779         desc->count = count - written;
 1780         desc->written += written;
 1781         return written;
 1782 }
 1783 
 1784 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
 1785 {
 1786         ssize_t retval;
 1787         struct file * in_file, * out_file;
 1788         struct inode * in_inode, * out_inode;
 1789 
 1790         /*
 1791          * Get input file, and verify that it is ok..
 1792          */
 1793         retval = -EBADF;
 1794         in_file = fget(in_fd);
 1795         if (!in_file)
 1796                 goto out;
 1797         if (!(in_file->f_mode & FMODE_READ))
 1798                 goto fput_in;
 1799         retval = -EINVAL;
 1800         in_inode = in_file->f_dentry->d_inode;
 1801         if (!in_inode)
 1802                 goto fput_in;
 1803         if (!in_inode->i_mapping->a_ops->readpage)
 1804                 goto fput_in;
 1805         retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
 1806         if (retval)
 1807                 goto fput_in;
 1808 
 1809         /*
 1810          * Get output file, and verify that it is ok..
 1811          */
 1812         retval = -EBADF;
 1813         out_file = fget(out_fd);
 1814         if (!out_file)
 1815                 goto fput_in;
 1816         if (!(out_file->f_mode & FMODE_WRITE))
 1817                 goto fput_out;
 1818         retval = -EINVAL;
 1819         if (!out_file->f_op || !out_file->f_op->write)
 1820                 goto fput_out;
 1821         out_inode = out_file->f_dentry->d_inode;
 1822         retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
 1823         if (retval)
 1824                 goto fput_out;
 1825 
 1826         retval = 0;
 1827         if (count) {
 1828                 read_descriptor_t desc;
 1829                 
 1830                 if (!offset)
 1831                         offset = &in_file->f_pos;
 1832 
 1833                 desc.written = 0;
 1834                 desc.count = count;
 1835                 desc.buf = (char *) out_file;
 1836                 desc.error = 0;
 1837                 do_generic_file_read(in_file, offset, &desc, file_send_actor);
 1838 
 1839                 retval = desc.written;
 1840                 if (!retval)
 1841                         retval = desc.error;
 1842         }
 1843 
 1844 fput_out:
 1845         fput(out_file);
 1846 fput_in:
 1847         fput(in_file);
 1848 out:
 1849         return retval;
 1850 }
 1851 
 1852 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
 1853 {
 1854         loff_t pos, *ppos = NULL;
 1855         ssize_t ret;
 1856         if (offset) {
 1857                 off_t off;
 1858                 if (unlikely(get_user(off, offset)))
 1859                         return -EFAULT;
 1860                 pos = off;
 1861                 ppos = &pos;
 1862         }
 1863         ret = common_sendfile(out_fd, in_fd, ppos, count);
 1864         if (offset)
 1865                 put_user((off_t)pos, offset);
 1866         return ret;
 1867 }
 1868 
 1869 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
 1870 {
 1871         loff_t pos, *ppos = NULL;
 1872         ssize_t ret;
 1873         if (offset) {
 1874                 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
 1875                         return -EFAULT;
 1876                 ppos = &pos;
 1877         }
 1878         ret = common_sendfile(out_fd, in_fd, ppos, count);
 1879         if (offset)
 1880                 put_user(pos, offset);
 1881         return ret;
 1882 }
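
/*
 * For illustration only: a minimal user-space sketch of sendfile(2) as
 * implemented by common_sendfile() above.  The input must be a readable,
 * page-cache backed file (it needs a readpage method); on this kernel the
 * output descriptor only needs a write (or sendpage) method.  The
 * descriptors are assumed to have been opened by the caller.
 */
#if 0	/* user-space usage sketch, not kernel code */
#include <sys/sendfile.h>

/* Copy "count" bytes from the start of in_fd to out_fd. */
static ssize_t copy_with_sendfile(int out_fd, int in_fd, size_t count)
{
	off_t offset = 0;	/* updated by the kernel as data is sent */

	return sendfile(out_fd, in_fd, &offset, count);
}
#endif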
 1883 
 1884 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
 1885 {
 1886         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 1887         unsigned long max;
 1888 
 1889         if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
 1890                 return -EINVAL;
 1891 
 1892         /* Limit it to the size of the file.. */
 1893         max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
 1894         if (index > max)
 1895                 return 0;
 1896         max -= index;
 1897         if (nr > max)
 1898                 nr = max;
 1899 
 1900         /* And limit it to a sane percentage of the inactive list.. */
 1901         max = nr_inactive_pages / 2;
 1902         if (nr > max)
 1903                 nr = max;
 1904 
 1905         while (nr) {
 1906                 page_cache_read(file, index);
 1907                 index++;
 1908                 nr--;
 1909         }
 1910         return 0;
 1911 }
 1912 
 1913 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
 1914 {
 1915         ssize_t ret;
 1916         struct file *file;
 1917 
 1918         ret = -EBADF;
 1919         file = fget(fd);
 1920         if (file) {
 1921                 if (file->f_mode & FMODE_READ) {
 1922                         unsigned long start = offset >> PAGE_CACHE_SHIFT;
 1923                         unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
 1924                         ret = do_readahead(file, start, len);
 1925                 }
 1926                 fput(file);
 1927         }
 1928         return ret;
 1929 }
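
/*
 * For illustration only: a user-space sketch of readahead(2), which maps
 * onto sys_readahead() above.  The call populates the page cache for the
 * given byte range and returns before the I/O completes; the descriptor
 * and the one-megabyte range are assumptions made for the example.
 */
#if 0	/* user-space usage sketch, not kernel code */
#define _GNU_SOURCE		/* for readahead() */
#include <fcntl.h>

static void prefetch_first_mb(int fd)
{
	/* Hint the kernel to start reading the first megabyte now. */
	readahead(fd, 0, 1024 * 1024);
}
#endif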
 1930 
 1931 /*
 1932  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
 1933  * sure this is sequential access, we don't need a flexible read-ahead
 1934  * window size -- we can always use a large fixed size window.
 1935  */
 1936 static void nopage_sequential_readahead(struct vm_area_struct * vma,
 1937         unsigned long pgoff, unsigned long filesize)
 1938 {
 1939         unsigned long ra_window;
 1940 
 1941         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
 1942         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
 1943 
 1944         /* vm_raend is zero if we haven't read ahead in this area yet.  */
 1945         if (vma->vm_raend == 0)
 1946                 vma->vm_raend = vma->vm_pgoff + ra_window;
 1947 
 1948         /*
 1949          * If we've just faulted the page half-way through our window,
 1950          * then schedule reads for the next window, and release the
 1951          * pages in the previous window.
 1952          */
 1953         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
 1954                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
 1955                 unsigned long end = start + ra_window;
 1956 
 1957                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
 1958                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
 1959                 if (start > end)
 1960                         return;
 1961 
 1962                 while ((start < end) && (start < filesize)) {
 1963                         if (read_cluster_nonblocking(vma->vm_file,
 1964                                                         start, filesize) < 0)
 1965                                 break;
 1966                         start += CLUSTER_PAGES;
 1967                 }
 1968                 run_task_queue(&tq_disk);
 1969 
 1970                 /* if we're far enough past the beginning of this area,
 1971                    recycle pages that are in the previous window. */
 1972                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
 1973                         unsigned long window = ra_window << PAGE_SHIFT;
 1974 
 1975                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
 1976                         end -= window + window;
 1977                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
 1978                 }
 1979 
 1980                 vma->vm_raend += ra_window;
 1981         }
 1982 
 1983         return;
 1984 }
 1985 
 1986 /*
 1987  * filemap_nopage() is invoked via the vma operations vector for a
 1988  * mapped memory region to read in file data during a page fault.
 1989  *
 1990  * The goto's are kind of ugly, but this streamlines the normal case of having
 1991  * it in the page cache, and handles the special cases reasonably without
 1992  * having a lot of duplicated code.
 1993  */
 1994 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
 1995 {
 1996         int error;
 1997         struct file *file = area->vm_file;
 1998         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 1999         struct inode *inode = mapping->host;
 2000         struct page *page, **hash;
 2001         unsigned long size, pgoff, endoff;
 2002 
 2003         pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 2004         endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 2005 
 2006 retry_all:
 2007         /*
 2008          * An external ptracer can access pages that normally aren't
 2009          * accessible..
 2010          */
 2011         size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 2012         if ((pgoff >= size) && (area->vm_mm == current->mm))
 2013                 return NULL;
 2014 
 2015         /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
 2016         if (size > endoff)
 2017                 size = endoff;
 2018 
 2019         /*
 2020          * Do we have something in the page cache already?
 2021          */
 2022         hash = page_hash(mapping, pgoff);
 2023 retry_find:
 2024         page = __find_get_page(mapping, pgoff, hash);
 2025         if (!page)
 2026                 goto no_cached_page;
 2027 
 2028         /*
 2029          * Ok, found a page in the page cache, now we need to check
 2030          * that it's up-to-date.
 2031          */
 2032         if (!Page_Uptodate(page))
 2033                 goto page_not_uptodate;
 2034 
 2035 success:
 2036         /*
 2037          * Try read-ahead for sequential areas.
 2038          */
 2039         if (VM_SequentialReadHint(area))
 2040                 nopage_sequential_readahead(area, pgoff, size);
 2041 
 2042         /*
 2043          * Found the page and have a reference on it, need to check sharing
 2044          * and possibly copy it over to another page..
 2045          */
 2046         mark_page_accessed(page);
 2047         flush_page_to_ram(page);
 2048         return page;
 2049 
 2050 no_cached_page:
 2051         /*
 2052          * If the requested offset is within our file, try to read a whole 
 2053          * cluster of pages at once.
 2054          *
 2055          * Otherwise, we're off the end of a privately mapped file,
 2056          * so we need to map a zero page.
 2057          */
 2058         if ((pgoff < size) && !VM_RandomReadHint(area))
 2059                 error = read_cluster_nonblocking(file, pgoff, size);
 2060         else
 2061                 error = page_cache_read(file, pgoff);
 2062 
 2063         /*
 2064          * The page we want has now been added to the page cache.
 2065          * In the unlikely event that someone removed it in the
 2066          * meantime, we'll just come back here and read it again.
 2067          */
 2068         if (error >= 0)
 2069                 goto retry_find;
 2070 
 2071         /*
 2072          * An error return from page_cache_read can result if the
 2073          * system is low on memory, or a problem occurs while trying
 2074          * to schedule I/O.
 2075          */
 2076         if (error == -ENOMEM)
 2077                 return NOPAGE_OOM;
 2078         return NULL;
 2079 
 2080 page_not_uptodate:
 2081         lock_page(page);
 2082 
 2083         /* Did it get unhashed while we waited for it? */
 2084         if (!page->mapping) {
 2085                 UnlockPage(page);
 2086                 page_cache_release(page);
 2087                 goto retry_all;
 2088         }
 2089 
 2090         /* Did somebody else get it up-to-date? */
 2091         if (Page_Uptodate(page)) {
 2092                 UnlockPage(page);
 2093                 goto success;
 2094         }
 2095 
 2096         if (!mapping->a_ops->readpage(file, page)) {
 2097                 wait_on_page(page);
 2098                 if (Page_Uptodate(page))
 2099                         goto success;
 2100         }
 2101 
 2102         /*
 2103          * Umm, take care of errors if the page isn't up-to-date.
 2104          * Try to re-read it _once_. We do this synchronously,
 2105          * because there really aren't any performance issues here
 2106          * and we need to check for errors.
 2107          */
 2108         lock_page(page);
 2109 
 2110         /* Somebody truncated the page on us? */
 2111         if (!page->mapping) {
 2112                 UnlockPage(page);
 2113                 page_cache_release(page);
 2114                 goto retry_all;
 2115         }
 2116 
 2117         /* Somebody else successfully read it in? */
 2118         if (Page_Uptodate(page)) {
 2119                 UnlockPage(page);
 2120                 goto success;
 2121         }
 2122         ClearPageError(page);
 2123         if (!mapping->a_ops->readpage(file, page)) {
 2124                 wait_on_page(page);
 2125                 if (Page_Uptodate(page))
 2126                         goto success;
 2127         }
 2128 
 2129         /*
 2130          * Things didn't work out. Return zero to tell the
 2131          * mm layer so, possibly freeing the page cache page first.
 2132          */
 2133         page_cache_release(page);
 2134         return NULL;
 2135 }
 2136 
 2137 /* Called with mm->page_table_lock held to protect against other
 2138  * threads/the swapper from ripping pte's out from under us.
 2139  */
 2140 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 2141         unsigned long address, unsigned int flags)
 2142 {
 2143         pte_t pte = *ptep;
 2144 
 2145         if (pte_present(pte)) {
 2146                 struct page *page = pte_page(pte);
 2147                 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
 2148                         flush_tlb_page(vma, address);
 2149                         set_page_dirty(page);
 2150                 }
 2151         }
 2152         return 0;
 2153 }
 2154 
 2155 static inline int filemap_sync_pte_range(pmd_t * pmd,
 2156         unsigned long address, unsigned long size, 
 2157         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
 2158 {
 2159         pte_t * pte;
 2160         unsigned long end;
 2161         int error;
 2162 
 2163         if (pmd_none(*pmd))
 2164                 return 0;
 2165         if (pmd_bad(*pmd)) {
 2166                 pmd_ERROR(*pmd);
 2167                 pmd_clear(pmd);
 2168                 return 0;
 2169         }
 2170         pte = pte_offset(pmd, address);
 2171         offset += address & PMD_MASK;
 2172         address &= ~PMD_MASK;
 2173         end = address + size;
 2174         if (end > PMD_SIZE)
 2175                 end = PMD_SIZE;
 2176         error = 0;
 2177         do {
 2178                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
 2179                 address += PAGE_SIZE;
 2180                 pte++;
 2181         } while (address && (address < end));
 2182         return error;
 2183 }
 2184 
 2185 static inline int filemap_sync_pmd_range(pgd_t * pgd,
 2186         unsigned long address, unsigned long size, 
 2187         struct vm_area_struct *vma, unsigned int flags)
 2188 {
 2189         pmd_t * pmd;
 2190         unsigned long offset, end;
 2191         int error;
 2192 
 2193         if (pgd_none(*pgd))
 2194                 return 0;
 2195         if (pgd_bad(*pgd)) {
 2196                 pgd_ERROR(*pgd);
 2197                 pgd_clear(pgd);
 2198                 return 0;
 2199         }
 2200         pmd = pmd_offset(pgd, address);
 2201         offset = address & PGDIR_MASK;
 2202         address &= ~PGDIR_MASK;
 2203         end = address + size;
 2204         if (end > PGDIR_SIZE)
 2205                 end = PGDIR_SIZE;
 2206         error = 0;
 2207         do {
 2208                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
 2209                 address = (address + PMD_SIZE) & PMD_MASK;
 2210                 pmd++;
 2211         } while (address && (address < end));
 2212         return error;
 2213 }
 2214 
 2215 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
 2216         size_t size, unsigned int flags)
 2217 {
 2218         pgd_t * dir;
 2219         unsigned long end = address + size;
 2220         int error = 0;
 2221 
 2222         /* Acquire the lock early; it may be possible to avoid dropping
 2223          * and reacquiring it repeatedly.
 2224          */
 2225         spin_lock(&vma->vm_mm->page_table_lock);
 2226 
 2227         dir = pgd_offset(vma->vm_mm, address);
 2228         flush_cache_range(vma->vm_mm, end - size, end);
 2229         if (address >= end)
 2230                 BUG();
 2231         do {
 2232                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
 2233                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
 2234                 dir++;
 2235         } while (address && (address < end));
 2236         flush_tlb_range(vma->vm_mm, end - size, end);
 2237 
 2238         spin_unlock(&vma->vm_mm->page_table_lock);
 2239 
 2240         return error;
 2241 }
 2242 
 2243 static struct vm_operations_struct generic_file_vm_ops = {
 2244         nopage:         filemap_nopage,
 2245 };
 2246 
 2247 /* This is used for a general mmap of a disk file */
 2248 
 2249 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 2250 {
 2251         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 2252         struct inode *inode = mapping->host;
 2253 
 2254         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
 2255                 if (!mapping->a_ops->writepage)
 2256                         return -EINVAL;
 2257         }
 2258         if (!mapping->a_ops->readpage)
 2259                 return -ENOEXEC;
 2260         UPDATE_ATIME(inode);
 2261         vma->vm_ops = &generic_file_vm_ops;
 2262         return 0;
 2263 }
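
/*
 * For illustration only: a minimal user-space sketch of the mmap(2) call
 * that ends up in generic_file_mmap() above for most disk filesystems.
 * A writable shared mapping additionally requires the filesystem to
 * provide writepage(), as checked above.  The descriptor is assumed to
 * have been opened for reading by the caller.
 */
#if 0	/* user-space usage sketch, not kernel code */
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static void *map_whole_file(int fd, size_t *len)
{
	struct stat st;

	if (fstat(fd, &st) < 0)
		return MAP_FAILED;
	*len = st.st_size;
	/* Read-only shared file mapping; faults are served by filemap_nopage(). */
	return mmap(NULL, *len, PROT_READ, MAP_SHARED, fd, 0);
}
#endif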
 2264 
 2265 /*
 2266  * The msync() system call.
 2267  */
 2268 
 2269 /*
 2270  * MS_SYNC syncs the entire file - including mappings.
 2271  *
 2272  * MS_ASYNC initiates writeout of just the dirty mapped data.
 2273  * This provides no guarantee of file integrity - things like indirect
 2274  * blocks may not have started writeout.  MS_ASYNC is primarily useful
 2275  * where the application knows that it has finished with the data and
 2276  * wishes to intelligently schedule its own I/O traffic.
 2277  */
 2278 static int msync_interval(struct vm_area_struct * vma,
 2279         unsigned long start, unsigned long end, int flags)
 2280 {
 2281         int ret = 0;
 2282         struct file * file = vma->vm_file;
 2283 
 2284         if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
 2285                 return -EBUSY;
 2286 
 2287         if (file && (vma->vm_flags & VM_SHARED)) {
 2288                 ret = filemap_sync(vma, start, end-start, flags);
 2289 
 2290                 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
 2291                         struct inode * inode = file->f_dentry->d_inode;
 2292 
 2293                         down(&inode->i_sem);
 2294                         ret = filemap_fdatasync(inode->i_mapping);
 2295                         if (flags & MS_SYNC) {
 2296                                 int err;
 2297 
 2298                                 if (file->f_op && file->f_op->fsync) {
 2299                                         err = file->f_op->fsync(file, file->f_dentry, 1);
 2300                                         if (err && !ret)
 2301                                                 ret = err;
 2302                                 }
 2303                                 err = filemap_fdatawait(inode->i_mapping);
 2304                                 if (err && !ret)
 2305                                         ret = err;
 2306                         }
 2307                         up(&inode->i_sem);
 2308                 }
 2309         }
 2310         return ret;
 2311 }
 2312 
 2313 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 2314 {
 2315         unsigned long end;
 2316         struct vm_area_struct * vma;
 2317         int unmapped_error, error = -EINVAL;
 2318 
 2319         down_read(&current->mm->mmap_sem);
 2320         if (start & ~PAGE_MASK)
 2321                 goto out;
 2322         len = (len + ~PAGE_MASK) & PAGE_MASK;
 2323         end = start + len;
 2324         if (end < start)
 2325                 goto out;
 2326         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 2327                 goto out;
 2328         if ((flags & MS_ASYNC) && (flags & MS_SYNC))
 2329                 goto out;
 2330 
 2331         error = 0;
 2332         if (end == start)
 2333                 goto out;
 2334         /*
 2335          * If the interval [start,end) covers some unmapped address ranges,
 2336          * just ignore them, but return -ENOMEM at the end.
 2337          */
 2338         vma = find_vma(current->mm, start);
 2339         unmapped_error = 0;
 2340         for (;;) {
 2341                 /* Still start < end. */
 2342                 error = -ENOMEM;
 2343                 if (!vma)
 2344                         goto out;
 2345                 /* Here start < vma->vm_end. */
 2346                 if (start < vma->vm_start) {
 2347                         unmapped_error = -ENOMEM;
 2348                         start = vma->vm_start;
 2349                 }
 2350                 /* Here vma->vm_start <= start < vma->vm_end. */
 2351                 if (end <= vma->vm_end) {
 2352                         if (start < end) {
 2353                                 error = msync_interval(vma, start, end, flags);
 2354                                 if (error)
 2355                                         goto out;
 2356                         }
 2357                         error = unmapped_error;
 2358                         goto out;
 2359                 }
 2360                 /* Here vma->vm_start <= start < vma->vm_end < end. */
 2361                 error = msync_interval(vma, start, vma->vm_end, flags);
 2362                 if (error)
 2363                         goto out;
 2364                 start = vma->vm_end;
 2365                 vma = vma->vm_next;
 2366         }
 2367 out:
 2368         up_read(&current->mm->mmap_sem);
 2369         return error;
 2370 }
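
/*
 * For illustration only: a user-space sketch of msync(2) on a shared file
 * mapping, contrasting MS_ASYNC (start writeout of dirty mapped data) with
 * MS_SYNC (write and wait, syncing the whole file as described above).
 * The mapping is assumed to have been established with MAP_SHARED.
 */
#if 0	/* user-space usage sketch, not kernel code */
#include <sys/mman.h>

static int flush_mapping(void *addr, size_t len, int wait)
{
	/* MS_ASYNC only schedules the writeout; MS_SYNC also waits for it. */
	return msync(addr, len, wait ? MS_SYNC : MS_ASYNC);
}
#endif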
 2371 
 2372 static inline void setup_read_behavior(struct vm_area_struct * vma,
 2373         int behavior)
 2374 {
 2375         VM_ClearReadHint(vma);
 2376         switch(behavior) {
 2377                 case MADV_SEQUENTIAL:
 2378                         vma->vm_flags |= VM_SEQ_READ;
 2379                         break;
 2380                 case MADV_RANDOM:
 2381                         vma->vm_flags |= VM_RAND_READ;
 2382                         break;
 2383                 default:
 2384                         break;
 2385         }
 2386         return;
 2387 }
 2388 
 2389 static long madvise_fixup_start(struct vm_area_struct * vma,
 2390         unsigned long end, int behavior)
 2391 {
 2392         struct vm_area_struct * n;
 2393         struct mm_struct * mm = vma->vm_mm;
 2394 
 2395         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 2396         if (!n)
 2397                 return -EAGAIN;
 2398         *n = *vma;
 2399         n->vm_end = end;
 2400         setup_read_behavior(n, behavior);
 2401         n->vm_raend = 0;
 2402         if (n->vm_file)
 2403                 get_file(n->vm_file);
 2404         if (n->vm_ops && n->vm_ops->open)
 2405                 n->vm_ops->open(n);
 2406         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
 2407         lock_vma_mappings(vma);
 2408         spin_lock(&mm->page_table_lock);
 2409         vma->vm_start = end;
 2410         __insert_vm_struct(mm, n);
 2411         spin_unlock(&mm->page_table_lock);
 2412         unlock_vma_mappings(vma);
 2413         return 0;
 2414 }
 2415 
 2416 static long madvise_fixup_end(struct vm_area_struct * vma,
 2417         unsigned long start, int behavior)
 2418 {
 2419         struct vm_area_struct * n;
 2420         struct mm_struct * mm = vma->vm_mm;
 2421 
 2422         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 2423         if (!n)
 2424                 return -EAGAIN;
 2425         *n = *vma;
 2426         n->vm_start = start;
 2427         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
 2428         setup_read_behavior(n, behavior);
 2429         n->vm_raend = 0;
 2430         if (n->vm_file)
 2431                 get_file(n->vm_file);
 2432         if (n->vm_ops && n->vm_ops->open)
 2433                 n->vm_ops->open(n);
 2434         lock_vma_mappings(vma);
 2435         spin_lock(&mm->page_table_lock);
 2436         vma->vm_end = start;
 2437         __insert_vm_struct(mm, n);
 2438         spin_unlock(&mm->page_table_lock);
 2439         unlock_vma_mappings(vma);
 2440         return 0;
 2441 }
 2442 
 2443 static long madvise_fixup_middle(struct vm_area_struct * vma,
 2444         unsigned long start, unsigned long end, int behavior)
 2445 {
 2446         struct vm_area_struct * left, * right;
 2447         struct mm_struct * mm = vma->vm_mm;
 2448 
 2449         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 2450         if (!left)
 2451                 return -EAGAIN;
 2452         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 2453         if (!right) {
 2454                 kmem_cache_free(vm_area_cachep, left);
 2455                 return -EAGAIN;
 2456         }
 2457         *left = *vma;
 2458         *right = *vma;
 2459         left->vm_end = start;
 2460         right->vm_start = end;
 2461         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
 2462         left->vm_raend = 0;
 2463         right->vm_raend = 0;
 2464         if (vma->vm_file)
 2465                 atomic_add(2, &vma->vm_file->f_count);
 2466 
 2467         if (vma->vm_ops && vma->vm_ops->open) {
 2468                 vma->vm_ops->open(left);
 2469                 vma->vm_ops->open(right);
 2470         }
 2471         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
 2472         vma->vm_raend = 0;
 2473         lock_vma_mappings(vma);
 2474         spin_lock(&mm->page_table_lock);
 2475         vma->vm_start = start;
 2476         vma->vm_end = end;
 2477         setup_read_behavior(vma, behavior);
 2478         __insert_vm_struct(mm, left);
 2479         __insert_vm_struct(mm, right);
 2480         spin_unlock(&mm->page_table_lock);
 2481         unlock_vma_mappings(vma);
 2482         return 0;
 2483 }
 2484 
 2485 /*
 2486  * We can potentially split a vm area into separate
 2487  * areas, each area with its own behavior.
 2488  */
 2489 static long madvise_behavior(struct vm_area_struct * vma,
 2490         unsigned long start, unsigned long end, int behavior)
 2491 {
 2492         int error = 0;
 2493 
 2494         /* This caps the number of vma's this process can own */
 2495         if (vma->vm_mm->map_count > max_map_count)
 2496                 return -ENOMEM;
 2497 
 2498         if (start == vma->vm_start) {
 2499                 if (end == vma->vm_end) {
 2500                         setup_read_behavior(vma, behavior);
 2501                         vma->vm_raend = 0;
 2502                 } else
 2503                         error = madvise_fixup_start(vma, end, behavior);
 2504         } else {
 2505                 if (end == vma->vm_end)
 2506                         error = madvise_fixup_end(vma, start, behavior);
 2507                 else
 2508                         error = madvise_fixup_middle(vma, start, end, behavior);
 2509         }
 2510 
 2511         return error;
 2512 }
 2513 
 2514 /*
 2515  * Schedule all required I/O operations, then run the disk queue
 2516  * to make sure they are started.  Do not wait for completion.
 2517  */
 2518 static long madvise_willneed(struct vm_area_struct * vma,
 2519         unsigned long start, unsigned long end)
 2520 {
 2521         long error = -EBADF;
 2522         struct file * file;
 2523         struct inode * inode;
 2524         unsigned long size, rlim_rss;
 2525 
 2526         /* Doesn't work if there's no mapped file. */
 2527         if (!vma->vm_file)
 2528                 return error;
 2529         file = vma->vm_file;
 2530         inode = file->f_dentry->d_inode;
 2531         if (!inode->i_mapping->a_ops->readpage)
 2532                 return error;
 2533         size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 2534 
 2535         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 2536         if (end > vma->vm_end)
 2537                 end = vma->vm_end;
 2538         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 2539 
 2540         /* Make sure this doesn't exceed the process's max rss. */
 2541         error = -EIO;
 2542         rlim_rss = current->rlim ?  current->rlim[RLIMIT_RSS].rlim_cur :
 2543                                 LONG_MAX; /* default: see resource.h */
 2544         if ((vma->vm_mm->rss + (end - start)) > rlim_rss)
 2545                 return error;
 2546 
 2547         /* round to cluster boundaries if this isn't a "random" area. */
 2548         if (!VM_RandomReadHint(vma)) {
 2549                 start = CLUSTER_OFFSET(start);
 2550                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
 2551 
 2552                 while ((start < end) && (start < size)) {
 2553                         error = read_cluster_nonblocking(file, start, size);
 2554                         start += CLUSTER_PAGES;
 2555                         if (error < 0)
 2556                                 break;
 2557                 }
 2558         } else {
 2559                 while ((start < end) && (start < size)) {
 2560                         error = page_cache_read(file, start);
 2561                         start++;
 2562                         if (error < 0)
 2563                                 break;
 2564                 }
 2565         }
 2566 
 2567         /* Don't wait for someone else to push these requests. */
 2568         run_task_queue(&tq_disk);
 2569 
 2570         return error;
 2571 }
 2572 
 2573 /*
 2574  * Application no longer needs these pages.  If the pages are dirty,
 2575  * it's OK to just throw them away.  The app will be more careful about
 2576  * data it wants to keep.  Be sure to free swap resources too.  The
 2577  * zap_page_range call sets things up for refill_inactive to actually free
 2578  * these pages later if no one else has touched them in the meantime,
 2579  * although we could add these pages to a global reuse list for
 2580  * refill_inactive to pick up before reclaiming other pages.
 2581  *
 2582  * NB: This interface discards data rather than pushes it out to swap,
 2583  * as some implementations do.  This has performance implications for
 2584  * applications like large transactional databases which want to discard
 2585  * pages in anonymous maps after committing to backing store the data
 2586  * that was kept in them.  There is no reason to write this data out to
 2587  * the swap area if the application is discarding it.
 2588  *
 2589  * An interface that causes the system to free clean pages and flush
 2590  * dirty pages is already available as msync(MS_INVALIDATE).
 2591  */
 2592 static long madvise_dontneed(struct vm_area_struct * vma,
 2593         unsigned long start, unsigned long end)
 2594 {
 2595         if (vma->vm_flags & VM_LOCKED)
 2596                 return -EINVAL;
 2597 
 2598         zap_page_range(vma->vm_mm, start, end - start);
 2599         return 0;
 2600 }
 2601 
 2602 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
 2603         unsigned long end, int behavior)
 2604 {
 2605         long error = -EBADF;
 2606 
 2607         switch (behavior) {
 2608         case MADV_NORMAL:
 2609         case MADV_SEQUENTIAL:
 2610         case MADV_RANDOM:
 2611                 error = madvise_behavior(vma, start, end, behavior);
 2612                 break;
 2613 
 2614         case MADV_WILLNEED:
 2615                 error = madvise_willneed(vma, start, end);
 2616                 break;
 2617 
 2618         case MADV_DONTNEED:
 2619                 error = madvise_dontneed(vma, start, end);
 2620                 break;
 2621 
 2622         default:
 2623                 error = -EINVAL;
 2624                 break;
 2625         }
 2626                 
 2627         return error;
 2628 }
 2629 
 2630 /*
 2631  * The madvise(2) system call.
 2632  *
 2633  * Applications can use madvise() to advise the kernel how it should
 2634  * handle paging I/O in this VM area.  The idea is to help the kernel
 2635  * use appropriate read-ahead and caching techniques.  The information
 2636  * provided is advisory only, and can be safely disregarded by the
 2637  * kernel without affecting the correct operation of the application.
 2638  *
 2639  * behavior values:
 2640  *  MADV_NORMAL - the default behavior is to read clusters.  This
 2641  *              results in some read-ahead and read-behind.
 2642  *  MADV_RANDOM - the system should read the minimum amount of data
 2643  *              on any access, since it is unlikely that the
 2644  *              application will need more than what it asks for.
 2645  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 2646  *              once, so they can be aggressively read ahead, and
 2647  *              can be freed soon after they are accessed.
 2648  *  MADV_WILLNEED - the application is notifying the system to read
 2649  *              some pages ahead.
 2650  *  MADV_DONTNEED - the application is finished with the given range,
 2651  *              so the kernel can free resources associated with it.
 2652  *
 2653  * return values:
 2654  *  zero    - success
 2655  *  -EINVAL - start + len < 0, start is not page-aligned,
 2656  *              "behavior" is not a valid value, or application
 2657  *              is attempting to release locked or shared pages.
 2658  *  -ENOMEM - addresses in the specified range are not currently
 2659  *              mapped, or are outside the AS of the process.
 2660  *  -EIO    - an I/O error occurred while paging in data.
 2661  *  -EBADF  - map exists, but area maps something that isn't a file.
 2662  *  -EAGAIN - a kernel resource was temporarily unavailable.
 2663  */
 2664 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
 2665 {
 2666         unsigned long end;
 2667         struct vm_area_struct * vma;
 2668         int unmapped_error = 0;
 2669         int error = -EINVAL;
 2670 
 2671         down_write(&current->mm->mmap_sem);
 2672 
 2673         if (start & ~PAGE_MASK)
 2674                 goto out;
 2675         len = (len + ~PAGE_MASK) & PAGE_MASK;
 2676         end = start + len;
 2677         if (end < start)
 2678                 goto out;
 2679 
 2680         error = 0;
 2681         if (end == start)
 2682                 goto out;
 2683 
 2684         /*
 2685          * If the interval [start,end) covers some unmapped address
 2686          * ranges, just ignore them, but return -ENOMEM at the end.
 2687          */
 2688         vma = find_vma(current->mm, start);
 2689         for (;;) {
 2690                 /* Still start < end. */
 2691                 error = -ENOMEM;
 2692                 if (!vma)
 2693                         goto out;
 2694 
 2695                 /* Here start < vma->vm_end. */
 2696                 if (start < vma->vm_start) {
 2697                         unmapped_error = -ENOMEM;
 2698                         start = vma->vm_start;
 2699                 }
 2700 
 2701                 /* Here vma->vm_start <= start < vma->vm_end. */
 2702                 if (end <= vma->vm_end) {
 2703                         if (start < end) {
 2704                                 error = madvise_vma(vma, start, end,
 2705                                                         behavior);
 2706                                 if (error)
 2707                                         goto out;
 2708                         }
 2709                         error = unmapped_error;
 2710                         goto out;
 2711                 }
 2712 
 2713                 /* Here vma->vm_start <= start < vma->vm_end < end. */
 2714                 error = madvise_vma(vma, start, vma->vm_end, behavior);
 2715                 if (error)
 2716                         goto out;
 2717                 start = vma->vm_end;
 2718                 vma = vma->vm_next;
 2719         }
 2720 
 2721 out:
 2722         up_write(&current->mm->mmap_sem);
 2723         return error;
 2724 }
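
/*
 * For illustration only: a user-space sketch of madvise(2) matching the
 * behavior values documented above.  The address and length are assumed
 * to describe an existing file-backed mapping.
 */
#if 0	/* user-space usage sketch, not kernel code */
#include <sys/mman.h>

static void advise_sequential_scan(void *addr, size_t len)
{
	/* Ask for a large, fixed read-ahead window while scanning... */
	madvise(addr, len, MADV_SEQUENTIAL);
}

static void drop_when_done(void *addr, size_t len)
{
	/* ...and let the kernel reclaim the pages once we are finished with them. */
	madvise(addr, len, MADV_DONTNEED);
}
#endif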
 2725 
 2726 /*
 2727  * Later we can get more picky about what "in core" means precisely.
 2728  * For now, simply check to see if the page is in the page cache,
 2729  * and is up to date; i.e. that no page-in operation would be required
 2730  * at this time if an application were to map and access this page.
 2731  */
 2732 static unsigned char mincore_page(struct vm_area_struct * vma,
 2733         unsigned long pgoff)
 2734 {
 2735         unsigned char present = 0;
 2736         struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
 2737         struct page * page, ** hash = page_hash(as, pgoff);
 2738 
 2739         spin_lock(&pagecache_lock);
 2740         page = __find_page_nolock(as, pgoff, *hash);
 2741         if ((page) && (Page_Uptodate(page)))
 2742                 present = 1;
 2743         spin_unlock(&pagecache_lock);
 2744 
 2745         return present;
 2746 }
 2747 
 2748 static long mincore_vma(struct vm_area_struct * vma,
 2749         unsigned long start, unsigned long end, unsigned char * vec)
 2750 {
 2751         long error, i, remaining;
 2752         unsigned char * tmp;
 2753 
 2754         error = -ENOMEM;
 2755         if (!vma->vm_file)
 2756                 return error;
 2757 
 2758         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 2759         if (end > vma->vm_end)
 2760                 end = vma->vm_end;
 2761         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 2762 
 2763         error = -EAGAIN;
 2764         tmp = (unsigned char *) __get_free_page(GFP_KERNEL);
 2765         if (!tmp)
 2766                 return error;
 2767 
 2768         /* (end - start) is # of pages, and also # of bytes in "vec" */
 2769         remaining = (end - start);
 2770 
 2771         error = 0;
 2772         for (i = 0; remaining > 0; remaining -= PAGE_SIZE, i++) {
 2773                 int j = 0;
 2774                 long thispiece = (remaining < PAGE_SIZE) ?
 2775                                                 remaining : PAGE_SIZE;
 2776 
 2777                 while (j < thispiece)
 2778                         tmp[j++] = mincore_page(vma, start++);
 2779 
 2780                 if (copy_to_user(vec + PAGE_SIZE * i, tmp, thispiece)) {
 2781                         error = -EFAULT;
 2782                         break;
 2783                 }
 2784         }
 2785 
 2786         free_page((unsigned long) tmp);
 2787         return error;
 2788 }
 2789 
 2790 /*
 2791  * The mincore(2) system call.
 2792  *
 2793  * mincore() returns the memory residency status of the pages in the
 2794  * current process's address space specified by [addr, addr + len).
 2795  * The status is returned in a vector of bytes.  The least significant
 2796  * bit of each byte is 1 if the referenced page is in memory, otherwise
 2797  * it is zero.
 2798  *
 2799  * Because the status of a page can change after mincore() checks it
 2800  * but before it returns to the application, the returned vector may
 2801  * contain stale information.  Only locked pages are guaranteed to
 2802  * remain in memory.
 2803  *
 2804  * return values:
 2805  *  zero    - success
 2806  *  -EFAULT - vec points to an illegal address
 2807  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE,
 2808  *              or len has a nonpositive value
 2809  *  -ENOMEM - Addresses in the range [addr, addr + len] are
 2810  *              invalid for the address space of this process, or
 2811  *              specify one or more pages which are not currently
 2812  *              mapped
 2813  *  -EAGAIN - A kernel resource was temporarily unavailable.
 2814  */
 2815 asmlinkage long sys_mincore(unsigned long start, size_t len,
 2816         unsigned char * vec)
 2817 {
 2818         int index = 0;
 2819         unsigned long end;
 2820         struct vm_area_struct * vma;
 2821         int unmapped_error = 0;
 2822         long error = -EINVAL;
 2823 
 2824         down_read(&current->mm->mmap_sem);
 2825 
 2826         if (start & ~PAGE_CACHE_MASK)
 2827                 goto out;
 2828         len = (len + ~PAGE_CACHE_MASK) & PAGE_CACHE_MASK;
 2829         end = start + len;
 2830         if (end < start)
 2831                 goto out;
 2832 
 2833         error = 0;
 2834         if (end == start)
 2835                 goto out;
 2836 
 2837         /*
 2838          * If the interval [start,end) covers some unmapped address
 2839          * ranges, just ignore them, but return -ENOMEM at the end.
 2840          */
 2841         vma = find_vma(current->mm, start);
 2842         for (;;) {
 2843                 /* Still start < end. */
 2844                 error = -ENOMEM;
 2845                 if (!vma)
 2846                         goto out;
 2847 
 2848                 /* Here start < vma->vm_end. */
 2849                 if (start < vma->vm_start) {
 2850                         unmapped_error = -ENOMEM;
 2851                         start = vma->vm_start;
 2852                 }
 2853 
 2854                 /* Here vma->vm_start <= start < vma->vm_end. */
 2855                 if (end <= vma->vm_end) {
 2856                         if (start < end) {
 2857                                 error = mincore_vma(vma, start, end,
 2858                                                         &vec[index]);
 2859                                 if (error)
 2860                                         goto out;
 2861                         }
 2862                         error = unmapped_error;
 2863                         goto out;
 2864                 }
 2865 
 2866                 /* Here vma->vm_start <= start < vma->vm_end < end. */
 2867                 error = mincore_vma(vma, start, vma->vm_end, &vec[index]);
 2868                 if (error)
 2869                         goto out;
 2870                 index += (vma->vm_end - start) >> PAGE_CACHE_SHIFT;
 2871                 start = vma->vm_end;
 2872                 vma = vma->vm_next;
 2873         }
 2874 
 2875 out:
 2876         up_read(&current->mm->mmap_sem);
 2877         return error;
 2878 }
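
/*
 * For illustration only: a user-space sketch of mincore(2) as described
 * above.  The vector needs one byte per page in the range; addr is assumed
 * to be page aligned and part of an existing mapping.
 */
#if 0	/* user-space usage sketch, not kernel code */
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

/* Return the number of resident pages in [addr, addr + len), or -1. */
static long count_resident_pages(void *addr, size_t len)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t pages = (len + page_size - 1) / page_size;
	unsigned char *vec = malloc(pages);
	long resident = 0;
	size_t i;

	if (!vec)
		return -1;
	if (mincore(addr, len, vec) < 0) {
		free(vec);
		return -1;
	}
	for (i = 0; i < pages; i++)
		resident += vec[i] & 1;	/* bit 0: page is resident and up to date */
	free(vec);
	return resident;
}
#endif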
 2879 
 2880 static inline
 2881 struct page *__read_cache_page(struct address_space *mapping,
 2882                                 unsigned long index,
 2883                                 int (*filler)(void *,struct page*),
 2884                                 void *data)
 2885 {
 2886         struct page **hash = page_hash(mapping, index);
 2887         struct page *page, *cached_page = NULL;
 2888         int err;
 2889 repeat:
 2890         page = __find_get_page(mapping, index, hash);
 2891         if (!page) {
 2892                 if (!cached_page) {
 2893                         cached_page = page_cache_alloc(mapping);
 2894                         if (!cached_page)
 2895                                 return ERR_PTR(-ENOMEM);
 2896                 }
 2897                 page = cached_page;
 2898                 if (add_to_page_cache_unique(page, mapping, index, hash))
 2899                         goto repeat;
 2900                 cached_page = NULL;
 2901                 err = filler(data, page);
 2902                 if (err < 0) {
 2903                         page_cache_release(page);
 2904                         page = ERR_PTR(err);
 2905                 }
 2906         }
 2907         if (cached_page)
 2908                 page_cache_release(cached_page);
 2909         return page;
 2910 }
 2911 
 2912 /*
 2913  * Read into the page cache. If a page already exists,
 2914  * and Page_Uptodate() is not set, try to fill the page.
 2915  */
 2916 struct page *read_cache_page(struct address_space *mapping,
 2917                                 unsigned long index,
 2918                                 int (*filler)(void *,struct page*),
 2919                                 void *data)
 2920 {
 2921         struct page *page;
 2922         int err;
 2923 
 2924 retry:
 2925         page = __read_cache_page(mapping, index, filler, data);
 2926         if (IS_ERR(page))
 2927                 goto out;
 2928         mark_page_accessed(page);
 2929         if (Page_Uptodate(page))
 2930                 goto out;
 2931 
 2932         lock_page(page);
 2933         if (!page->mapping) {
 2934                 UnlockPage(page);
 2935                 page_cache_release(page);
 2936                 goto retry;
 2937         }
 2938         if (Page_Uptodate(page)) {
 2939                 UnlockPage(page);
 2940                 goto out;
 2941         }
 2942         err = filler(data, page);
 2943         if (err < 0) {
 2944                 page_cache_release(page);
 2945                 page = ERR_PTR(err);
 2946         }
 2947  out:
 2948         return page;
 2949 }
 2950 
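/*
 * Illustrative sketch (not part of filemap.c): how a filesystem might use
 * read_cache_page() with its own readpage routine as the filler.  A filler
 * may start asynchronous I/O, so the caller still waits and re-checks
 * Page_Uptodate() before using the data.  example_filler and
 * example_get_page are hypothetical names.
 */
static int example_filler(void *data, struct page *page)
{
        struct address_space *mapping = data;

        return mapping->a_ops->readpage(NULL, page);
}

static struct page *example_get_page(struct address_space *mapping,
                                     unsigned long index)
{
        struct page *page;

        page = read_cache_page(mapping, index, example_filler, mapping);
        if (IS_ERR(page))
                return page;
        wait_on_page(page);
        if (!Page_Uptodate(page)) {
                page_cache_release(page);
                return ERR_PTR(-EIO);
        }
        return page;
}
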
 2951 static inline struct page * __grab_cache_page(struct address_space *mapping,
 2952                                 unsigned long index, struct page **cached_page)
 2953 {
 2954         struct page *page, **hash = page_hash(mapping, index);
 2955 repeat:
 2956         page = __find_lock_page(mapping, index, hash);
 2957         if (!page) {
 2958                 if (!*cached_page) {
 2959                         *cached_page = page_cache_alloc(mapping);
 2960                         if (!*cached_page)
 2961                                 return NULL;
 2962                 }
 2963                 page = *cached_page;
 2964                 if (add_to_page_cache_unique(page, mapping, index, hash))
 2965                         goto repeat;
 2966                 *cached_page = NULL;
 2967         }
 2968         return page;
 2969 }
 2970 
 2971 inline void remove_suid(struct inode *inode)
 2972 {
 2973         unsigned int mode;
 2974 
 2975         /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
 2976         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
 2977 
 2978         /* was any of the uid bits set? */
 2979         mode &= inode->i_mode;
 2980         if (mode && !capable(CAP_FSETID)) {
 2981                 inode->i_mode &= ~mode;
 2982                 mark_inode_dirty(inode);
 2983         }
 2984 }
 2985 
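/*
 * Illustrative sketch (not part of filemap.c): the multiply in remove_suid()
 * above is a branch-free way of writing the logic below.  S_ISGID without
 * S_IXGRP denotes mandatory locking rather than setgid, so it is left alone.
 * suid_bits_to_clear is a hypothetical helper name.
 */
static unsigned int suid_bits_to_clear(unsigned int i_mode)
{
        unsigned int mode = S_ISUID;            /* always consider setuid */

        if (i_mode & S_IXGRP)                   /* setgid only with group-exec */
                mode |= S_ISGID;
        return mode & i_mode;                   /* keep only the bits actually set */
}
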
 2986 /*
 2987  * precheck_file_write():
 2988  * Check the conditions on a file descriptor prior to beginning a write
 2989  * on it.  Contains the common precheck code for both buffered and direct
 2990  * IO.
 2991  */
 2992 int precheck_file_write(struct file *file, struct inode *inode,
 2993                         size_t *count, loff_t *ppos)
 2994 {
 2995         ssize_t         err;
 2996         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 2997         loff_t          pos = *ppos;
 2998         
 2999         err = -EINVAL;
 3000         if (pos < 0)
 3001                 goto out;
 3002 
 3003         err = file->f_error;
 3004         if (err) {
 3005                 file->f_error = 0;
 3006                 goto out;
 3007         }
 3008 
 3009         /* FIXME: this is for backwards compatibility with 2.4 */
 3010         if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
 3011                 *ppos = pos = inode->i_size;
 3012 
 3013         /*
 3014          * Check whether we've reached the file size limit.
 3015          */
 3016         err = -EFBIG;
 3017         
 3018         if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
 3019                 if (pos >= limit) {
 3020                         send_sig(SIGXFSZ, current, 0);
 3021                         goto out;
 3022                 }
 3023                 if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
 3024                         /* send_sig(SIGXFSZ, current, 0); */
 3025                         *count = limit - (u32)pos;
 3026                 }
 3027         }
 3028 
 3029         /*
 3030          *      LFS rule 
 3031          */
 3032         if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
 3033                 if (pos >= MAX_NON_LFS) {
 3034                         send_sig(SIGXFSZ, current, 0);
 3035                         goto out;
 3036                 }
 3037                 if (*count > MAX_NON_LFS - (u32)pos) {
 3038                         /* send_sig(SIGXFSZ, current, 0); */
 3039                         *count = MAX_NON_LFS - (u32)pos;
 3040                 }
 3041         }
 3042 
 3043         /*
 3044          *      Are we about to exceed the fs block limit ?
 3045          *
 3046          *      If we have written data, it becomes a short write.
 3047          *      If we would exceed the limit without having written
 3048          *      any data, we send a signal and return EFBIG.
 3049          *
 3050          *      Linus's frestrict idea will clean these up nicely..
 3051          */
 3052          
 3053         if (!S_ISBLK(inode->i_mode)) {
 3054                 if (pos >= inode->i_sb->s_maxbytes)
 3055                 {
 3056                         if (*count || pos > inode->i_sb->s_maxbytes) {
 3057                                 send_sig(SIGXFSZ, current, 0);
 3058                                 err = -EFBIG;
 3059                                 goto out;
 3060                         }
 3061                         /* zero-length writes at ->s_maxbytes are OK */
 3062                 }
 3063 
 3064                 if (pos + *count > inode->i_sb->s_maxbytes)
 3065                         *count = inode->i_sb->s_maxbytes - pos;
 3066         } else {
 3067                 if (is_read_only(inode->i_rdev)) {
 3068                         err = -EPERM;
 3069                         goto out;
 3070                 }
 3071                 if (pos >= inode->i_size) {
 3072                         if (*count || pos > inode->i_size) {
 3073                                 err = -ENOSPC;
 3074                                 goto out;
 3075                         }
 3076                 }
 3077 
 3078                 if (pos + *count > inode->i_size)
 3079                         *count = inode->i_size - pos;
 3080         }
 3081 
 3082         err = 0;
 3083 out:
 3084         return err;
 3085 }
 3086 
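/*
 * Illustrative sketch (not part of filemap.c): the clamping that
 * precheck_file_write() applies to a regular-file write, restated in
 * simplified form as a stand-alone helper.  Signal delivery, the u32
 * casts and the block-device branch are omitted; example_clamp_count
 * is a hypothetical name.
 */
static ssize_t example_clamp_count(loff_t pos, size_t count,
                                   unsigned long limit,   /* RLIMIT_FSIZE   */
                                   loff_t maxbytes,       /* sb->s_maxbytes */
                                   int largefile)         /* O_LARGEFILE?   */
{
        if (limit != RLIM_INFINITY) {
                if (pos >= limit)
                        return -EFBIG;          /* SIGXFSZ in the real code */
                if (count > limit - pos)
                        count = limit - pos;    /* becomes a short write */
        }
        if (!largefile && pos + count > MAX_NON_LFS) {
                if (pos >= MAX_NON_LFS)
                        return -EFBIG;
                count = MAX_NON_LFS - pos;
        }
        if (pos >= maxbytes && count)
                return -EFBIG;
        if (pos + count > maxbytes)
                count = maxbytes - pos;
        return count;
}
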
 3087 /*
 3088  * Write to a file through the page cache. 
 3089  *
 3090  * We currently put everything into the page cache prior to writing it.
 3091  * This is not a problem when writing full pages. With partial pages,
 3092  * however, we first have to read the data into the cache, then
 3093  * dirty the page, and finally schedule it for writing. Alternatively, we
 3094  * could write-through just the portion of data that would go into that
 3095  * page, but that would kill performance for applications that write data
 3096  * line by line, and it's prone to race conditions.
 3097  *
 3098  * Note that this routine doesn't try to keep track of dirty pages. Each
 3099  * file system has to do this all by itself, unfortunately.
 3100  *                                                      okir@monad.swb.de
 3101  */
 3102 ssize_t
 3103 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 3104 {
 3105         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 3106         struct inode    *inode = mapping->host;
 3107         loff_t          pos;
 3108         struct page     *page, *cached_page;
 3109         ssize_t         written;
 3110         long            status = 0;
 3111         ssize_t         err;
 3112         unsigned        bytes;
 3113 
 3114         cached_page = NULL;
 3115         pos = *ppos;
 3116         written = 0;
 3117 
 3118         err = precheck_file_write(file, inode, &count, &pos);
 3119         if (err != 0 || count == 0)
 3120                 goto out;
 3121 
 3122         remove_suid(inode);
 3123         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 3124         mark_inode_dirty_sync(inode);
 3125 
 3126         do {
 3127                 unsigned long index, offset;
 3128                 long page_fault;
 3129                 char *kaddr;
 3130 
 3131                 /*
 3132                  * Try to find the page in the cache. If it isn't there,
 3133                  * allocate a free page.
 3134                  */
 3135                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 3136                 index = pos >> PAGE_CACHE_SHIFT;
 3137                 bytes = PAGE_CACHE_SIZE - offset;
 3138                 if (bytes > count)
 3139                         bytes = count;
 3140 
 3141                 /*
 3142                  * Bring in the user page that we will copy from _first_.
 3143                  * Otherwise there's a nasty deadlock on copying from the
 3144                  * same page as we're writing to, without it being marked
 3145                  * up-to-date.
 3146                  */
 3147                 { volatile unsigned char dummy;
 3148                         __get_user(dummy, buf);
 3149                         __get_user(dummy, buf+bytes-1);
 3150                 }
 3151 
 3152                 status = -ENOMEM;       /* we'll assign it later anyway */
 3153                 page = __grab_cache_page(mapping, index, &cached_page);
 3154                 if (!page)
 3155                         break;
 3156 
 3157                 /* We have exclusive IO access to the page.. */
 3158                 if (!PageLocked(page)) {
 3159                         PAGE_BUG(page);
 3160                 }
 3161 
 3162                 kaddr = kmap(page);
 3163                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
 3164                 if (status)
 3165                         goto sync_failure;
 3166                 page_fault = __copy_from_user(kaddr+offset, buf, bytes);
 3167                 flush_dcache_page(page);
 3168                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
 3169                 if (page_fault)
 3170                         goto fail_write;
 3171                 if (!status)
 3172                         status = bytes;
 3173 
 3174                 if (status >= 0) {
 3175                         written += status;
 3176                         count -= status;
 3177                         pos += status;
 3178                         buf += status;
 3179                 }
 3180 unlock:
 3181                 kunmap(page);
 3182                 /* Mark it unlocked again and drop the page.. */
 3183                 SetPageReferenced(page);
 3184                 UnlockPage(page);
 3185                 page_cache_release(page);
 3186 
 3187                 if (status < 0)
 3188                         break;
 3189         } while (count);
 3190 done:
 3191         *ppos = pos;
 3192 
 3193         if (cached_page)
 3194                 page_cache_release(cached_page);
 3195 
 3196         /* For now, when the user asks for O_SYNC, we'll actually
 3197          * provide O_DSYNC. */
 3198         if (status >= 0) {
 3199                 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
 3200                         status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
 3201         }
 3202         
 3203         err = written ? written : status;
 3204 out:
 3205 
 3206         return err;
 3207 fail_write:
 3208         status = -EFAULT;
 3209         goto unlock;
 3210 
 3211 sync_failure:
 3212         /*
 3213          * If blocksize < pagesize, prepare_write() may have instantiated a
 3214          * few blocks outside i_size.  Trim these off again.
 3215          */
 3216         kunmap(page);
 3217         UnlockPage(page);
 3218         page_cache_release(page);
 3219         if (pos + bytes > inode->i_size)
 3220                 vmtruncate(inode, inode->i_size);
 3221         goto done;
 3222 }
 3223 
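/*
 * Illustrative sketch (not part of filemap.c): how a simple block-based
 * filesystem typically hooks into the buffered write loop above.  Its
 * prepare_write() maps (and, for a partial page, reads) the blocks backing
 * [offset, offset+bytes), and commit_write() dirties them and updates
 * i_size.  example_get_block is a hypothetical stand-in for the
 * filesystem's block-mapping routine.
 */
extern int example_get_block(struct inode *, long, struct buffer_head *, int);

static int example_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, example_get_block);
}

static int example_prepare_write(struct file *file, struct page *page,
                                 unsigned from, unsigned to)
{
        return block_prepare_write(page, from, to, example_get_block);
}

static struct address_space_operations example_aops = {
        readpage:       example_readpage,
        prepare_write:  example_prepare_write,
        commit_write:   generic_commit_write,
};
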
 3224 ssize_t
 3225 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 3226 {
 3227         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 3228         struct inode    *inode = mapping->host;
 3229         loff_t          pos;
 3230         ssize_t         written;
 3231         long            status = 0;
 3232         ssize_t         err;
 3233 
 3234         pos = *ppos;
 3235         written = 0;
 3236 
 3237         err = precheck_file_write(file, inode, &count, &pos);
 3238         if (err != 0 || count == 0)
 3239                 goto out;
 3240 
 3241         if (!(file->f_flags & O_DIRECT))
 3242                 BUG();
 3243 
 3244         remove_suid(inode);
 3245         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 3246         mark_inode_dirty_sync(inode);
 3247 
 3248         written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
 3249         if (written > 0) {
 3250                 loff_t end = pos + written;
 3251                 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
 3252                         inode->i_size = end;
 3253                         mark_inode_dirty(inode);
 3254                 }
 3255                 *ppos = end;
 3256                 invalidate_inode_pages2(mapping);
 3257         }
 3258         /*
 3259          * Sync the fs metadata, but not the minor inode changes and
 3260          * not the data: the IO itself went straight to disk via DMA.
 3261          */
 3262         if (written >= 0 && file->f_flags & O_SYNC)
 3263                 status = generic_osync_inode(inode, OSYNC_METADATA);
 3264 
 3265         err = written ? written : status;
 3266 out:
 3267         return err;
 3268 }
 3269 
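/*
 * Illustrative userspace sketch (not part of filemap.c): a write that ends
 * up in the O_DIRECT path above.  Direct IO generally requires the buffer,
 * file offset and length to be suitably aligned; the 512-byte alignment and
 * 4096-byte size used here are assumptions, not guarantees made by this
 * file.  example_direct_write is a hypothetical name.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static ssize_t example_direct_write(const char *path)
{
        void *buf;
        ssize_t n = -1;
        int fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);

        if (fd < 0)
                return -1;
        if (posix_memalign(&buf, 512, 4096) == 0) {
                memset(buf, 'x', 4096);
                n = write(fd, buf, 4096);       /* bypasses the page cache */
                free(buf);
        }
        close(fd);
        return n;
}
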
 3270 static int do_odirect_fallback(struct file *file, struct inode *inode,
 3271                                const char *buf, size_t count, loff_t *ppos)
 3272 {
 3273         ssize_t ret;
 3274         int err;
 3275 
 3276         down(&inode->i_sem);
 3277         ret = do_generic_file_write(file, buf, count, ppos);
 3278         if (ret > 0) {
 3279                 err = do_fdatasync(file);
 3280                 if (err)
 3281                         ret = err;
 3282         }
 3283         up(&inode->i_sem);
 3284         return ret;
 3285 }
 3286 
 3287 ssize_t
 3288 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 3289 {
 3290         struct inode    *inode = file->f_dentry->d_inode->i_mapping->host;
 3291         ssize_t         err;
 3292 
 3293         if ((ssize_t) count < 0)
 3294                 return -EINVAL;
 3295 
 3296         if (!access_ok(VERIFY_READ, buf, count))
 3297                 return -EFAULT;
 3298 
 3299         if (file->f_flags & O_DIRECT) {
 3300                 /* do_generic_direct_write may drop i_sem during the
 3301                    actual IO */
 3302                 down_read(&inode->i_alloc_sem);
 3303                 down(&inode->i_sem);
 3304                 err = do_generic_direct_write(file, buf, count, ppos);
 3305                 up(&inode->i_sem);
 3306                 up_read(&inode->i_alloc_sem);
 3307                 if (unlikely(err == -ENOTBLK))
 3308                         err = do_odirect_fallback(file, inode, buf, count, ppos);
 3309         } else {
 3310                 down(&inode->i_sem);
 3311                 err = do_generic_file_write(file, buf, count, ppos);
 3312                 up(&inode->i_sem);
 3313         }
 3314 
 3315         return err;
 3316 }
 3317 
 3318 void __init page_cache_init(unsigned long mempages)
 3319 {
 3320         unsigned long htable_size, order;
 3321 
 3322         htable_size = mempages;
 3323         htable_size *= sizeof(struct page *);
 3324         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
 3325                 ;
 3326 
 3327         do {
 3328                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
 3329 
 3330                 page_hash_bits = 0;
 3331                 while((tmp >>= 1UL) != 0UL)
 3332                         page_hash_bits++;
 3333 
 3334                 page_hash_table = (struct page **)
 3335                         __get_free_pages(GFP_ATOMIC, order);
 3336         } while(page_hash_table == NULL && --order > 0);
 3337 
 3338         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
 3339                (1 << page_hash_bits), order, (PAGE_SIZE << order));
 3340         if (!page_hash_table)
 3341                 panic("Failed to allocate page hash table\n");
 3342         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
 3343 }
