
FreeBSD/Linux Kernel Cross Reference
sys/mm/memory.c


    1 /*
    2  *  linux/mm/memory.c
    3  *
    4  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    5  */
    6 
    7 /*
    8  * demand-loading started 01.12.91 - seems it is high on the list of
    9  * things wanted, and it should be easy to implement. - Linus
   10  */
   11 
   12 /*
   13  * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
   14  * pages started 02.12.91, seems to work. - Linus.
   15  *
   16  * Tested sharing by executing about 30 /bin/sh: under the old kernel it
   17  * would have taken more than the 6M I have free, but it worked well as
   18  * far as I could see.
   19  *
   20  * Also corrected some "invalidate()"s - I wasn't doing enough of them.
   21  */
   22 
   23 /*
   24  * Real VM (paging to/from disk) started 18.12.91. Much more work and
   25  * thought has to go into this. Oh, well..
   26  * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
   27  *              Found it. Everything seems to work now.
   28  * 20.12.91  -  Ok, making the swap-device changeable like the root.
   29  */
   30 
   31 /*
   32  * 05.04.94  -  Multi-page memory management added for v1.1.
   33  *              Idea by Alex Bligh (alex@cconcepts.co.uk)
   34  *
   35  * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
   36  *              (Gerhard.Wichert@pdb.siemens.de)
   37  */
   38 
   39 #include <linux/mm.h>
   40 #include <linux/mman.h>
   41 #include <linux/swap.h>
   42 #include <linux/smp_lock.h>
   43 #include <linux/swapctl.h>
   44 #include <linux/iobuf.h>
   45 #include <linux/highmem.h>
   46 #include <linux/pagemap.h>
   47 #include <linux/module.h>
   48 
   49 #include <asm/pgalloc.h>
   50 #include <asm/uaccess.h>
   51 #include <asm/tlb.h>
   52 
   53 unsigned long max_mapnr;
   54 unsigned long num_physpages;
   55 unsigned long num_mappedpages;
   56 void * high_memory;
   57 struct page *highmem_start_page;
   58 
   59 /*
   60  * We special-case the C-O-W ZERO_PAGE, because it's such
   61  * a common occurrence (no need to read the page to know
   62  * that it's zero - better for the cache and memory subsystem).
   63  */
   64 static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
   65 {
   66         if (from == ZERO_PAGE(address)) {
   67                 clear_user_highpage(to, address);
   68                 return;
   69         }
   70         copy_user_highpage(to, from, address);
   71 }
   72 
   73 mem_map_t * mem_map;
   74 
   75 /*
   76  * Called by TLB shootdown 
   77  */
   78 void __free_pte(pte_t pte)
   79 {
   80         struct page *page = pte_page(pte);
   81         if ((!VALID_PAGE(page)) || PageReserved(page))
   82                 return;
   83         if (pte_dirty(pte))
   84                 set_page_dirty(page);           
   85         free_page_and_swap_cache(page);
   86 }
   87 
   88 
   89 /*
   90  * Note: this doesn't free the actual pages themselves. That
   91  * has been handled earlier when unmapping all the memory regions.
   92  */
   93 static inline void free_one_pmd(pmd_t * dir)
   94 {
   95         pte_t * pte;
   96 
   97         if (pmd_none(*dir))
   98                 return;
   99         if (pmd_bad(*dir)) {
  100                 pmd_ERROR(*dir);
  101                 pmd_clear(dir);
  102                 return;
  103         }
  104         pte = pte_offset(dir, 0);
  105         pmd_clear(dir);
  106         pte_free(pte);
  107 }
  108 
  109 static inline void free_one_pgd(pgd_t * dir)
  110 {
  111         int j;
  112         pmd_t * pmd;
  113 
  114         if (pgd_none(*dir))
  115                 return;
  116         if (pgd_bad(*dir)) {
  117                 pgd_ERROR(*dir);
  118                 pgd_clear(dir);
  119                 return;
  120         }
  121         pmd = pmd_offset(dir, 0);
  122         pgd_clear(dir);
  123         for (j = 0; j < PTRS_PER_PMD ; j++) {
  124                 prefetchw(pmd+j+(PREFETCH_STRIDE/16));
  125                 free_one_pmd(pmd+j);
  126         }
  127         pmd_free(pmd);
  128 }
  129 
  130 /* Low and high watermarks for page table cache.
  131    The system should try to have pgt_cache_water[0] <= cache elements <= pgt_cache_water[1]
  132  */
  133 int pgt_cache_water[2] = { 25, 50 };
  134 
  135 /* Returns the number of pages freed */
  136 int check_pgt_cache(void)
  137 {
  138         return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
  139 }
  140 
  141 
  142 /*
  143  * This function clears all user-level page tables of a process - this
  144  * is needed by execve(), so that old pages aren't in the way.
  145  */
  146 void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
  147 {
  148         pgd_t * page_dir = mm->pgd;
  149 
  150         spin_lock(&mm->page_table_lock);
  151         page_dir += first;
  152         do {
  153                 free_one_pgd(page_dir);
  154                 page_dir++;
  155         } while (--nr);
  156         spin_unlock(&mm->page_table_lock);
  157 
  158         /* keep the page table cache within bounds */
  159         check_pgt_cache();
  160 }
  161 
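/*
 * Illustrative sketch, not part of memory.c: roughly how the 2.4 exit
 * path uses clear_page_tables() once every vma of a dying mm has been
 * unmapped (compare exit_mmap() in mm/mmap.c).  The helper name is
 * hypothetical; FIRST_USER_PGD_NR and USER_PTRS_PER_PGD come from
 * <asm/pgtable.h>.
 */
static void release_user_page_tables(struct mm_struct *mm)
{
        /* The user pages themselves are already gone; this only frees
         * the now-empty pte/pmd tables and trims the page-table cache. */
        clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
}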
  162 #define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))
  163 #define PMD_TABLE_MASK  ((PTRS_PER_PMD-1) * sizeof(pmd_t))
  164 
  165 /*
  166  * copy one vm_area from one task to the other. Assumes the page tables
  167  * already present in the new task to be cleared in the whole range
  168  * covered by this vma.
  169  *
  170  * 08Jan98 Merged into one routine from several inline routines to reduce
  171  *         variable count and make things faster. -jj
  172  *
  173  * dst->page_table_lock is held on entry and exit,
  174  * but may be dropped within pmd_alloc() and pte_alloc().
  175  */
  176 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
  177                         struct vm_area_struct *vma)
  178 {
  179         pgd_t * src_pgd, * dst_pgd;
  180         unsigned long address = vma->vm_start;
  181         unsigned long end = vma->vm_end;
  182         unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  183 
  184         src_pgd = pgd_offset(src, address)-1;
  185         dst_pgd = pgd_offset(dst, address)-1;
  186 
  187         for (;;) {
  188                 pmd_t * src_pmd, * dst_pmd;
  189 
  190                 src_pgd++; dst_pgd++;
  191                 
  192                 /* copy_pmd_range */
  193                 
  194                 if (pgd_none(*src_pgd))
  195                         goto skip_copy_pmd_range;
  196                 if (pgd_bad(*src_pgd)) {
  197                         pgd_ERROR(*src_pgd);
  198                         pgd_clear(src_pgd);
  199 skip_copy_pmd_range:    address = (address + PGDIR_SIZE) & PGDIR_MASK;
  200                         if (!address || (address >= end))
  201                                 goto out;
  202                         continue;
  203                 }
  204 
  205                 src_pmd = pmd_offset(src_pgd, address);
  206                 dst_pmd = pmd_alloc(dst, dst_pgd, address);
  207                 if (!dst_pmd)
  208                         goto nomem;
  209 
  210                 do {
  211                         pte_t * src_pte, * dst_pte;
  212                 
  213                         /* copy_pte_range */
  214                 
  215                         if (pmd_none(*src_pmd))
  216                                 goto skip_copy_pte_range;
  217                         if (pmd_bad(*src_pmd)) {
  218                                 pmd_ERROR(*src_pmd);
  219                                 pmd_clear(src_pmd);
  220 skip_copy_pte_range:            address = (address + PMD_SIZE) & PMD_MASK;
  221                                 if (address >= end)
  222                                         goto out;
  223                                 goto cont_copy_pmd_range;
  224                         }
  225 
  226                         src_pte = pte_offset(src_pmd, address);
  227                         dst_pte = pte_alloc(dst, dst_pmd, address);
  228                         if (!dst_pte)
  229                                 goto nomem;
  230 
  231                         spin_lock(&src->page_table_lock);                       
  232                         do {
  233                                 pte_t pte = *src_pte;
  234                                 struct page *ptepage;
  235                                 
  236                                 /* copy_one_pte */
  237 
  238                                 if (pte_none(pte))
  239                                         goto cont_copy_pte_range_noset;
  240                                 if (!pte_present(pte)) {
  241                                         swap_duplicate(pte_to_swp_entry(pte));
  242                                         goto cont_copy_pte_range;
  243                                 }
  244                                 ptepage = pte_page(pte);
  245                                 if ((!VALID_PAGE(ptepage)) || 
  246                                     PageReserved(ptepage))
  247                                         goto cont_copy_pte_range;
  248 
  249                                 /* If it's a COW mapping, write protect it both in the parent and the child */
  250                                 if (cow && pte_write(pte)) {
  251                                         ptep_set_wrprotect(src_pte);
  252                                         pte = *src_pte;
  253                                 }
  254 
  255                                 /* If it's a shared mapping, mark it clean in the child */
  256                                 if (vma->vm_flags & VM_SHARED)
  257                                         pte = pte_mkclean(pte);
  258                                 pte = pte_mkold(pte);
  259                                 get_page(ptepage);
  260                                 dst->rss++;
  261 
  262 cont_copy_pte_range:            set_pte(dst_pte, pte);
  263 cont_copy_pte_range_noset:      address += PAGE_SIZE;
  264                                 if (address >= end)
  265                                         goto out_unlock;
  266                                 src_pte++;
  267                                 dst_pte++;
  268                         } while ((unsigned long)src_pte & PTE_TABLE_MASK);
  269                         spin_unlock(&src->page_table_lock);
  270                 
  271 cont_copy_pmd_range:    src_pmd++;
  272                         dst_pmd++;
  273                 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
  274         }
  275 out_unlock:
  276         spin_unlock(&src->page_table_lock);
  277 out:
  278         return 0;
  279 nomem:
  280         return -ENOMEM;
  281 }
  282 
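/*
 * Illustrative sketch, not part of memory.c: fork() is the classic caller
 * of copy_page_range() - dup_mmap() in kernel/fork.c links each duplicated
 * vma into the child and then copies its page tables, roughly like this
 * (helper name hypothetical, error handling trimmed):
 */
static inline int copy_vma_page_tables(struct mm_struct *child,
                                       struct mm_struct *parent,
                                       struct vm_area_struct *vma)
{
        /* The child's page_table_lock must be held, as documented above;
         * COW write-protection of the parent happens inside. */
        return copy_page_range(child, parent, vma);
}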
  283 /*
  284  * Sanity check: the pte handed in should already be empty - a stale mapping here is a bug
  285  */
  286 static inline void forget_pte(pte_t page)
  287 {
  288         if (!pte_none(page)) {
  289                 printk("forget_pte: old mapping existed!\n");
  290                 BUG();
  291         }
  292 }
  293 
  294 static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
  295 {
  296         unsigned long offset;
  297         pte_t * ptep;
  298         int freed = 0;
  299 
  300         if (pmd_none(*pmd))
  301                 return 0;
  302         if (pmd_bad(*pmd)) {
  303                 pmd_ERROR(*pmd);
  304                 pmd_clear(pmd);
  305                 return 0;
  306         }
  307         ptep = pte_offset(pmd, address);
  308         offset = address & ~PMD_MASK;
  309         if (offset + size > PMD_SIZE)
  310                 size = PMD_SIZE - offset;
  311         size &= PAGE_MASK;
  312         for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
  313                 pte_t pte = *ptep;
  314                 if (pte_none(pte))
  315                         continue;
  316                 if (pte_present(pte)) {
  317                         struct page *page = pte_page(pte);
  318                         if (VALID_PAGE(page) && !PageReserved(page))
  319                                 freed ++;
  320                         /* This will eventually call __free_pte on the pte. */
  321                         tlb_remove_page(tlb, ptep, address + offset);
  322                 } else {
  323                         free_swap_and_cache(pte_to_swp_entry(pte));
  324                         pte_clear(ptep);
  325                 }
  326         }
  327 
  328         return freed;
  329 }
  330 
  331 static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size)
  332 {
  333         pmd_t * pmd;
  334         unsigned long end;
  335         int freed;
  336 
  337         if (pgd_none(*dir))
  338                 return 0;
  339         if (pgd_bad(*dir)) {
  340                 pgd_ERROR(*dir);
  341                 pgd_clear(dir);
  342                 return 0;
  343         }
  344         pmd = pmd_offset(dir, address);
  345         end = address + size;
  346         if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
  347                 end = ((address + PGDIR_SIZE) & PGDIR_MASK);
  348         freed = 0;
  349         do {
  350                 freed += zap_pte_range(tlb, pmd, address, end - address);
  351                 address = (address + PMD_SIZE) & PMD_MASK; 
  352                 pmd++;
  353         } while (address < end);
  354         return freed;
  355 }
  356 
  357 /*
  358  * remove user pages in a given range.
  359  */
  360 void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
  361 {
  362         mmu_gather_t *tlb;
  363         pgd_t * dir;
  364         unsigned long start = address, end = address + size;
  365         int freed = 0;
  366 
  367         dir = pgd_offset(mm, address);
  368 
  369         /*
  370          * This is a long-lived spinlock. That's fine.
  371          * There's no contention, because the page table
  372          * lock only protects against kswapd anyway, and
  373          * even if kswapd happened to be looking at this
  374          * process we _want_ it to get stuck.
  375          */
  376         if (address >= end)
  377                 BUG();
  378         spin_lock(&mm->page_table_lock);
  379         flush_cache_range(mm, address, end);
  380         tlb = tlb_gather_mmu(mm);
  381 
  382         do {
  383                 freed += zap_pmd_range(tlb, dir, address, end - address);
  384                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
  385                 dir++;
  386         } while (address && (address < end));
  387 
  388         /* this will flush any remaining tlb entries */
  389         tlb_finish_mmu(tlb, start, end);
  390 
  391         /*
  392          * Update rss for the mm_struct (not necessarily current->mm)
  393          * Notice that rss is an unsigned long.
  394          */
  395         if (mm->rss > freed)
  396                 mm->rss -= freed;
  397         else
  398                 mm->rss = 0;
  399         spin_unlock(&mm->page_table_lock);
  400 }
  401 
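/*
 * Illustrative sketch, not part of memory.c: zap_page_range() is the
 * workhorse behind munmap() and truncation (see vmtruncate_list() later
 * in this file).  A caller that wants to drop every user mapping covered
 * by a vma does essentially this (helper name hypothetical):
 */
static void unmap_vma_pages(struct mm_struct *mm, struct vm_area_struct *vma)
{
        /* rss accounting and TLB shootdown are handled inside. */
        zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
}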
  402 /*
  403  * Do a quick page-table lookup for a single page. 
  404  */
  405 static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) 
  406 {
  407         pgd_t *pgd;
  408         pmd_t *pmd;
  409         pte_t *ptep, pte;
  410 
  411         pgd = pgd_offset(mm, address);
  412         if (pgd_none(*pgd) || pgd_bad(*pgd))
  413                 goto out;
  414 
  415         pmd = pmd_offset(pgd, address);
  416         if (pmd_none(*pmd) || pmd_bad(*pmd))
  417                 goto out;
  418 
  419         ptep = pte_offset(pmd, address);
  420         if (!ptep)
  421                 goto out;
  422 
  423         pte = *ptep;
  424         if (pte_present(pte)) {
  425                 if (!write ||
  426                     (pte_write(pte) && pte_dirty(pte)))
  427                         return pte_page(pte);
  428         }
  429 
  430 out:
  431         return 0;
  432 }
  433 
  434 /* 
  435  * Given the struct page returned by a page-table walk, is it a useful,
  436  * valid page?  This may become more complex in the future if we start
  437  * dealing with IO-aperture pages in kiobufs.
  438  */
  439 
  440 static inline struct page * get_page_map(struct page *page)
  441 {
  442         if (!VALID_PAGE(page))
  443                 return 0;
  444         return page;
  445 }
  446 
  447 /*
  448  * Please read Documentation/cachetlb.txt before using this function,
  449  * accessing foreign memory spaces can cause cache coherency problems.
  450  *
  451  * Accessing a VM_IO area is even more dangerous, therefore the function
  452  * fails if pages is != NULL and a VM_IO area is found.
  453  */
  454 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
  455                 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas)
  456 {
  457         int i;
  458         unsigned int flags;
  459 
  460         /*
  461          * Require read or write permissions.
  462          * If 'force' is set, we only require the "MAY" flags.
  463          */
  464         flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
  465         flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
  466         i = 0;
  467 
  468         do {
  469                 struct vm_area_struct * vma;
  470 
  471                 vma = find_extend_vma(mm, start);
  472 
  473                 if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) )
  474                         return i ? : -EFAULT;
  475 
  476                 spin_lock(&mm->page_table_lock);
  477                 do {
  478                         struct page *map;
  479                         while (!(map = follow_page(mm, start, write))) {
  480                                 spin_unlock(&mm->page_table_lock);
  481                                 switch (handle_mm_fault(mm, vma, start, write)) {
  482                                 case 1:
  483                                         tsk->min_flt++;
  484                                         break;
  485                                 case 2:
  486                                         tsk->maj_flt++;
  487                                         break;
  488                                 case 0:
  489                                         if (i) return i;
  490                                         return -EFAULT;
  491                                 default:
  492                                         if (i) return i;
  493                                         return -ENOMEM;
  494                                 }
  495                                 spin_lock(&mm->page_table_lock);
  496                         }
  497                         if (pages) {
  498                                 pages[i] = get_page_map(map);
  499                                 /* FIXME: call the correct function,
  500                                  * depending on the type of the found page
  501                                  */
  502                                 if (!pages[i])
  503                                         goto bad_page;
  504                                 page_cache_get(pages[i]);
  505                         }
  506                         if (vmas)
  507                                 vmas[i] = vma;
  508                         i++;
  509                         start += PAGE_SIZE;
  510                         len--;
  511                 } while(len && start < vma->vm_end);
  512                 spin_unlock(&mm->page_table_lock);
  513         } while(len);
  514 out:
  515         return i;
  516 
  517         /*
  518          * We found an invalid page in the VMA.  Release all we have
  519          * so far and fail.
  520          */
  521 bad_page:
  522         spin_unlock(&mm->page_table_lock);
  523         while (i--)
  524                 page_cache_release(pages[i]);
  525         i = -EFAULT;
  526         goto out;
  527 }
  528 
  529 EXPORT_SYMBOL(get_user_pages);
  530 
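/*
 * Illustrative sketch, not part of memory.c: pinning a single page of the
 * current process's address space, the way a driver might before doing
 * DMA to it.  The caller must hold mm->mmap_sem (map_user_kiobuf() below
 * shows the same pattern); "uaddr" and the helper name are hypothetical.
 */
static struct page *pin_one_user_page(unsigned long uaddr, int write)
{
        struct page *page;
        int res;

        down_read(&current->mm->mmap_sem);
        res = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                             1, write, 0 /* force */, &page, NULL);
        up_read(&current->mm->mmap_sem);

        if (res != 1)
                return NULL;
        /* get_user_pages() took a reference on the page; the caller must
         * drop it with page_cache_release() when finished. */
        return page;
}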
  531 /*
  532  * Force in an entire range of pages from the current process's user VA,
  533  * and pin them in physical memory.  
  534  */
  535 #define dprintk(x...)
  536 
  537 int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
  538 {
  539         int pgcount, err;
  540         struct mm_struct *      mm;
  541         
  542         /* Make sure the iobuf is not already mapped somewhere. */
  543         if (iobuf->nr_pages)
  544                 return -EINVAL;
  545 
  546         mm = current->mm;
  547         dprintk ("map_user_kiobuf: begin\n");
  548         
  549         pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
  550         /* mapping 0 bytes is not permitted */
  551         if (!pgcount) BUG();
  552         err = expand_kiobuf(iobuf, pgcount);
  553         if (err)
  554                 return err;
  555 
  556         iobuf->locked = 0;
  557         iobuf->offset = va & (PAGE_SIZE-1);
  558         iobuf->length = len;
  559         
  560         /* Try to fault in all of the necessary pages */
  561         down_read(&mm->mmap_sem);
  562         /* rw==READ means read from disk, write into memory area */
  563         err = get_user_pages(current, mm, va, pgcount,
  564                         (rw==READ), 0, iobuf->maplist, NULL);
  565         up_read(&mm->mmap_sem);
  566         if (err < 0) {
  567                 unmap_kiobuf(iobuf);
  568                 dprintk ("map_user_kiobuf: end %d\n", err);
  569                 return err;
  570         }
  571         iobuf->nr_pages = err;
  572         while (pgcount--) {
  573                 /* FIXME: flush superfluous for rw==READ,
  574                  * probably wrong function for rw==WRITE
  575                  */
  576                 flush_dcache_page(iobuf->maplist[pgcount]);
  577         }
  578         dprintk ("map_user_kiobuf: end OK\n");
  579         return 0;
  580 }
  581 
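/*
 * Illustrative sketch, not part of memory.c: the usual kiobuf life cycle
 * around map_user_kiobuf()/unmap_kiobuf(), as used by raw-I/O style
 * drivers.  alloc_kiovec()/free_kiovec() live in fs/iobuf.c; the helper
 * name is hypothetical and the actual transfer is only hinted at.
 */
static int transfer_user_buffer(int rw, unsigned long uaddr, size_t len)
{
        struct kiobuf *iobuf;
        int err;

        err = alloc_kiovec(1, &iobuf);
        if (err)
                return err;

        err = map_user_kiobuf(rw, iobuf, uaddr, len);
        if (!err) {
                /* ... drive the I/O here, e.g. with brw_kiovec() ... */
                unmap_kiobuf(iobuf);
        }
        free_kiovec(1, &iobuf);
        return err;
}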
  582 /*
  583  * Mark all of the pages in a kiobuf as dirty 
  584  *
  585  * We need to be able to deal with short reads from disk: if an IO error
  586  * occurs, the number of bytes read into memory may be less than the
  587  * size of the kiobuf, so we have to stop marking pages dirty once the
  588  * requested byte count has been reached.
  589  *
  590  * Must be called from process context - set_page_dirty() takes VFS locks.
  591  */
  592 
  593 void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
  594 {
  595         int index, offset, remaining;
  596         struct page *page;
  597         
  598         index = iobuf->offset >> PAGE_SHIFT;
  599         offset = iobuf->offset & ~PAGE_MASK;
  600         remaining = bytes;
  601         if (remaining > iobuf->length)
  602                 remaining = iobuf->length;
  603         
  604         while (remaining > 0 && index < iobuf->nr_pages) {
  605                 page = iobuf->maplist[index];
  606                 
  607                 if (!PageReserved(page))
  608                         set_page_dirty(page);
  609 
  610                 remaining -= (PAGE_SIZE - offset);
  611                 offset = 0;
  612                 index++;
  613         }
  614 }
  615 
  616 /*
  617  * Unmap all of the pages referenced by a kiobuf.  We release the pages,
  618  * and unlock them if they were locked. 
  619  */
  620 
  621 void unmap_kiobuf (struct kiobuf *iobuf) 
  622 {
  623         int i;
  624         struct page *map;
  625         
  626         for (i = 0; i < iobuf->nr_pages; i++) {
  627                 map = iobuf->maplist[i];
  628                 if (map) {
  629                         if (iobuf->locked)
  630                                 UnlockPage(map);
  631                         /* FIXME: cache flush missing for rw==READ
  632                          * FIXME: call the correct reference counting function
  633                          */
  634                         page_cache_release(map);
  635                 }
  636         }
  637         
  638         iobuf->nr_pages = 0;
  639         iobuf->locked = 0;
  640 }
  641 
  642 
  643 /*
  644  * Lock down all of the pages of a kiovec for IO.
  645  *
  646  * If any page is mapped twice in the kiovec, we return the error -EINVAL.
  647  *
  648  * The optional wait parameter causes the lock call to block until all
  649  * pages can be locked if set.  If wait==0, the lock operation is
  650  * aborted if any locked pages are found and -EAGAIN is returned.
  651  */
  652 
  653 int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
  654 {
  655         struct kiobuf *iobuf;
  656         int i, j;
  657         struct page *page, **ppage;
  658         int doublepage = 0;
  659         int repeat = 0;
  660         
  661  repeat:
  662         
  663         for (i = 0; i < nr; i++) {
  664                 iobuf = iovec[i];
  665 
  666                 if (iobuf->locked)
  667                         continue;
  668 
  669                 ppage = iobuf->maplist;
  670                 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
  671                         page = *ppage;
  672                         if (!page)
  673                                 continue;
  674                         
  675                         if (TryLockPage(page)) {
  676                                 while (j--) {
  677                                         struct page *tmp = *--ppage;
  678                                         if (tmp)
  679                                                 UnlockPage(tmp);
  680                                 }
  681                                 goto retry;
  682                         }
  683                 }
  684                 iobuf->locked = 1;
  685         }
  686 
  687         return 0;
  688         
  689  retry:
  690         
  691         /* 
  692          * We couldn't lock one of the pages.  Undo the locking so far,
  693          * wait on the page we got to, and try again.  
  694          */
  695         
  696         unlock_kiovec(nr, iovec);
  697         if (!wait)
  698                 return -EAGAIN;
  699         
  700         /* 
  701          * Did the release also unlock the page we got stuck on?
  702          */
  703         if (!PageLocked(page)) {
  704                 /* 
  705                  * If so, we may well have the page mapped twice
  706                  * in the IO address range.  Bad news.  Of
  707                  * course, it _might_ just be a coincidence,
  708                  * but if it happens more than once, chances
  709                  * are we have a double-mapped page. 
  710                  */
  711                 if (++doublepage >= 3) 
  712                         return -EINVAL;
  713                 
  714                 /* Try again...  */
  715                 wait_on_page(page);
  716         }
  717         
  718         if (++repeat < 16)
  719                 goto repeat;
  720         return -EAGAIN;
  721 }
  722 
  723 /*
  724  * Unlock all of the pages of a kiovec after IO.
  725  */
  726 
  727 int unlock_kiovec(int nr, struct kiobuf *iovec[])
  728 {
  729         struct kiobuf *iobuf;
  730         int i, j;
  731         struct page *page, **ppage;
  732         
  733         for (i = 0; i < nr; i++) {
  734                 iobuf = iovec[i];
  735 
  736                 if (!iobuf->locked)
  737                         continue;
  738                 iobuf->locked = 0;
  739                 
  740                 ppage = iobuf->maplist;
  741                 for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
  742                         page = *ppage;
  743                         if (!page)
  744                                 continue;
  745                         UnlockPage(page);
  746                 }
  747         }
  748         return 0;
  749 }
  750 
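/*
 * Illustrative sketch, not part of memory.c: bracketing an I/O with the
 * kiovec locking helpers above.  wait=1 asks lock_kiovec() to sleep until
 * every page can be locked instead of returning -EAGAIN.
 */
static int with_locked_kiovec(struct kiobuf *iobuf)
{
        int err = lock_kiovec(1, &iobuf, 1 /* wait */);
        if (err)
                return err;
        /* ... pages stay locked for the duration of the transfer ... */
        return unlock_kiovec(1, &iobuf);
}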
  751 static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
  752                                      unsigned long size, pgprot_t prot)
  753 {
  754         unsigned long end;
  755 
  756         address &= ~PMD_MASK;
  757         end = address + size;
  758         if (end > PMD_SIZE)
  759                 end = PMD_SIZE;
  760         do {
  761                 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
  762                 pte_t oldpage = ptep_get_and_clear(pte);
  763                 set_pte(pte, zero_pte);
  764                 forget_pte(oldpage);
  765                 address += PAGE_SIZE;
  766                 pte++;
  767         } while (address && (address < end));
  768 }
  769 
  770 static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
  771                                     unsigned long size, pgprot_t prot)
  772 {
  773         unsigned long end;
  774 
  775         address &= ~PGDIR_MASK;
  776         end = address + size;
  777         if (end > PGDIR_SIZE)
  778                 end = PGDIR_SIZE;
  779         do {
  780                 pte_t * pte = pte_alloc(mm, pmd, address);
  781                 if (!pte)
  782                         return -ENOMEM;
  783                 zeromap_pte_range(pte, address, end - address, prot);
  784                 address = (address + PMD_SIZE) & PMD_MASK;
  785                 pmd++;
  786         } while (address && (address < end));
  787         return 0;
  788 }
  789 
  790 int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
  791 {
  792         int error = 0;
  793         pgd_t * dir;
  794         unsigned long beg = address;
  795         unsigned long end = address + size;
  796         struct mm_struct *mm = current->mm;
  797 
  798         dir = pgd_offset(mm, address);
  799         flush_cache_range(mm, beg, end);
  800         if (address >= end)
  801                 BUG();
  802 
  803         spin_lock(&mm->page_table_lock);
  804         do {
  805                 pmd_t *pmd = pmd_alloc(mm, dir, address);
  806                 error = -ENOMEM;
  807                 if (!pmd)
  808                         break;
  809                 error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
  810                 if (error)
  811                         break;
  812                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
  813                 dir++;
  814         } while (address && (address < end));
  815         spin_unlock(&mm->page_table_lock);
  816         flush_tlb_range(mm, beg, end);
  817         return error;
  818 }
  819 
  820 /*
  821  * maps a range of physical memory into the requested pages. the old
  822  * mappings are removed. any references to nonexistent pages result
  823  * in null mappings (currently treated as "copy-on-access")
  824  */
  825 static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
  826         unsigned long phys_addr, pgprot_t prot)
  827 {
  828         unsigned long end;
  829 
  830         address &= ~PMD_MASK;
  831         end = address + size;
  832         if (end > PMD_SIZE)
  833                 end = PMD_SIZE;
  834         do {
  835                 struct page *page;
  836                 pte_t oldpage;
  837                 oldpage = ptep_get_and_clear(pte);
  838 
  839                 page = virt_to_page(__va(phys_addr));
  840                 if ((!VALID_PAGE(page)) || PageReserved(page))
  841                         set_pte(pte, mk_pte_phys(phys_addr, prot));
  842                 forget_pte(oldpage);
  843                 address += PAGE_SIZE;
  844                 phys_addr += PAGE_SIZE;
  845                 pte++;
  846         } while (address && (address < end));
  847 }
  848 
  849 static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
  850         unsigned long phys_addr, pgprot_t prot)
  851 {
  852         unsigned long end;
  853 
  854         address &= ~PGDIR_MASK;
  855         end = address + size;
  856         if (end > PGDIR_SIZE)
  857                 end = PGDIR_SIZE;
  858         phys_addr -= address;
  859         do {
  860                 pte_t * pte = pte_alloc(mm, pmd, address);
  861                 if (!pte)
  862                         return -ENOMEM;
  863                 remap_pte_range(pte, address, end - address, address + phys_addr, prot);
  864                 address = (address + PMD_SIZE) & PMD_MASK;
  865                 pmd++;
  866         } while (address && (address < end));
  867         return 0;
  868 }
  869 
  870 /*  Note: this is only safe if the mm semaphore is held when called. */
  871 int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
  872 {
  873         int error = 0;
  874         pgd_t * dir;
  875         unsigned long beg = from;
  876         unsigned long end = from + size;
  877         struct mm_struct *mm = current->mm;
  878 
  879         phys_addr -= from;
  880         dir = pgd_offset(mm, from);
  881         flush_cache_range(mm, beg, end);
  882         if (from >= end)
  883                 BUG();
  884 
  885         spin_lock(&mm->page_table_lock);
  886         do {
  887                 pmd_t *pmd = pmd_alloc(mm, dir, from);
  888                 error = -ENOMEM;
  889                 if (!pmd)
  890                         break;
  891                 error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
  892                 if (error)
  893                         break;
  894                 from = (from + PGDIR_SIZE) & PGDIR_MASK;
  895                 dir++;
  896         } while (from && (from < end));
  897         spin_unlock(&mm->page_table_lock);
  898         flush_tlb_range(mm, beg, end);
  899         return error;
  900 }
  901 
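/*
 * Illustrative sketch, not part of memory.c: the classic 2.4 driver
 * mmap() method built on remap_page_range(), mapping a physical buffer
 * owned by the driver straight into the caller's vma.  mydrv_buf_phys
 * and the function name are hypothetical; a real driver would validate
 * the requested size and offset against its buffer.
 */
static unsigned long mydrv_buf_phys;    /* set up at probe time */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        /* The mmap() path already holds the mm semaphore, which
         * remap_page_range() requires (see the note above). */
        if (remap_page_range(vma->vm_start, mydrv_buf_phys, size,
                             vma->vm_page_prot))
                return -EAGAIN;
        return 0;
}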
  902 /*
  903  * Establish a new mapping:
  904  *  - flush the old one
  905  *  - update the page tables
  906  *  - inform the TLB about the new one
  907  *
  908  * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  909  */
  910 static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
  911 {
  912         set_pte(page_table, entry);
  913         flush_tlb_page(vma, address);
  914         update_mmu_cache(vma, address, entry);
  915 }
  916 
  917 /*
  918  * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
  919  */
  920 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, 
  921                 pte_t *page_table)
  922 {
  923         flush_page_to_ram(new_page);
  924         flush_cache_page(vma, address);
  925         establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
  926 }
  927 
  928 /*
  929  * This routine handles present pages, when users try to write
  930  * to a shared page. It is done by copying the page to a new address
  931  * and decrementing the shared-page counter for the old page.
  932  *
  933  * Goto-purists beware: the only reason for goto's here is that it results
  934  * in better assembly code.. The "default" path will see no jumps at all.
  935  *
  936  * Note that this routine assumes that the protection checks have been
  937  * done by the caller (the low-level page fault routine in most cases).
  938  * Thus we can safely just mark it writable once we've done any necessary
  939  * COW.
  940  *
  941  * We also mark the page dirty at this point even though the page will
  942  * change only once the write actually happens. This avoids a few races,
  943  * and potentially makes it more efficient.
  944  *
  945  * We hold the mm semaphore and the page_table_lock on entry and exit
  946  * with the page_table_lock released.
  947  */
  948 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
  949         unsigned long address, pte_t *page_table, pte_t pte)
  950 {
  951         struct page *old_page, *new_page;
  952 
  953         old_page = pte_page(pte);
  954         if (!VALID_PAGE(old_page))
  955                 goto bad_wp_page;
  956 
  957         if (!TryLockPage(old_page)) {
  958                 int reuse = can_share_swap_page(old_page);
  959                 unlock_page(old_page);
  960                 if (reuse) {
  961                         flush_cache_page(vma, address);
  962                         establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
  963                         spin_unlock(&mm->page_table_lock);
  964                         return 1;       /* Minor fault */
  965                 }
  966         }
  967 
  968         /*
  969          * Ok, we need to copy. Oh, well..
  970          */
  971         page_cache_get(old_page);
  972         spin_unlock(&mm->page_table_lock);
  973 
  974         new_page = alloc_page(GFP_HIGHUSER);
  975         if (!new_page)
  976                 goto no_mem;
  977         copy_cow_page(old_page,new_page,address);
  978 
  979         /*
  980          * Re-check the pte - we dropped the lock
  981          */
  982         spin_lock(&mm->page_table_lock);
  983         if (pte_same(*page_table, pte)) {
  984                 if (PageReserved(old_page))
  985                         ++mm->rss;
  986                 break_cow(vma, new_page, address, page_table);
  987                 lru_cache_add(new_page);
  988 
  989                 /* Free the old page.. */
  990                 new_page = old_page;
  991         }
  992         spin_unlock(&mm->page_table_lock);
  993         page_cache_release(new_page);
  994         page_cache_release(old_page);
  995         return 1;       /* Minor fault */
  996 
  997 bad_wp_page:
  998         spin_unlock(&mm->page_table_lock);
  999         printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
 1000         return -1;
 1001 no_mem:
 1002         page_cache_release(old_page);
 1003         return -1;
 1004 }
 1005 
 1006 static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff)
 1007 {
 1008         do {
 1009                 struct mm_struct *mm = mpnt->vm_mm;
 1010                 unsigned long start = mpnt->vm_start;
 1011                 unsigned long end = mpnt->vm_end;
 1012                 unsigned long len = end - start;
 1013                 unsigned long diff;
 1014 
 1015                 /* mapping wholly truncated? */
 1016                 if (mpnt->vm_pgoff >= pgoff) {
 1017                         zap_page_range(mm, start, len);
 1018                         continue;
 1019                 }
 1020 
 1021                 /* mapping wholly unaffected? */
 1022                 len = len >> PAGE_SHIFT;
 1023                 diff = pgoff - mpnt->vm_pgoff;
 1024                 if (diff >= len)
 1025                         continue;
 1026 
 1027                 /* Ok, partially affected.. */
 1028                 start += diff << PAGE_SHIFT;
 1029                 len = (len - diff) << PAGE_SHIFT;
 1030                 zap_page_range(mm, start, len);
 1031         } while ((mpnt = mpnt->vm_next_share) != NULL);
 1032 }
 1033 
 1034 /*
 1035  * Handle all mappings that got truncated by a "truncate()"
 1036  * system call.
 1037  *
 1038  * NOTE! We have to be ready to update the memory sharing
 1039  * between the file and the memory map for a potential last
 1040  * incomplete page.  Ugly, but necessary.
 1041  */
 1042 int vmtruncate(struct inode * inode, loff_t offset)
 1043 {
 1044         unsigned long pgoff;
 1045         struct address_space *mapping = inode->i_mapping;
 1046         unsigned long limit;
 1047 
 1048         if (inode->i_size < offset)
 1049                 goto do_expand;
 1050         inode->i_size = offset;
 1051         spin_lock(&mapping->i_shared_lock);
 1052         if (!mapping->i_mmap && !mapping->i_mmap_shared)
 1053                 goto out_unlock;
 1054 
 1055         pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 1056         if (mapping->i_mmap != NULL)
 1057                 vmtruncate_list(mapping->i_mmap, pgoff);
 1058         if (mapping->i_mmap_shared != NULL)
 1059                 vmtruncate_list(mapping->i_mmap_shared, pgoff);
 1060 
 1061 out_unlock:
 1062         spin_unlock(&mapping->i_shared_lock);
 1063         truncate_inode_pages(mapping, offset);
 1064         goto out_truncate;
 1065 
 1066 do_expand:
 1067         limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 1068         if (limit != RLIM_INFINITY && offset > limit)
 1069                 goto out_sig;
 1070         if (offset > inode->i_sb->s_maxbytes)
 1071                 goto out;
 1072         inode->i_size = offset;
 1073 
 1074 out_truncate:
 1075         if (inode->i_op && inode->i_op->truncate) {
 1076                 lock_kernel();
 1077                 inode->i_op->truncate(inode);
 1078                 unlock_kernel();
 1079         }
 1080         return 0;
 1081 out_sig:
 1082         send_sig(SIGXFSZ, current, 0);
 1083 out:
 1084         return -EFBIG;
 1085 }
 1086 
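/*
 * Illustrative sketch, not part of memory.c: vmtruncate() is normally
 * reached from the attribute-change path (compare inode_setattr() in
 * fs/attr.c) when a file's size changes.  The helper name is
 * hypothetical.
 */
static int apply_size_change(struct inode *inode, struct iattr *attr)
{
        int err = 0;

        if (attr->ia_valid & ATTR_SIZE)
                err = vmtruncate(inode, attr->ia_size);
        /* On success, mappings beyond the new size have been zapped, the
         * page cache has been truncated and ->truncate() has run. */
        return err;
}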
 1087 /* 
 1088  * Primitive swap readahead code. We simply read an aligned block of
 1089  * (1 << page_cluster) entries in the swap area. This method is chosen
 1090  * because it doesn't cost us any seek time.  We also make sure to queue
 1091  * the 'original' request together with the readahead ones...  
 1092  */
 1093 void swapin_readahead(swp_entry_t entry)
 1094 {
 1095         int i, num;
 1096         struct page *new_page;
 1097         unsigned long offset;
 1098 
 1099         /*
 1100          * Get the number of handles we should do readahead io to.
 1101          */
 1102         num = valid_swaphandles(entry, &offset);
 1103         for (i = 0; i < num; offset++, i++) {
 1104                 /* Ok, do the async read-ahead now */
 1105                 new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
 1106                 if (!new_page)
 1107                         break;
 1108                 page_cache_release(new_page);
 1109         }
 1110         return;
 1111 }
 1112 
 1113 /*
 1114  * We hold the mm semaphore and the page_table_lock on entry and
 1115  * should release the pagetable lock on exit..
 1116  */
 1117 static int do_swap_page(struct mm_struct * mm,
 1118         struct vm_area_struct * vma, unsigned long address,
 1119         pte_t * page_table, pte_t orig_pte, int write_access)
 1120 {
 1121         struct page *page;
 1122         swp_entry_t entry = pte_to_swp_entry(orig_pte);
 1123         pte_t pte;
 1124         int ret = 1;
 1125 
 1126         spin_unlock(&mm->page_table_lock);
 1127         page = lookup_swap_cache(entry);
 1128         if (!page) {
 1129                 swapin_readahead(entry);
 1130                 page = read_swap_cache_async(entry);
 1131                 if (!page) {
 1132                         /*
 1133                          * Back out if somebody else faulted in this pte while
 1134                          * we released the page table lock.
 1135                          */
 1136                         int retval;
 1137                         spin_lock(&mm->page_table_lock);
 1138                         retval = pte_same(*page_table, orig_pte) ? -1 : 1;
 1139                         spin_unlock(&mm->page_table_lock);
 1140                         return retval;
 1141                 }
 1142 
 1143                 /* Had to read the page from swap area: Major fault */
 1144                 ret = 2;
 1145         }
 1146 
 1147         mark_page_accessed(page);
 1148 
 1149         lock_page(page);
 1150 
 1151         /*
 1152          * Back out if somebody else faulted in this pte while we
 1153          * released the page table lock.
 1154          */
 1155         spin_lock(&mm->page_table_lock);
 1156         if (!pte_same(*page_table, orig_pte)) {
 1157                 spin_unlock(&mm->page_table_lock);
 1158                 unlock_page(page);
 1159                 page_cache_release(page);
 1160                 return 1;
 1161         }
 1162 
 1163         /* The page isn't present yet, go ahead with the fault. */
 1164                 
 1165         swap_free(entry);
 1166         if (vm_swap_full())
 1167                 remove_exclusive_swap_page(page);
 1168 
 1169         mm->rss++;
 1170         pte = mk_pte(page, vma->vm_page_prot);
 1171         if (write_access && can_share_swap_page(page))
 1172                 pte = pte_mkdirty(pte_mkwrite(pte));
 1173         unlock_page(page);
 1174 
 1175         flush_page_to_ram(page);
 1176         flush_icache_page(vma, page);
 1177         set_pte(page_table, pte);
 1178 
 1179         /* No need to invalidate - it was non-present before */
 1180         update_mmu_cache(vma, address, pte);
 1181         spin_unlock(&mm->page_table_lock);
 1182         return ret;
 1183 }
 1184 
 1185 /*
 1186  * We are called with the MM semaphore and page_table_lock
 1187  * spinlock held to protect against concurrent faults in
 1188  * multithreaded programs. 
 1189  */
 1190 static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
 1191 {
 1192         pte_t entry;
 1193 
 1194         /* Read-only mapping of ZERO_PAGE. */
 1195         entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
 1196 
 1197         /* ..except if it's a write access */
 1198         if (write_access) {
 1199                 struct page *page;
 1200 
 1201                 /* Allocate our own private page. */
 1202                 spin_unlock(&mm->page_table_lock);
 1203 
 1204                 page = alloc_page(GFP_HIGHUSER);
 1205                 if (!page)
 1206                         goto no_mem;
 1207                 clear_user_highpage(page, addr);
 1208 
 1209                 spin_lock(&mm->page_table_lock);
 1210                 if (!pte_none(*page_table)) {
 1211                         page_cache_release(page);
 1212                         spin_unlock(&mm->page_table_lock);
 1213                         return 1;
 1214                 }
 1215                 mm->rss++;
 1216                 flush_page_to_ram(page);
 1217                 entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 1218                 lru_cache_add(page);
 1219                 mark_page_accessed(page);
 1220         }
 1221 
 1222         set_pte(page_table, entry);
 1223 
 1224         /* No need to invalidate - it was non-present before */
 1225         update_mmu_cache(vma, addr, entry);
 1226         spin_unlock(&mm->page_table_lock);
 1227         return 1;       /* Minor fault */
 1228 
 1229 no_mem:
 1230         return -1;
 1231 }
 1232 
 1233 /*
 1234  * do_no_page() tries to create a new page mapping. It aggressively
 1235  * tries to share with existing pages, but makes a separate copy if
 1236  * the "write_access" parameter is true in order to avoid the next
 1237  * page fault.
 1238  *
 1239  * As this is called only for pages that do not currently exist, we
 1240  * do not need to flush old virtual caches or the TLB.
 1241  *
 1242  * This is called with the MM semaphore held and the page table
 1243  * spinlock held. Exit with the spinlock released.
 1244  */
 1245 static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
 1246         unsigned long address, int write_access, pte_t *page_table)
 1247 {
 1248         struct page * new_page;
 1249         pte_t entry;
 1250 
 1251         if (!vma->vm_ops || !vma->vm_ops->nopage)
 1252                 return do_anonymous_page(mm, vma, page_table, write_access, address);
 1253         spin_unlock(&mm->page_table_lock);
 1254 
 1255         new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 1256 
 1257         if (new_page == NULL)   /* no page was available -- SIGBUS */
 1258                 return 0;
 1259         if (new_page == NOPAGE_OOM)
 1260                 return -1;
 1261 
 1262         /*
 1263          * Should we do an early C-O-W break?
 1264          */
 1265         if (write_access && !(vma->vm_flags & VM_SHARED)) {
 1266                 struct page * page = alloc_page(GFP_HIGHUSER);
 1267                 if (!page) {
 1268                         page_cache_release(new_page);
 1269                         return -1;
 1270                 }
 1271                 copy_user_highpage(page, new_page, address);
 1272                 page_cache_release(new_page);
 1273                 lru_cache_add(page);
 1274                 new_page = page;
 1275         }
 1276 
 1277         spin_lock(&mm->page_table_lock);
 1278         /*
 1279          * This silly early PAGE_DIRTY setting removes a race
 1280          * due to the bad i386 page protection. But it's valid
 1281          * for other architectures too.
 1282          *
 1283          * Note that if write_access is true, we either now have
 1284          * an exclusive copy of the page, or this is a shared mapping,
 1285          * so we can make it writable and dirty to avoid having to
 1286          * handle that later.
 1287          */
 1288         /* Only go through if we didn't race with anybody else... */
 1289         if (pte_none(*page_table)) {
 1290                 ++mm->rss;
 1291                 flush_page_to_ram(new_page);
 1292                 flush_icache_page(vma, new_page);
 1293                 entry = mk_pte(new_page, vma->vm_page_prot);
 1294                 if (write_access)
 1295                         entry = pte_mkwrite(pte_mkdirty(entry));
 1296                 set_pte(page_table, entry);
 1297         } else {
 1298                 /* One of our sibling threads was faster, back out. */
 1299                 page_cache_release(new_page);
 1300                 spin_unlock(&mm->page_table_lock);
 1301                 return 1;
 1302         }
 1303 
 1304         /* no need to invalidate: a not-present page shouldn't be cached */
 1305         update_mmu_cache(vma, address, entry);
 1306         spin_unlock(&mm->page_table_lock);
 1307         return 2;       /* Major fault */
 1308 }
 1309 
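/*
 * Illustrative sketch, not part of memory.c: a minimal vm_ops->nopage
 * method of the kind do_no_page() calls above, backing a mapping with a
 * vmalloc()ed driver buffer via vmalloc_to_page() (defined at the end of
 * this file).  mydrv_vbuf, mydrv_vbuf_size and the function name are
 * hypothetical.
 */
static void *mydrv_vbuf;                /* allocated elsewhere with vmalloc() */
static unsigned long mydrv_vbuf_size;

static struct page *mydrv_nopage(struct vm_area_struct *vma,
                                 unsigned long address, int unused)
{
        unsigned long offset;
        struct page *page;

        offset = (address - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
        if (offset >= mydrv_vbuf_size)
                return NULL;            /* NULL means SIGBUS, see above */

        page = vmalloc_to_page((char *)mydrv_vbuf + offset);
        if (!page)
                return NULL;
        get_page(page);                 /* do_no_page() maps this reference */
        return page;
}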
 1310 /*
 1311  * These routines also need to handle stuff like marking pages dirty
 1312  * and/or accessed for architectures that don't do it in hardware (most
 1313  * RISC architectures).  The early dirtying is also good on the i386.
 1314  *
 1315  * There is also a hook called "update_mmu_cache()" that architectures
 1316  * with external mmu caches can use to update those (ie the Sparc or
 1317  * PowerPC hashed page tables that act as extended TLBs).
 1318  *
 1319  * Note the "page_table_lock". It is to protect against kswapd removing
 1320  * pages from under us. Note that kswapd only ever _removes_ pages, never
 1321  * adds them. As such, once we have noticed that the page is not present,
 1322  * we can drop the lock early.
 1323  *
 1324  * The adding of pages is protected by the MM semaphore (which we hold),
 1325  * so we don't need to worry about a page being suddenly been added into
 1326  * our VM.
 1327  *
 1328  * We enter with the pagetable spinlock held, we are supposed to
 1329  * release it when done.
 1330  */
 1331 static inline int handle_pte_fault(struct mm_struct *mm,
 1332         struct vm_area_struct * vma, unsigned long address,
 1333         int write_access, pte_t * pte)
 1334 {
 1335         pte_t entry;
 1336 
 1337         entry = *pte;
 1338         if (!pte_present(entry)) {
 1339                 /*
 1340                  * If it truly wasn't present, we know that kswapd
 1341                  * and the PTE updates will not touch it later. So
 1342                  * drop the lock.
 1343                  */
 1344                 if (pte_none(entry))
 1345                         return do_no_page(mm, vma, address, write_access, pte);
 1346                 return do_swap_page(mm, vma, address, pte, entry, write_access);
 1347         }
 1348 
 1349         if (write_access) {
 1350                 if (!pte_write(entry))
 1351                         return do_wp_page(mm, vma, address, pte, entry);
 1352 
 1353                 entry = pte_mkdirty(entry);
 1354         }
 1355         entry = pte_mkyoung(entry);
 1356         establish_pte(vma, address, pte, entry);
 1357         spin_unlock(&mm->page_table_lock);
 1358         return 1;
 1359 }
 1360 
 1361 /*
 1362  * By the time we get here, we already hold the mm semaphore
 1363  */
 1364 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 1365         unsigned long address, int write_access)
 1366 {
 1367         pgd_t *pgd;
 1368         pmd_t *pmd;
 1369 
 1370         current->state = TASK_RUNNING;
 1371         pgd = pgd_offset(mm, address);
 1372 
 1373         /*
 1374          * We need the page table lock to synchronize with kswapd
 1375          * and the SMP-safe atomic PTE updates.
 1376          */
 1377         spin_lock(&mm->page_table_lock);
 1378         pmd = pmd_alloc(mm, pgd, address);
 1379 
 1380         if (pmd) {
 1381                 pte_t * pte = pte_alloc(mm, pmd, address);
 1382                 if (pte)
 1383                         return handle_pte_fault(mm, vma, address, write_access, pte);
 1384         }
 1385         spin_unlock(&mm->page_table_lock);
 1386         return -1;
 1387 }
 1388 
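/*
 * Illustrative sketch, not part of memory.c: how a caller holding
 * mm->mmap_sem dispatches on handle_mm_fault()'s return value - the same
 * convention the get_user_pages() loop above and the architectures'
 * do_page_fault() handlers use: 1 = minor fault, 2 = major fault,
 * 0 = SIGBUS-style failure, negative = out of memory.  The helper name
 * is hypothetical.
 */
static int fault_in_one_page(struct mm_struct *mm, struct vm_area_struct *vma,
                             unsigned long address, int write)
{
        switch (handle_mm_fault(mm, vma, address, write)) {
        case 1:
                current->min_flt++;
                return 0;
        case 2:
                current->maj_flt++;
                return 0;
        case 0:
                return -EFAULT;
        default:
                return -ENOMEM;
        }
}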
 1389 /*
 1390  * Allocate page middle directory.
 1391  *
 1392  * We've already handled the fast-path in-line, and we own the
 1393  * page table lock.
 1394  *
 1395  * On a two-level page table, this ends up actually being entirely
 1396  * optimized away.
 1397  */
 1398 pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 1399 {
 1400         pmd_t *new;
 1401 
 1402         /* "fast" allocation can happen without dropping the lock.. */
 1403         new = pmd_alloc_one_fast(mm, address);
 1404         if (!new) {
 1405                 spin_unlock(&mm->page_table_lock);
 1406                 new = pmd_alloc_one(mm, address);
 1407                 spin_lock(&mm->page_table_lock);
 1408                 if (!new)
 1409                         return NULL;
 1410 
 1411                 /*
 1412                  * Because we dropped the lock, we should re-check the
 1413                  * entry, as somebody else could have populated it..
 1414                  */
 1415                 if (!pgd_none(*pgd)) {
 1416                         pmd_free(new);
 1417                         goto out;
 1418                 }
 1419         }
 1420         pgd_populate(mm, pgd, new);
 1421 out:
 1422         return pmd_offset(pgd, address);
 1423 }
 1424 
 1425 /*
 1426  * Allocate the page table directory.
 1427  *
 1428  * We've already handled the fast-path in-line, and we own the
 1429  * page table lock.
 1430  */
 1431 pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 1432 {
 1433         if (pmd_none(*pmd)) {
 1434                 pte_t *new;
 1435 
 1436                 /* "fast" allocation can happen without dropping the lock.. */
 1437                 new = pte_alloc_one_fast(mm, address);
 1438                 if (!new) {
 1439                         spin_unlock(&mm->page_table_lock);
 1440                         new = pte_alloc_one(mm, address);
 1441                         spin_lock(&mm->page_table_lock);
 1442                         if (!new)
 1443                                 return NULL;
 1444 
 1445                         /*
 1446                          * Because we dropped the lock, we should re-check the
 1447                          * entry, as somebody else could have populated it..
 1448                          */
 1449                         if (!pmd_none(*pmd)) {
 1450                                 pte_free(new);
 1451                                 goto out;
 1452                         }
 1453                 }
 1454                 pmd_populate(mm, pmd, new);
 1455         }
 1456 out:
 1457         return pte_offset(pmd, address);
 1458 }
 1459 
 1460 int make_pages_present(unsigned long addr, unsigned long end)
 1461 {
 1462         int ret, len, write;
 1463         struct vm_area_struct * vma;
 1464 
 1465         vma = find_vma(current->mm, addr);
 1466         write = (vma->vm_flags & VM_WRITE) != 0;
 1467         if (addr >= end)
 1468                 BUG();
 1469         if (end > vma->vm_end)
 1470                 BUG();
 1471         len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
 1472         ret = get_user_pages(current, current->mm, addr,
 1473                         len, write, 0, NULL, NULL);
 1474         return ret == len ? 0 : -1;
 1475 }
 1476 
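/*
 * Illustrative sketch, not part of memory.c: prefaulting a user range so
 * it is fully resident, in the spirit of what mlock() does after marking
 * a vma VM_LOCKED.  The range is assumed to lie inside a single vma, as
 * make_pages_present() requires; the helper name is hypothetical.
 */
static int prefault_user_range(unsigned long start, unsigned long end)
{
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = make_pages_present(start, end);
        up_read(&current->mm->mmap_sem);
        return ret;
}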
 1477 struct page * vmalloc_to_page(void * vmalloc_addr)
 1478 {
 1479         unsigned long addr = (unsigned long) vmalloc_addr;
 1480         struct page *page = NULL;
 1481         pmd_t *pmd;
 1482         pte_t *pte;
 1483         pgd_t *pgd;
 1484         
 1485         pgd = pgd_offset_k(addr);
 1486         if (!pgd_none(*pgd)) {
 1487                 pmd = pmd_offset(pgd, addr);
 1488                 if (!pmd_none(*pmd)) {
 1489                         pte = pte_offset(pmd, addr);
 1490                         if (pte_present(*pte)) {
 1491                                 page = pte_page(*pte);
 1492                         }
 1493                 }
 1494         }
 1495         return page;
 1496 }



This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.