FreeBSD/Linux Kernel Cross Reference
sys/mm/migrate.c


    1 /*
    2  * Memory Migration functionality - linux/mm/migration.c
    3  *
    4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
    5  *
    6  * Page migration was first developed in the context of the memory hotplug
    7  * project. The main authors of the migration code are:
    8  *
    9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
   10  * Hirokazu Takahashi <taka@valinux.co.jp>
   11  * Dave Hansen <haveblue@us.ibm.com>
   12  * Christoph Lameter
   13  */
   14 
   15 #include <linux/migrate.h>
   16 #include <linux/export.h>
   17 #include <linux/swap.h>
   18 #include <linux/swapops.h>
   19 #include <linux/pagemap.h>
   20 #include <linux/buffer_head.h>
   21 #include <linux/mm_inline.h>
   22 #include <linux/nsproxy.h>
   23 #include <linux/pagevec.h>
   24 #include <linux/ksm.h>
   25 #include <linux/rmap.h>
   26 #include <linux/topology.h>
   27 #include <linux/cpu.h>
   28 #include <linux/cpuset.h>
   29 #include <linux/writeback.h>
   30 #include <linux/mempolicy.h>
   31 #include <linux/vmalloc.h>
   32 #include <linux/security.h>
   33 #include <linux/memcontrol.h>
   34 #include <linux/syscalls.h>
   35 #include <linux/hugetlb.h>
   36 #include <linux/hugetlb_cgroup.h>
   37 #include <linux/gfp.h>
   38 #include <linux/balloon_compaction.h>
   39 
   40 #include <asm/tlbflush.h>
   41 
   42 #define CREATE_TRACE_POINTS
   43 #include <trace/events/migrate.h>
   44 
   45 #include "internal.h"
   46 
   47 /*
   48  * migrate_prep() needs to be called before we start compiling a list of pages
   49  * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
   50  * undesirable, use migrate_prep_local()
   51  */
   52 int migrate_prep(void)
   53 {
   54         /*
   55          * Clear the LRU lists so pages can be isolated.
   56          * Note that pages may be moved off the LRU after we have
   57          * drained them. Those pages will fail to migrate like other
   58          * pages that may be busy.
   59          */
   60         lru_add_drain_all();
   61 
   62         return 0;
   63 }
   64 
   65 /* Do the necessary work of migrate_prep but not if it involves other CPUs */
   66 int migrate_prep_local(void)
   67 {
   68         lru_add_drain();
   69 
   70         return 0;
   71 }
   72 
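For context, the typical caller-side sequence (the same pattern used by do_move_page_to_node_array() further down in this file) is: drain the LRU caches, isolate each page, hand the list to migrate_pages(), and put back whatever could not be moved. A minimal sketch; my_new_page is a hypothetical new_page_t allocator callback:

        LIST_HEAD(pagelist);

        migrate_prep();                         /* drain per-CPU LRU caches */
        if (!isolate_lru_page(page)) {          /* returns 0 on success */
                list_add_tail(&page->lru, &pagelist);
                inc_zone_page_state(page, NR_ISOLATED_ANON +
                                    page_is_file_cache(page));
        }
        if (!list_empty(&pagelist) &&
            migrate_pages(&pagelist, my_new_page, 0, false,
                          MIGRATE_SYNC, MR_SYSCALL))
                putback_lru_pages(&pagelist);   /* return the leftovers */
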
   73 /*
   74  * Add isolated pages on the list back to the LRU under page lock
   75  * to avoid leaking evictable pages back onto unevictable list.
   76  */
   77 void putback_lru_pages(struct list_head *l)
   78 {
   79         struct page *page;
   80         struct page *page2;
   81 
   82         list_for_each_entry_safe(page, page2, l, lru) {
   83                 list_del(&page->lru);
   84                 dec_zone_page_state(page, NR_ISOLATED_ANON +
   85                                 page_is_file_cache(page));
   86                 putback_lru_page(page);
   87         }
   88 }
   89 
   90 /*
   91  * Put previously isolated pages back onto the appropriate lists
   92  * from where they were once taken off for compaction/migration.
   93  *
   94  * This function should be used instead of putback_lru_pages() whenever the
   95  * isolated pageset has been built by isolate_migratepages_range().
   96  */
   97 void putback_movable_pages(struct list_head *l)
   98 {
   99         struct page *page;
  100         struct page *page2;
  101 
  102         list_for_each_entry_safe(page, page2, l, lru) {
  103                 list_del(&page->lru);
  104                 dec_zone_page_state(page, NR_ISOLATED_ANON +
  105                                 page_is_file_cache(page));
  106                 if (unlikely(balloon_page_movable(page)))
  107                         balloon_page_putback(page);
  108                 else
  109                         putback_lru_page(page);
  110         }
  111 }
  112 
  113 /*
  114  * Restore a potential migration pte to a working pte entry
  115  */
  116 static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
  117                                  unsigned long addr, void *old)
  118 {
  119         struct mm_struct *mm = vma->vm_mm;
  120         swp_entry_t entry;
  121         pmd_t *pmd;
  122         pte_t *ptep, pte;
  123         spinlock_t *ptl;
  124 
  125         if (unlikely(PageHuge(new))) {
  126                 ptep = huge_pte_offset(mm, addr);
  127                 if (!ptep)
  128                         goto out;
  129                 ptl = &mm->page_table_lock;
  130         } else {
  131                 pmd = mm_find_pmd(mm, addr);
  132                 if (!pmd)
  133                         goto out;
  134                 if (pmd_trans_huge(*pmd))
  135                         goto out;
  136 
  137                 ptep = pte_offset_map(pmd, addr);
  138 
  139                 /*
  140                  * Peek to check is_swap_pte() before taking ptlock?  No, we
  141                  * can race mremap's move_ptes(), which skips anon_vma lock.
  142                  */
  143 
  144                 ptl = pte_lockptr(mm, pmd);
  145         }
  146 
  147         spin_lock(ptl);
  148         pte = *ptep;
  149         if (!is_swap_pte(pte))
  150                 goto unlock;
  151 
  152         entry = pte_to_swp_entry(pte);
  153 
  154         if (!is_migration_entry(entry) ||
  155             migration_entry_to_page(entry) != old)
  156                 goto unlock;
  157 
  158         get_page(new);
  159         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
  160         if (is_write_migration_entry(entry))
  161                 pte = pte_mkwrite(pte);
  162 #ifdef CONFIG_HUGETLB_PAGE
  163         if (PageHuge(new))
  164                 pte = pte_mkhuge(pte);
  165 #endif
  166         flush_cache_page(vma, addr, pte_pfn(pte));
  167         set_pte_at(mm, addr, ptep, pte);
  168 
  169         if (PageHuge(new)) {
  170                 if (PageAnon(new))
  171                         hugepage_add_anon_rmap(new, vma, addr);
  172                 else
  173                         page_dup_rmap(new);
  174         } else if (PageAnon(new))
  175                 page_add_anon_rmap(new, vma, addr);
  176         else
  177                 page_add_file_rmap(new);
  178 
  179         /* No need to invalidate - it was non-present before */
  180         update_mmu_cache(vma, addr, ptep);
  181 unlock:
  182         pte_unmap_unlock(ptep, ptl);
  183 out:
  184         return SWAP_AGAIN;
  185 }
  186 
  187 /*
  188  * Get rid of all migration entries and replace them by
  189  * references to the indicated page.
  190  */
  191 static void remove_migration_ptes(struct page *old, struct page *new)
  192 {
  193         rmap_walk(new, remove_migration_pte, old);
  194 }
  195 
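The step this function undoes is performed by try_to_unmap_one() in mm/rmap.c: when invoked with TTU_MIGRATION it replaces a present pte with a migration swap entry that records the old page and whether the mapping was writable. Roughly, as a simplified sketch:

        /* pteval is the pte value just cleared by ptep_clear_flush() */
        swp_entry_t entry = make_migration_entry(page, pte_write(pteval));
        set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
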
  196 /*
  197  * Something used the pte of a page under migration. We need to
  198  * get to the page and wait until migration is finished.
  199  * When we return from this function the fault will be retried.
  200  */
  201 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
  202                                 unsigned long address)
  203 {
  204         pte_t *ptep, pte;
  205         spinlock_t *ptl;
  206         swp_entry_t entry;
  207         struct page *page;
  208 
  209         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
  210         pte = *ptep;
  211         if (!is_swap_pte(pte))
  212                 goto out;
  213 
  214         entry = pte_to_swp_entry(pte);
  215         if (!is_migration_entry(entry))
  216                 goto out;
  217 
  218         page = migration_entry_to_page(entry);
  219 
  220         /*
  221          * Once radix-tree replacement of page migration started, page_count
  222          * *must* be zero. And, we don't want to call wait_on_page_locked()
  223          * against a page without get_page().
  224          * So, we use get_page_unless_zero() here. Even if it fails, the
  225          * page fault will simply occur again.
  226          */
  227         if (!get_page_unless_zero(page))
  228                 goto out;
  229         pte_unmap_unlock(ptep, ptl);
  230         wait_on_page_locked(page);
  231         put_page(page);
  232         return;
  233 out:
  234         pte_unmap_unlock(ptep, ptl);
  235 }
  236 
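The main caller is the page fault path: when do_swap_page() in mm/memory.c sees a migration entry it simply waits for the migration to finish and lets the fault be retried. A simplified sketch of that caller:

        entry = pte_to_swp_entry(orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(mm, pmd, address);
                        goto out;       /* fault is retried once migration ends */
                }
                /* ... other non-swap entry types ... */
        }
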
  237 #ifdef CONFIG_BLOCK
  238 /* Returns true if all buffers are successfully locked */
  239 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
  240                                                         enum migrate_mode mode)
  241 {
  242         struct buffer_head *bh = head;
  243 
  244         /* Simple case, sync compaction */
  245         if (mode != MIGRATE_ASYNC) {
  246                 do {
  247                         get_bh(bh);
  248                         lock_buffer(bh);
  249                         bh = bh->b_this_page;
  250 
  251                 } while (bh != head);
  252 
  253                 return true;
  254         }
  255 
  256         /* async case, we cannot block on lock_buffer so use trylock_buffer */
  257         do {
  258                 get_bh(bh);
  259                 if (!trylock_buffer(bh)) {
  260                         /*
  261                          * We failed to lock the buffer and cannot stall in
  262                          * async migration. Release the taken locks
  263                          */
  264                         struct buffer_head *failed_bh = bh;
  265                         put_bh(failed_bh);
  266                         bh = head;
  267                         while (bh != failed_bh) {
  268                                 unlock_buffer(bh);
  269                                 put_bh(bh);
  270                                 bh = bh->b_this_page;
  271                         }
  272                         return false;
  273                 }
  274 
  275                 bh = bh->b_this_page;
  276         } while (bh != head);
  277         return true;
  278 }
  279 #else
  280 static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  281                                                         enum migrate_mode mode)
  282 {
  283         return true;
  284 }
  285 #endif /* CONFIG_BLOCK */
  286 
  287 /*
  288  * Replace the page in the mapping.
  289  *
  290  * The number of remaining references must be:
  291  * 1 for anonymous pages without a mapping
  292  * 2 for pages with a mapping
  293  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  294  */
  295 static int migrate_page_move_mapping(struct address_space *mapping,
  296                 struct page *newpage, struct page *page,
  297                 struct buffer_head *head, enum migrate_mode mode)
  298 {
  299         int expected_count = 0;
  300         void **pslot;
  301 
  302         if (!mapping) {
  303                 /* Anonymous page without mapping */
  304                 if (page_count(page) != 1)
  305                         return -EAGAIN;
  306                 return MIGRATEPAGE_SUCCESS;
  307         }
  308 
  309         spin_lock_irq(&mapping->tree_lock);
  310 
  311         pslot = radix_tree_lookup_slot(&mapping->page_tree,
  312                                         page_index(page));
  313 
  314         expected_count = 2 + page_has_private(page);
  315         if (page_count(page) != expected_count ||
  316                 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
  317                 spin_unlock_irq(&mapping->tree_lock);
  318                 return -EAGAIN;
  319         }
  320 
  321         if (!page_freeze_refs(page, expected_count)) {
  322                 spin_unlock_irq(&mapping->tree_lock);
  323                 return -EAGAIN;
  324         }
  325 
  326         /*
  327          * In the async migration case of moving a page with buffers, lock the
  328          * buffers using trylock before the mapping is moved. If the mapping
  329          * were moved first and we then failed to lock the buffers, we could
  330          * not move the mapping back because of the elevated page count, and
  331          * would have to block waiting on other references to be dropped.
  332          */
  333         if (mode == MIGRATE_ASYNC && head &&
  334                         !buffer_migrate_lock_buffers(head, mode)) {
  335                 page_unfreeze_refs(page, expected_count);
  336                 spin_unlock_irq(&mapping->tree_lock);
  337                 return -EAGAIN;
  338         }
  339 
  340         /*
  341          * Now we know that no one else is looking at the page.
  342          */
  343         get_page(newpage);      /* add cache reference */
  344         if (PageSwapCache(page)) {
  345                 SetPageSwapCache(newpage);
  346                 set_page_private(newpage, page_private(page));
  347         }
  348 
  349         radix_tree_replace_slot(pslot, newpage);
  350 
  351         /*
  352          * Drop cache reference from old page by unfreezing
  353          * to one less reference.
  354          * We know this isn't the last reference.
  355          */
  356         page_unfreeze_refs(page, expected_count - 1);
  357 
  358         /*
  359          * If moved to a different zone then also account
  360          * the page for that zone. Other VM counters will be
  361          * taken care of when we establish references to the
  362          * new page and drop references to the old page.
  363          *
  364          * Note that anonymous pages are accounted for
  365          * via NR_FILE_PAGES and NR_ANON_PAGES if they
  366          * are mapped to swap space.
  367          */
  368         __dec_zone_page_state(page, NR_FILE_PAGES);
  369         __inc_zone_page_state(newpage, NR_FILE_PAGES);
  370         if (!PageSwapCache(page) && PageSwapBacked(page)) {
  371                 __dec_zone_page_state(page, NR_SHMEM);
  372                 __inc_zone_page_state(newpage, NR_SHMEM);
  373         }
  374         spin_unlock_irq(&mapping->tree_lock);
  375 
  376         return MIGRATEPAGE_SUCCESS;
  377 }
  378 
  379 /*
  380  * The expected number of remaining references is the same as that
  381  * of migrate_page_move_mapping().
  382  */
  383 int migrate_huge_page_move_mapping(struct address_space *mapping,
  384                                    struct page *newpage, struct page *page)
  385 {
  386         int expected_count;
  387         void **pslot;
  388 
  389         if (!mapping) {
  390                 if (page_count(page) != 1)
  391                         return -EAGAIN;
  392                 return MIGRATEPAGE_SUCCESS;
  393         }
  394 
  395         spin_lock_irq(&mapping->tree_lock);
  396 
  397         pslot = radix_tree_lookup_slot(&mapping->page_tree,
  398                                         page_index(page));
  399 
  400         expected_count = 2 + page_has_private(page);
  401         if (page_count(page) != expected_count ||
  402                 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
  403                 spin_unlock_irq(&mapping->tree_lock);
  404                 return -EAGAIN;
  405         }
  406 
  407         if (!page_freeze_refs(page, expected_count)) {
  408                 spin_unlock_irq(&mapping->tree_lock);
  409                 return -EAGAIN;
  410         }
  411 
  412         get_page(newpage);
  413 
  414         radix_tree_replace_slot(pslot, newpage);
  415 
  416         page_unfreeze_refs(page, expected_count - 1);
  417 
  418         spin_unlock_irq(&mapping->tree_lock);
  419         return MIGRATEPAGE_SUCCESS;
  420 }
  421 
  422 /*
  423  * Copy the page to its new location
  424  */
  425 void migrate_page_copy(struct page *newpage, struct page *page)
  426 {
  427         if (PageHuge(page) || PageTransHuge(page))
  428                 copy_huge_page(newpage, page);
  429         else
  430                 copy_highpage(newpage, page);
  431 
  432         if (PageError(page))
  433                 SetPageError(newpage);
  434         if (PageReferenced(page))
  435                 SetPageReferenced(newpage);
  436         if (PageUptodate(page))
  437                 SetPageUptodate(newpage);
  438         if (TestClearPageActive(page)) {
  439                 VM_BUG_ON(PageUnevictable(page));
  440                 SetPageActive(newpage);
  441         } else if (TestClearPageUnevictable(page))
  442                 SetPageUnevictable(newpage);
  443         if (PageChecked(page))
  444                 SetPageChecked(newpage);
  445         if (PageMappedToDisk(page))
  446                 SetPageMappedToDisk(newpage);
  447 
  448         if (PageDirty(page)) {
  449                 clear_page_dirty_for_io(page);
  450                 /*
  451                  * Want to mark the page and the radix tree as dirty, and
  452                  * redo the accounting that clear_page_dirty_for_io undid,
  453                  * but we can't use set_page_dirty because that function
  454                  * is actually a signal that all of the page has become dirty,
  455                  * whereas only part of our page may be dirty.
  456                  */
  457                 if (PageSwapBacked(page))
  458                         SetPageDirty(newpage);
  459                 else
  460                         __set_page_dirty_nobuffers(newpage);
  461         }
  462 
  463         mlock_migrate_page(newpage, page);
  464         ksm_migrate_page(newpage, page);
  465 
  466         ClearPageSwapCache(page);
  467         ClearPagePrivate(page);
  468         set_page_private(page, 0);
  469 
  470         /*
  471          * If any waiters have accumulated on the new page then
  472          * wake them up.
  473          */
  474         if (PageWriteback(newpage))
  475                 end_page_writeback(newpage);
  476 }
  477 
  478 /************************************************************
  479  *                    Migration functions
  480  ***********************************************************/
  481 
  482 /* Always fail migration. Used for mappings that are not movable */
  483 int fail_migrate_page(struct address_space *mapping,
  484                         struct page *newpage, struct page *page)
  485 {
  486         return -EIO;
  487 }
  488 EXPORT_SYMBOL(fail_migrate_page);
  489 
  490 /*
  491  * Common logic to directly migrate a single page suitable for
  492  * pages that do not use PagePrivate/PagePrivate2.
  493  *
  494  * Pages are locked upon entry and exit.
  495  */
  496 int migrate_page(struct address_space *mapping,
  497                 struct page *newpage, struct page *page,
  498                 enum migrate_mode mode)
  499 {
  500         int rc;
  501 
  502         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
  503 
  504         rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
  505 
  506         if (rc != MIGRATEPAGE_SUCCESS)
  507                 return rc;
  508 
  509         migrate_page_copy(newpage, page);
  510         return MIGRATEPAGE_SUCCESS;
  511 }
  512 EXPORT_SYMBOL(migrate_page);
  513 
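Filesystems opt into this path through the migratepage method of their address_space_operations, typically pointing it at migrate_page() above or at buffer_migrate_page() below when their pages carry buffer_heads. A representative, hypothetical wiring:

        static const struct address_space_operations example_aops = {
                /* ... the usual readpage/writepage methods ... */
                .migratepage    = migrate_page,         /* no fs-private page state */
        };
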
  514 #ifdef CONFIG_BLOCK
  515 /*
  516  * Migration function for pages with buffers. This function can only be used
  517  * if the underlying filesystem guarantees that no other references to "page"
  518  * exist.
  519  */
  520 int buffer_migrate_page(struct address_space *mapping,
  521                 struct page *newpage, struct page *page, enum migrate_mode mode)
  522 {
  523         struct buffer_head *bh, *head;
  524         int rc;
  525 
  526         if (!page_has_buffers(page))
  527                 return migrate_page(mapping, newpage, page, mode);
  528 
  529         head = page_buffers(page);
  530 
  531         rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
  532 
  533         if (rc != MIGRATEPAGE_SUCCESS)
  534                 return rc;
  535 
  536         /*
  537          * In the async case, migrate_page_move_mapping locked the buffers
  538          * with an IRQ-safe spinlock held. In the sync case, the buffers
  539          * need to be locked now
  540          */
  541         if (mode != MIGRATE_ASYNC)
  542                 BUG_ON(!buffer_migrate_lock_buffers(head, mode));
  543 
  544         ClearPagePrivate(page);
  545         set_page_private(newpage, page_private(page));
  546         set_page_private(page, 0);
  547         put_page(page);
  548         get_page(newpage);
  549 
  550         bh = head;
  551         do {
  552                 set_bh_page(bh, newpage, bh_offset(bh));
  553                 bh = bh->b_this_page;
  554 
  555         } while (bh != head);
  556 
  557         SetPagePrivate(newpage);
  558 
  559         migrate_page_copy(newpage, page);
  560 
  561         bh = head;
  562         do {
  563                 unlock_buffer(bh);
  564                 put_bh(bh);
  565                 bh = bh->b_this_page;
  566 
  567         } while (bh != head);
  568 
  569         return MIGRATEPAGE_SUCCESS;
  570 }
  571 EXPORT_SYMBOL(buffer_migrate_page);
  572 #endif
  573 
  574 /*
  575  * Writeback a page to clean the dirty state
  576  */
  577 static int writeout(struct address_space *mapping, struct page *page)
  578 {
  579         struct writeback_control wbc = {
  580                 .sync_mode = WB_SYNC_NONE,
  581                 .nr_to_write = 1,
  582                 .range_start = 0,
  583                 .range_end = LLONG_MAX,
  584                 .for_reclaim = 1
  585         };
  586         int rc;
  587 
  588         if (!mapping->a_ops->writepage)
  589                 /* No write method for the address space */
  590                 return -EINVAL;
  591 
  592         if (!clear_page_dirty_for_io(page))
  593                 /* Someone else already triggered a write */
  594                 return -EAGAIN;
  595 
  596         /*
  597          * A dirty page may imply that the underlying filesystem has
  598          * the page on some queue. So the page must be clean for
  599          * migration. Writeout may mean we lose the lock and the
  600          * page state is no longer what we checked for earlier.
  601          * At this point we know that the migration attempt cannot
  602          * be successful.
  603          */
  604         remove_migration_ptes(page, page);
  605 
  606         rc = mapping->a_ops->writepage(page, &wbc);
  607 
  608         if (rc != AOP_WRITEPAGE_ACTIVATE)
  609                 /* unlocked. Relock */
  610                 lock_page(page);
  611 
  612         return (rc < 0) ? -EIO : -EAGAIN;
  613 }
  614 
  615 /*
  616  * Default handling if a filesystem does not provide a migration function.
  617  */
  618 static int fallback_migrate_page(struct address_space *mapping,
  619         struct page *newpage, struct page *page, enum migrate_mode mode)
  620 {
  621         if (PageDirty(page)) {
  622                 /* Only writeback pages in full synchronous migration */
  623                 if (mode != MIGRATE_SYNC)
  624                         return -EBUSY;
  625                 return writeout(mapping, page);
  626         }
  627 
  628         /*
  629          * Buffers may be managed in a filesystem specific way.
  630          * We must have no buffers or drop them.
  631          */
  632         if (page_has_private(page) &&
  633             !try_to_release_page(page, GFP_KERNEL))
  634                 return -EAGAIN;
  635 
  636         return migrate_page(mapping, newpage, page, mode);
  637 }
  638 
  639 /*
  640  * Move a page to a newly allocated page
  641  * The page is locked and all ptes have been successfully removed.
  642  *
  643  * The new page will have replaced the old page if this function
  644  * is successful.
  645  *
  646  * Return value:
  647  *   < 0 - error code
  648  *  MIGRATEPAGE_SUCCESS - success
  649  */
  650 static int move_to_new_page(struct page *newpage, struct page *page,
  651                                 int remap_swapcache, enum migrate_mode mode)
  652 {
  653         struct address_space *mapping;
  654         int rc;
  655 
  656         /*
  657          * Block others from accessing the page when we get around to
  658          * establishing additional references. We are the only one
  659          * holding a reference to the new page at this point.
  660          */
  661         if (!trylock_page(newpage))
  662                 BUG();
  663 
  664         /* Prepare mapping for the new page.*/
  665         newpage->index = page->index;
  666         newpage->mapping = page->mapping;
  667         if (PageSwapBacked(page))
  668                 SetPageSwapBacked(newpage);
  669 
  670         mapping = page_mapping(page);
  671         if (!mapping)
  672                 rc = migrate_page(mapping, newpage, page, mode);
  673         else if (mapping->a_ops->migratepage)
  674                 /*
  675                  * Most pages have a mapping and most filesystems provide a
  676                  * migratepage callback. Anonymous pages are part of swap
  677                  * space which also has its own migratepage callback. This
  678                  * is the most common path for page migration.
  679                  */
  680                 rc = mapping->a_ops->migratepage(mapping,
  681                                                 newpage, page, mode);
  682         else
  683                 rc = fallback_migrate_page(mapping, newpage, page, mode);
  684 
  685         if (rc != MIGRATEPAGE_SUCCESS) {
  686                 newpage->mapping = NULL;
  687         } else {
  688                 if (remap_swapcache)
  689                         remove_migration_ptes(page, newpage);
  690                 page->mapping = NULL;
  691         }
  692 
  693         unlock_page(newpage);
  694 
  695         return rc;
  696 }
  697 
  698 static int __unmap_and_move(struct page *page, struct page *newpage,
  699                         int force, bool offlining, enum migrate_mode mode)
  700 {
  701         int rc = -EAGAIN;
  702         int remap_swapcache = 1;
  703         struct mem_cgroup *mem;
  704         struct anon_vma *anon_vma = NULL;
  705 
  706         if (!trylock_page(page)) {
  707                 if (!force || mode == MIGRATE_ASYNC)
  708                         goto out;
  709 
  710                 /*
  711                  * It's not safe for direct compaction to call lock_page.
  712                  * For example, during page readahead pages are added locked
  713                  * to the LRU. Later, when the IO completes the pages are
  714                  * marked uptodate and unlocked. However, the queueing
  715                  * could be merging multiple pages for one bio (e.g.
  716                  * mpage_readpages). If an allocation happens for the
  717                  * second or third page, the process can end up locking
  718                  * the same page twice and deadlocking. Rather than
  719                  * trying to be clever about what pages can be locked,
  720                  * avoid the use of lock_page for direct compaction
  721                  * altogether.
  722                  */
  723                 if (current->flags & PF_MEMALLOC)
  724                         goto out;
  725 
  726                 lock_page(page);
  727         }
  728 
  729         /*
  730          * Only memory hotplug's offline_pages() caller has locked out KSM,
  731          * and can safely migrate a KSM page.  The other cases have skipped
  732          * PageKsm along with PageReserved - but it is only now when we have
  733          * the page lock that we can be certain it will not go KSM beneath us
  734          * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
  735          * its pagecount raised, but only here do we take the page lock which
  736          * serializes that).
  737          */
  738         if (PageKsm(page) && !offlining) {
  739                 rc = -EBUSY;
  740                 goto unlock;
  741         }
  742 
  743         /* charge against new page */
  744         mem_cgroup_prepare_migration(page, newpage, &mem);
  745 
  746         if (PageWriteback(page)) {
  747                 /*
  748                  * Only in the case of a full synchronous migration is it
  749                  * necessary to wait for PageWriteback. In the async case,
  750                  * the retry loop is too short and in the sync-light case,
  751                  * the overhead of stalling is too much
  752                  */
  753                 if (mode != MIGRATE_SYNC) {
  754                         rc = -EBUSY;
  755                         goto uncharge;
  756                 }
  757                 if (!force)
  758                         goto uncharge;
  759                 wait_on_page_writeback(page);
  760         }
  761         /*
  762          * By the time try_to_unmap() returns, page->mapcount has dropped to
  763          * zero, so we could no longer notice if the anon_vma were freed while
  764          * we migrate the page. This get_anon_vma() delays freeing of the
  765          * anon_vma pointer until the end of migration. File-cache pages are
  766          * not a problem because they are protected by the page lock during
  767          * migration; only anonymous pages need this care here.
  768          */
  769         if (PageAnon(page)) {
  770                 /*
  771                  * Only page_lock_anon_vma_read() understands the subtleties of
  772                  * getting a hold on an anon_vma from outside one of its mms.
  773                  */
  774                 anon_vma = page_get_anon_vma(page);
  775                 if (anon_vma) {
  776                         /*
  777                          * Anon page
  778                          */
  779                 } else if (PageSwapCache(page)) {
  780                         /*
  781                          * We cannot be sure that the anon_vma of an unmapped
  782                          * swapcache page is safe to use because we don't
  783                          * know in advance if the VMA that this page belonged
  784                          * to still exists. If the VMA and others sharing the
  785                          * data have been freed, then the anon_vma could
  786                          * already be invalid.
  787                          *
  788                          * To avoid this possibility, swapcache pages get
  789                          * migrated but are not remapped when migration
  790                          * completes
  791                          */
  792                         remap_swapcache = 0;
  793                 } else {
  794                         goto uncharge;
  795                 }
  796         }
  797 
  798         if (unlikely(balloon_page_movable(page))) {
  799                 /*
  800                  * A ballooned page does not need any special attention from
  801                  * physical to virtual reverse mapping procedures.
  802                  * Skip any attempt to unmap PTEs or to remap swap cache,
  803                  * in order to avoid burning cycles at rmap level, and perform
  804                  * the page migration right away (proteced by page lock).
  805                  */
  806                 rc = balloon_page_migrate(newpage, page, mode);
  807                 goto uncharge;
  808         }
  809 
  810         /*
  811          * Corner case handling:
  812          * 1. When a new swap-cache page is read in, it is added to the LRU
  813          * and treated as swapcache but it has no rmap yet.
  814          * Calling try_to_unmap() against a page->mapping==NULL page will
  815          * trigger a BUG.  So handle it here.
  816          * 2. An orphaned page (see truncate_complete_page) might have
  817          * fs-private metadata. The page can be picked up due to memory
  818          * offlining.  Everywhere else except page reclaim, the page is
  819          * invisible to the vm, so the page can not be migrated.  So try to
  820          * free the metadata, so the page can be freed.
  821          */
  822         if (!page->mapping) {
  823                 VM_BUG_ON(PageAnon(page));
  824                 if (page_has_private(page)) {
  825                         try_to_free_buffers(page);
  826                         goto uncharge;
  827                 }
  828                 goto skip_unmap;
  829         }
  830 
  831         /* Establish migration ptes or remove ptes */
  832         try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  833 
  834 skip_unmap:
  835         if (!page_mapped(page))
  836                 rc = move_to_new_page(newpage, page, remap_swapcache, mode);
  837 
  838         if (rc && remap_swapcache)
  839                 remove_migration_ptes(page, page);
  840 
  841         /* Drop an anon_vma reference if we took one */
  842         if (anon_vma)
  843                 put_anon_vma(anon_vma);
  844 
  845 uncharge:
  846         mem_cgroup_end_migration(mem, page, newpage,
  847                                  (rc == MIGRATEPAGE_SUCCESS ||
  848                                   rc == MIGRATEPAGE_BALLOON_SUCCESS));
  849 unlock:
  850         unlock_page(page);
  851 out:
  852         return rc;
  853 }
  854 
  855 /*
  856  * Obtain the lock on page, remove all ptes and migrate the page
  857  * to the newly allocated page in newpage.
  858  */
  859 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
  860                         struct page *page, int force, bool offlining,
  861                         enum migrate_mode mode)
  862 {
  863         int rc = 0;
  864         int *result = NULL;
  865         struct page *newpage = get_new_page(page, private, &result);
  866 
  867         if (!newpage)
  868                 return -ENOMEM;
  869 
  870         if (page_count(page) == 1) {
  871                 /* page was freed from under us. So we are done. */
  872                 goto out;
  873         }
  874 
  875         if (unlikely(PageTransHuge(page)))
  876                 if (unlikely(split_huge_page(page)))
  877                         goto out;
  878 
  879         rc = __unmap_and_move(page, newpage, force, offlining, mode);
  880 
  881         if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
  882                 /*
  883                  * A ballooned page has been migrated already.
  884                  * Now it is time to wrap up the counters,
  885                  * hand the page back to the buddy allocator and return.
  886                  */
  887                 dec_zone_page_state(page, NR_ISOLATED_ANON +
  888                                     page_is_file_cache(page));
  889                 balloon_page_free(page);
  890                 return MIGRATEPAGE_SUCCESS;
  891         }
  892 out:
  893         if (rc != -EAGAIN) {
  894                 /*
  895                  * A page that has been migrated has all references
  896                  * removed and will be freed. A page that has not been
  897                  * migrated will have kept its references and be
  898                  * restored.
  899                  */
  900                 list_del(&page->lru);
  901                 dec_zone_page_state(page, NR_ISOLATED_ANON +
  902                                 page_is_file_cache(page));
  903                 putback_lru_page(page);
  904         }
  905         /*
  906          * Move the new page to the LRU. If migration was not successful
  907          * then this will free the page.
  908          */
  909         putback_lru_page(newpage);
  910         if (result) {
  911                 if (rc)
  912                         *result = rc;
  913                 else
  914                         *result = page_to_nid(newpage);
  915         }
  916         return rc;
  917 }
  918 
  919 /*
  920  * Counterpart of unmap_and_move() for hugepage migration.
  921  *
  922  * This function doesn't wait for the completion of hugepage I/O
  923  * because there is no race between I/O and migration for hugepages.
  924  * Note that currently hugepage I/O occurs only in direct I/O
  925  * where no lock is held and PG_writeback is irrelevant,
  926  * and the writeback status of all subpages is counted in the reference
  927  * count of the head page (i.e. if all subpages of a 2MB hugepage are
  928  * under direct I/O, the reference count of the head page is 512 and a bit more.)
  929  * This means that when we try to migrate a hugepage whose subpages are
  930  * doing direct I/O, some references remain after try_to_unmap() and
  931  * hugepage migration fails without data corruption.
  932  *
  933  * There is also no race when direct I/O is issued on the page under migration,
  934  * because then pte is replaced with migration swap entry and direct I/O code
  935  * will wait in the page fault for migration to complete.
  936  */
  937 static int unmap_and_move_huge_page(new_page_t get_new_page,
  938                                 unsigned long private, struct page *hpage,
  939                                 int force, bool offlining,
  940                                 enum migrate_mode mode)
  941 {
  942         int rc = 0;
  943         int *result = NULL;
  944         struct page *new_hpage = get_new_page(hpage, private, &result);
  945         struct anon_vma *anon_vma = NULL;
  946 
  947         if (!new_hpage)
  948                 return -ENOMEM;
  949 
  950         rc = -EAGAIN;
  951 
  952         if (!trylock_page(hpage)) {
  953                 if (!force || mode != MIGRATE_SYNC)
  954                         goto out;
  955                 lock_page(hpage);
  956         }
  957 
  958         if (PageAnon(hpage))
  959                 anon_vma = page_get_anon_vma(hpage);
  960 
  961         try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  962 
  963         if (!page_mapped(hpage))
  964                 rc = move_to_new_page(new_hpage, hpage, 1, mode);
  965 
  966         if (rc)
  967                 remove_migration_ptes(hpage, hpage);
  968 
  969         if (anon_vma)
  970                 put_anon_vma(anon_vma);
  971 
  972         if (!rc)
  973                 hugetlb_cgroup_migrate(hpage, new_hpage);
  974 
  975         unlock_page(hpage);
  976 out:
  977         put_page(new_hpage);
  978         if (result) {
  979                 if (rc)
  980                         *result = rc;
  981                 else
  982                         *result = page_to_nid(new_hpage);
  983         }
  984         return rc;
  985 }
  986 
  987 /*
  988  * migrate_pages
  989  *
  990  * The function takes one list of pages to migrate and a callback
  991  * that, given a page to be migrated and the private data, allocates
  992  * and returns the target page for the move.
  993  *
  994  * The function returns after 10 attempts or if no pages
  995  * are movable any more because the list has become empty
  996  * or no retryable pages exist any more.
  997  * The caller should call putback_lru_pages() to return pages to the LRU
  998  * or free list only if ret != 0.
  999  *
 1000  * Return: Number of pages not migrated or error code.
 1001  */
 1002 int migrate_pages(struct list_head *from,
 1003                 new_page_t get_new_page, unsigned long private, bool offlining,
 1004                 enum migrate_mode mode, int reason)
 1005 {
 1006         int retry = 1;
 1007         int nr_failed = 0;
 1008         int nr_succeeded = 0;
 1009         int pass = 0;
 1010         struct page *page;
 1011         struct page *page2;
 1012         int swapwrite = current->flags & PF_SWAPWRITE;
 1013         int rc;
 1014 
 1015         if (!swapwrite)
 1016                 current->flags |= PF_SWAPWRITE;
 1017 
 1018         for(pass = 0; pass < 10 && retry; pass++) {
 1019                 retry = 0;
 1020 
 1021                 list_for_each_entry_safe(page, page2, from, lru) {
 1022                         cond_resched();
 1023 
 1024                         rc = unmap_and_move(get_new_page, private,
 1025                                                 page, pass > 2, offlining,
 1026                                                 mode);
 1027 
 1028                         switch(rc) {
 1029                         case -ENOMEM:
 1030                                 goto out;
 1031                         case -EAGAIN:
 1032                                 retry++;
 1033                                 break;
 1034                         case MIGRATEPAGE_SUCCESS:
 1035                                 nr_succeeded++;
 1036                                 break;
 1037                         default:
 1038                                 /* Permanent failure */
 1039                                 nr_failed++;
 1040                                 break;
 1041                         }
 1042                 }
 1043         }
 1044         rc = nr_failed + retry;
 1045 out:
 1046         if (nr_succeeded)
 1047                 count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
 1048         if (nr_failed)
 1049                 count_vm_events(PGMIGRATE_FAIL, nr_failed);
 1050         trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
 1051 
 1052         if (!swapwrite)
 1053                 current->flags &= ~PF_SWAPWRITE;
 1054 
 1055         return rc;
 1056 }
 1057 
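The get_new_page argument follows the new_page_t typedef from <linux/migrate.h>: it receives the page being migrated plus the caller's private cookie and returns the destination page (new_page_node() below is the real callback used by the move_pages() path). A minimal, hypothetical allocator might look like:

        static struct page *example_new_page(struct page *page,
                                             unsigned long private, int **result)
        {
                /* 'private' could carry a target node, a cursor, etc. */
                return alloc_page(GFP_HIGHUSER_MOVABLE);
        }
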
 1058 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
 1059                       unsigned long private, bool offlining,
 1060                       enum migrate_mode mode)
 1061 {
 1062         int pass, rc;
 1063 
 1064         for (pass = 0; pass < 10; pass++) {
 1065                 rc = unmap_and_move_huge_page(get_new_page,
 1066                                               private, hpage, pass > 2, offlining,
 1067                                               mode);
 1068                 switch (rc) {
 1069                 case -ENOMEM:
 1070                         goto out;
 1071                 case -EAGAIN:
 1072                         /* try again */
 1073                         cond_resched();
 1074                         break;
 1075                 case MIGRATEPAGE_SUCCESS:
 1076                         goto out;
 1077                 default:
 1078                         rc = -EIO;
 1079                         goto out;
 1080                 }
 1081         }
 1082 out:
 1083         return rc;
 1084 }
 1085 
 1086 #ifdef CONFIG_NUMA
 1087 /*
 1088  * Move a list of individual pages
 1089  */
 1090 struct page_to_node {
 1091         unsigned long addr;
 1092         struct page *page;
 1093         int node;
 1094         int status;
 1095 };
 1096 
 1097 static struct page *new_page_node(struct page *p, unsigned long private,
 1098                 int **result)
 1099 {
 1100         struct page_to_node *pm = (struct page_to_node *)private;
 1101 
 1102         while (pm->node != MAX_NUMNODES && pm->page != p)
 1103                 pm++;
 1104 
 1105         if (pm->node == MAX_NUMNODES)
 1106                 return NULL;
 1107 
 1108         *result = &pm->status;
 1109 
 1110         return alloc_pages_exact_node(pm->node,
 1111                                 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
 1112 }
 1113 
 1114 /*
 1115  * Move a set of pages as indicated in the pm array. The addr
 1116  * field must be set to the virtual address of the page to be moved
 1117  * and the node number must contain a valid target node.
 1118  * The pm array ends with node = MAX_NUMNODES.
 1119  */
 1120 static int do_move_page_to_node_array(struct mm_struct *mm,
 1121                                       struct page_to_node *pm,
 1122                                       int migrate_all)
 1123 {
 1124         int err;
 1125         struct page_to_node *pp;
 1126         LIST_HEAD(pagelist);
 1127 
 1128         down_read(&mm->mmap_sem);
 1129 
 1130         /*
 1131          * Build a list of pages to migrate
 1132          */
 1133         for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
 1134                 struct vm_area_struct *vma;
 1135                 struct page *page;
 1136 
 1137                 err = -EFAULT;
 1138                 vma = find_vma(mm, pp->addr);
 1139                 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
 1140                         goto set_status;
 1141 
 1142                 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
 1143 
 1144                 err = PTR_ERR(page);
 1145                 if (IS_ERR(page))
 1146                         goto set_status;
 1147 
 1148                 err = -ENOENT;
 1149                 if (!page)
 1150                         goto set_status;
 1151 
 1152                 /* Use PageReserved to check for zero page */
 1153                 if (PageReserved(page) || PageKsm(page))
 1154                         goto put_and_set;
 1155 
 1156                 pp->page = page;
 1157                 err = page_to_nid(page);
 1158 
 1159                 if (err == pp->node)
 1160                         /*
 1161                          * Node already in the right place
 1162                          */
 1163                         goto put_and_set;
 1164 
 1165                 err = -EACCES;
 1166                 if (page_mapcount(page) > 1 &&
 1167                                 !migrate_all)
 1168                         goto put_and_set;
 1169 
 1170                 err = isolate_lru_page(page);
 1171                 if (!err) {
 1172                         list_add_tail(&page->lru, &pagelist);
 1173                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 1174                                             page_is_file_cache(page));
 1175                 }
 1176 put_and_set:
 1177                 /*
 1178                  * Either remove the duplicate refcount from
 1179                  * isolate_lru_page() or drop the page ref if it was
 1180                  * not isolated.
 1181                  */
 1182                 put_page(page);
 1183 set_status:
 1184                 pp->status = err;
 1185         }
 1186 
 1187         err = 0;
 1188         if (!list_empty(&pagelist)) {
 1189                 err = migrate_pages(&pagelist, new_page_node,
 1190                                 (unsigned long)pm, 0, MIGRATE_SYNC,
 1191                                 MR_SYSCALL);
 1192                 if (err)
 1193                         putback_lru_pages(&pagelist);
 1194         }
 1195 
 1196         up_read(&mm->mmap_sem);
 1197         return err;
 1198 }
 1199 
 1200 /*
 1201  * Migrate an array of page addresses onto an array of nodes and fill
 1202  * the corresponding array of status.
 1203  */
 1204 static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
 1205                          unsigned long nr_pages,
 1206                          const void __user * __user *pages,
 1207                          const int __user *nodes,
 1208                          int __user *status, int flags)
 1209 {
 1210         struct page_to_node *pm;
 1211         unsigned long chunk_nr_pages;
 1212         unsigned long chunk_start;
 1213         int err;
 1214 
 1215         err = -ENOMEM;
 1216         pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
 1217         if (!pm)
 1218                 goto out;
 1219 
 1220         migrate_prep();
 1221 
 1222         /*
 1223          * Store a chunk of page_to_node array in a page,
 1224          * but keep the last one as a marker
 1225          */
 1226         chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
 1227 
 1228         for (chunk_start = 0;
 1229              chunk_start < nr_pages;
 1230              chunk_start += chunk_nr_pages) {
 1231                 int j;
 1232 
 1233                 if (chunk_start + chunk_nr_pages > nr_pages)
 1234                         chunk_nr_pages = nr_pages - chunk_start;
 1235 
 1236                 /* fill the chunk pm with addrs and nodes from user-space */
 1237                 for (j = 0; j < chunk_nr_pages; j++) {
 1238                         const void __user *p;
 1239                         int node;
 1240 
 1241                         err = -EFAULT;
 1242                         if (get_user(p, pages + j + chunk_start))
 1243                                 goto out_pm;
 1244                         pm[j].addr = (unsigned long) p;
 1245 
 1246                         if (get_user(node, nodes + j + chunk_start))
 1247                                 goto out_pm;
 1248 
 1249                         err = -ENODEV;
 1250                         if (node < 0 || node >= MAX_NUMNODES)
 1251                                 goto out_pm;
 1252 
 1253                         if (!node_state(node, N_MEMORY))
 1254                                 goto out_pm;
 1255 
 1256                         err = -EACCES;
 1257                         if (!node_isset(node, task_nodes))
 1258                                 goto out_pm;
 1259 
 1260                         pm[j].node = node;
 1261                 }
 1262 
 1263                 /* End marker for this chunk */
 1264                 pm[chunk_nr_pages].node = MAX_NUMNODES;
 1265 
 1266                 /* Migrate this chunk */
 1267                 err = do_move_page_to_node_array(mm, pm,
 1268                                                  flags & MPOL_MF_MOVE_ALL);
 1269                 if (err < 0)
 1270                         goto out_pm;
 1271 
 1272                 /* Return status information */
 1273                 for (j = 0; j < chunk_nr_pages; j++)
 1274                         if (put_user(pm[j].status, status + j + chunk_start)) {
 1275                                 err = -EFAULT;
 1276                                 goto out_pm;
 1277                         }
 1278         }
 1279         err = 0;
 1280 
 1281 out_pm:
 1282         free_page((unsigned long)pm);
 1283 out:
 1284         return err;
 1285 }
 1286 
 1287 /*
 1288  * Determine the nodes of an array of pages and store them in an array of status.
 1289  */
 1290 static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 1291                                 const void __user **pages, int *status)
 1292 {
 1293         unsigned long i;
 1294 
 1295         down_read(&mm->mmap_sem);
 1296 
 1297         for (i = 0; i < nr_pages; i++) {
 1298                 unsigned long addr = (unsigned long)(*pages);
 1299                 struct vm_area_struct *vma;
 1300                 struct page *page;
 1301                 int err = -EFAULT;
 1302 
 1303                 vma = find_vma(mm, addr);
 1304                 if (!vma || addr < vma->vm_start)
 1305                         goto set_status;
 1306 
 1307                 page = follow_page(vma, addr, 0);
 1308 
 1309                 err = PTR_ERR(page);
 1310                 if (IS_ERR(page))
 1311                         goto set_status;
 1312 
 1313                 err = -ENOENT;
 1314                 /* Use PageReserved to check for zero page */
 1315                 if (!page || PageReserved(page) || PageKsm(page))
 1316                         goto set_status;
 1317 
 1318                 err = page_to_nid(page);
 1319 set_status:
 1320                 *status = err;
 1321 
 1322                 pages++;
 1323                 status++;
 1324         }
 1325 
 1326         up_read(&mm->mmap_sem);
 1327 }
 1328 
 1329 /*
 1330  * Determine the nodes of a user array of pages and store them in
 1331  * a user array of status.
 1332  */
 1333 static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
 1334                          const void __user * __user *pages,
 1335                          int __user *status)
 1336 {
 1337 #define DO_PAGES_STAT_CHUNK_NR 16
 1338         const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
 1339         int chunk_status[DO_PAGES_STAT_CHUNK_NR];
 1340 
 1341         while (nr_pages) {
 1342                 unsigned long chunk_nr;
 1343 
 1344                 chunk_nr = nr_pages;
 1345                 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
 1346                         chunk_nr = DO_PAGES_STAT_CHUNK_NR;
 1347 
 1348                 if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
 1349                         break;
 1350 
 1351                 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
 1352 
 1353                 if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status)))
 1354                         break;
 1355 
 1356                 pages += chunk_nr;
 1357                 status += chunk_nr;
 1358                 nr_pages -= chunk_nr;
 1359         }
 1360         return nr_pages ? -EFAULT : 0;
 1361 }
 1362 
 1363 /*
 1364  * Move a list of pages in the address space of the currently executing
 1365  * process.
 1366  */
 1367 SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
 1368                 const void __user * __user *, pages,
 1369                 const int __user *, nodes,
 1370                 int __user *, status, int, flags)
 1371 {
 1372         const struct cred *cred = current_cred(), *tcred;
 1373         struct task_struct *task;
 1374         struct mm_struct *mm;
 1375         int err;
 1376         nodemask_t task_nodes;
 1377 
 1378         /* Check flags */
 1379         if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
 1380                 return -EINVAL;
 1381 
 1382         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 1383                 return -EPERM;
 1384 
 1385         /* Find the mm_struct */
 1386         rcu_read_lock();
 1387         task = pid ? find_task_by_vpid(pid) : current;
 1388         if (!task) {
 1389                 rcu_read_unlock();
 1390                 return -ESRCH;
 1391         }
 1392         get_task_struct(task);
 1393 
 1394         /*
 1395          * Check if this process has the right to modify the specified
 1396          * process. The right exists if the process has administrative
 1397          * capabilities, superuser privileges or the same
 1398          * userid as the target process.
 1399          */
 1400         tcred = __task_cred(task);
 1401         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
 1402             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
 1403             !capable(CAP_SYS_NICE)) {
 1404                 rcu_read_unlock();
 1405                 err = -EPERM;
 1406                 goto out;
 1407         }
 1408         rcu_read_unlock();
 1409 
 1410         err = security_task_movememory(task);
 1411         if (err)
 1412                 goto out;
 1413 
 1414         task_nodes = cpuset_mems_allowed(task);
 1415         mm = get_task_mm(task);
 1416         put_task_struct(task);
 1417 
 1418         if (!mm)
 1419                 return -EINVAL;
 1420 
 1421         if (nodes)
 1422                 err = do_pages_move(mm, task_nodes, nr_pages, pages,
 1423                                     nodes, status, flags);
 1424         else
 1425                 err = do_pages_stat(mm, nr_pages, pages, status);
 1426 
 1427         mmput(mm);
 1428         return err;
 1429 
 1430 out:
 1431         put_task_struct(task);
 1432         return err;
 1433 }
 1434 
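From user space this syscall is normally reached through the move_pages(2) wrapper in libnuma. A hedged sketch that moves one page of the calling process to node 1 and then uses the query form (nodes == NULL) to report current placement; addr is assumed to be a valid user address:

        #include <numaif.h>

        void *pages[1]  = { addr };
        int   nodes[1]  = { 1 };
        int   status[1];

        /* pid 0 means the calling process */
        long rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);

        /* query only: with nodes == NULL, status[] reports each page's node */
        rc = move_pages(0, 1, pages, NULL, status, 0);
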
 1435 /*
 1436  * Call the migration functions in the vm_ops that may prepare
 1437  * memory in a VMA for migration. Migration functions may perform
 1438  * the migration for VMAs that do not have an underlying page struct.
 1439  */
 1440 int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
 1441         const nodemask_t *from, unsigned long flags)
 1442 {
 1443         struct vm_area_struct *vma;
 1444         int err = 0;
 1445 
 1446         for (vma = mm->mmap; vma && !err; vma = vma->vm_next) {
 1447                 if (vma->vm_ops && vma->vm_ops->migrate) {
 1448                         err = vma->vm_ops->migrate(vma, to, from, flags);
 1449                         if (err)
 1450                                 break;
 1451                 }
 1452         }
 1453         return err;
 1454 }
 1455 
 1456 #ifdef CONFIG_NUMA_BALANCING
 1457 /*
 1458  * Returns true if this is a safe migration target node for misplaced NUMA
 1459  * pages. Currently it only checks the watermarks which crude
 1460  */
 1461 static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 1462                                    int nr_migrate_pages)
 1463 {
 1464         int z;
 1465         for (z = pgdat->nr_zones - 1; z >= 0; z--) {
 1466                 struct zone *zone = pgdat->node_zones + z;
 1467 
 1468                 if (!populated_zone(zone))
 1469                         continue;
 1470 
 1471                 if (zone->all_unreclaimable)
 1472                         continue;
 1473 
 1474                 /* Avoid waking kswapd by allocating nr_migrate_pages pages. */
 1475                 if (!zone_watermark_ok(zone, 0,
 1476                                        high_wmark_pages(zone) +
 1477                                        nr_migrate_pages,
 1478                                        0, 0))
 1479                         continue;
 1480                 return true;
 1481         }
 1482         return false;
 1483 }
 1484 
 1485 static struct page *alloc_misplaced_dst_page(struct page *page,
 1486                                            unsigned long data,
 1487                                            int **result)
 1488 {
 1489         int nid = (int) data;
 1490         struct page *newpage;
 1491 
 1492         newpage = alloc_pages_exact_node(nid,
 1493                                          (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
 1494                                           __GFP_NOMEMALLOC | __GFP_NORETRY |
 1495                                           __GFP_NOWARN) &
 1496                                          ~GFP_IOFS, 0);
 1497         if (newpage)
 1498                 page_xchg_last_nid(newpage, page_last_nid(page));
 1499 
 1500         return newpage;
 1501 }
 1502 
 1503 /*
 1504  * Page migration rate limiting control.
 1505  * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
 1506  * window of time. The defaults here say do not migrate more than 1280MB per second.
 1507  * If a node is rate-limited then PTE NUMA updates are also rate-limited. However,
 1508  * as it is faults that reset the window, PTE updates will happen unconditionally
 1509  * if no fault has occurred for @pteupdate_interval_millisecs after the
 1510  * throttle window closed.
 1511  */
 1512 static unsigned int migrate_interval_millisecs __read_mostly = 100;
 1513 static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
 1514 static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
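For reference, the defaults above combine as follows: ratelimit_pages is 128 << (20 - PAGE_SHIFT) pages, i.e. 128MB worth of pages regardless of PAGE_SIZE, and the window is 100ms, so the limit is 128MB / 0.1s = 1280MB per second, the figure quoted in the comment.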
 1515 
 1516 /* Returns true if NUMA migration is currently rate limited */
 1517 bool migrate_ratelimited(int node)
 1518 {
 1519         pg_data_t *pgdat = NODE_DATA(node);
 1520 
 1521         if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
 1522                                 msecs_to_jiffies(pteupdate_interval_millisecs)))
 1523                 return false;
 1524 
 1525         if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
 1526                 return false;
 1527 
 1528         return true;
 1529 }
 1530 
 1531 /* Returns true if the node is migrate rate-limited after the update */
 1532 bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
 1533 {
 1534         bool rate_limited = false;
 1535 
 1536         /*
 1537          * Rate-limit the amount of data that is being migrated to a node.
 1538          * Optimal placement is no good if the memory bus is saturated and
 1539          * all the time is being spent migrating!
 1540          */
 1541         spin_lock(&pgdat->numabalancing_migrate_lock);
 1542         if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
 1543                 pgdat->numabalancing_migrate_nr_pages = 0;
 1544                 pgdat->numabalancing_migrate_next_window = jiffies +
 1545                         msecs_to_jiffies(migrate_interval_millisecs);
 1546         }
 1547         if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
 1548                 rate_limited = true;
 1549         else
 1550                 pgdat->numabalancing_migrate_nr_pages += nr_pages;
 1551         spin_unlock(&pgdat->numabalancing_migrate_lock);
 1552 
 1553         return rate_limited;
 1554 }
 1555 
 1556 int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 1557 {
 1558         int ret = 0;
 1559 
 1560         /* Avoid migrating to a node that is nearly full */
 1561         if (migrate_balanced_pgdat(pgdat, 1)) {
 1562                 int page_lru;
 1563 
 1564                 if (isolate_lru_page(page)) {
 1565                         put_page(page);
 1566                         return 0;
 1567                 }
 1568 
 1569                 /* Page is isolated */
 1570                 ret = 1;
 1571                 page_lru = page_is_file_cache(page);
 1572                 if (!PageTransHuge(page))
 1573                         inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
 1574                 else
 1575                         mod_zone_page_state(page_zone(page),
 1576                                         NR_ISOLATED_ANON + page_lru,
 1577                                         HPAGE_PMD_NR);
 1578         }
 1579 
 1580         /*
 1581          * Page is either isolated or there is not enough space on the target
 1582          * node. If isolated, then it has taken a reference count and the
 1583          * caller's reference can be safely dropped without the page
 1584          * disappearing underneath us during migration. Otherwise the page is
 1585          * not to be migrated, but the caller's reference should still be
 1586          * dropped so it does not leak.
 1587          */
 1588         put_page(page);
 1589 
 1590         return ret;
 1591 }
 1592 
 1593 /*
 1594  * Attempt to migrate a misplaced page to the specified destination
 1595  * node. Caller is expected to have an elevated reference count on
 1596  * the page that will be dropped by this function before returning.
 1597  */
 1598 int migrate_misplaced_page(struct page *page, int node)
 1599 {
 1600         pg_data_t *pgdat = NODE_DATA(node);
 1601         int isolated = 0;
 1602         int nr_remaining;
 1603         LIST_HEAD(migratepages);
 1604 
 1605         /*
 1606          * Don't migrate pages that are mapped in multiple processes.
 1607          * TODO: Handle false sharing detection instead of this hammer
 1608          */
 1609         if (page_mapcount(page) != 1) {
 1610                 put_page(page);
 1611                 goto out;
 1612         }
 1613 
 1614         /*
 1615          * Rate-limit the amount of data that is being migrated to a node.
 1616          * Optimal placement is no good if the memory bus is saturated and
 1617          * all the time is being spent migrating!
 1618          */
 1619         if (numamigrate_update_ratelimit(pgdat, 1)) {
 1620                 put_page(page);
 1621                 goto out;
 1622         }
 1623 
 1624         isolated = numamigrate_isolate_page(pgdat, page);
 1625         if (!isolated)
 1626                 goto out;
 1627 
 1628         list_add(&page->lru, &migratepages);
 1629         nr_remaining = migrate_pages(&migratepages,
 1630                         alloc_misplaced_dst_page,
 1631                         node, false, MIGRATE_ASYNC,
 1632                         MR_NUMA_MISPLACED);
 1633         if (nr_remaining) {
 1634                 putback_lru_pages(&migratepages);
 1635                 isolated = 0;
 1636         } else
 1637                 count_vm_numa_event(NUMA_PAGE_MIGRATE);
 1638         BUG_ON(!list_empty(&migratepages));
 1639 out:
 1640         return isolated;
 1641 }
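To make the reference-counting contract above concrete: the caller must already hold an extra reference, and migrate_misplaced_page() consumes it on every path (success, failed isolation, multiple mappings, rate limiting). A hypothetical caller sketch, in kernel context and with the function name invented purely for illustration, might look like this:

/*
 * Hypothetical sketch only: example_numa_fault() is not a real function.
 * It shows the contract documented above: take a reference, hand it to
 * migrate_misplaced_page(), and do not touch the page afterwards.
 */
#include <linux/mm.h>
#include <linux/migrate.h>

static bool example_numa_fault(struct page *page, int target_nid)
{
        int migrated;

        get_page(page);                         /* reference consumed below */
        migrated = migrate_misplaced_page(page, target_nid);

        /* No put_page() here: migrate_misplaced_page() dropped the reference. */
        return migrated != 0;
}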
 1642 #endif /* CONFIG_NUMA_BALANCING */
 1643 
 1644 #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 1645 int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 1646                                 struct vm_area_struct *vma,
 1647                                 pmd_t *pmd, pmd_t entry,
 1648                                 unsigned long address,
 1649                                 struct page *page, int node)
 1650 {
 1651         unsigned long haddr = address & HPAGE_PMD_MASK;
 1652         pg_data_t *pgdat = NODE_DATA(node);
 1653         int isolated = 0;
 1654         struct page *new_page = NULL;
 1655         struct mem_cgroup *memcg = NULL;
 1656         int page_lru = page_is_file_cache(page);
 1657 
 1658         /*
 1659          * Don't migrate pages that are mapped in multiple processes.
 1660          * TODO: Handle false sharing detection instead of this hammer
 1661          */
 1662         if (page_mapcount(page) != 1)
 1663                 goto out_dropref;
 1664 
 1665         /*
 1666          * Rate-limit the amount of data that is being migrated to a node.
 1667          * Optimal placement is no good if the memory bus is saturated and
 1668          * all the time is being spent migrating!
 1669          */
 1670         if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
 1671                 goto out_dropref;
 1672 
 1673         new_page = alloc_pages_node(node,
 1674                 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
 1675         if (!new_page) {
 1676                 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1677                 goto out_dropref;
 1678         }
 1679         page_xchg_last_nid(new_page, page_last_nid(page));
 1680 
 1681         isolated = numamigrate_isolate_page(pgdat, page);
 1682 
 1683         /*
 1684          * Failing to isolate or a GUP pin prevents migration. The expected
 1685          * page count is 2: 1 for anonymous pages without a mapping and 1
 1686          * for the caller's pin. If the page was isolated, the page will
 1687          * need to be put back on the LRU.
 1688          */
 1689         if (!isolated || page_count(page) != 2) {
 1690                 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1691                 put_page(new_page);
 1692                 if (isolated) {
 1693                         putback_lru_page(page);
 1694                         isolated = 0;
 1695                         goto out;
 1696                 }
 1697                 goto out_keep_locked;
 1698         }
 1699 
 1700         /* Prepare a page as a migration target */
 1701         __set_page_locked(new_page);
 1702         SetPageSwapBacked(new_page);
 1703 
 1704         /* anon mapping, we can simply copy page->mapping to the new page: */
 1705         new_page->mapping = page->mapping;
 1706         new_page->index = page->index;
 1707         migrate_page_copy(new_page, page);
 1708         WARN_ON(PageLRU(new_page));
 1709 
 1710         /* Recheck the target PMD */
 1711         spin_lock(&mm->page_table_lock);
 1712         if (unlikely(!pmd_same(*pmd, entry))) {
 1713                 spin_unlock(&mm->page_table_lock);
 1714 
 1715                 /* Reverse changes made by migrate_page_copy() */
 1716                 if (TestClearPageActive(new_page))
 1717                         SetPageActive(page);
 1718                 if (TestClearPageUnevictable(new_page))
 1719                         SetPageUnevictable(page);
 1720                 mlock_migrate_page(page, new_page);
 1721 
 1722                 unlock_page(new_page);
 1723                 put_page(new_page);             /* Free it */
 1724 
 1725                 unlock_page(page);
 1726                 putback_lru_page(page);
 1727 
 1728                 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 1729                 goto out;
 1730         }
 1731 
 1732         /*
 1733          * Traditional migration needs to prepare the memcg charge
 1734          * transaction early to prevent the old page from being
 1735          * uncharged when installing migration entries.  Here we can
 1736          * save the potential rollback and start the charge transfer
 1737          * only when migration is already known to end successfully.
 1738          */
 1739         mem_cgroup_prepare_migration(page, new_page, &memcg);
 1740 
 1741         entry = mk_pmd(new_page, vma->vm_page_prot);
 1742         entry = pmd_mknonnuma(entry);
 1743         entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 1744         entry = pmd_mkhuge(entry);
 1745 
 1746         page_add_new_anon_rmap(new_page, vma, haddr);
 1747 
 1748         set_pmd_at(mm, haddr, pmd, entry);
 1749         update_mmu_cache_pmd(vma, address, &entry);
 1750         page_remove_rmap(page);
 1751         /*
 1752          * Finish the charge transaction under the page table lock to
 1753          * prevent split_huge_page() from dividing up the charge
 1754          * before it's fully transferred to the new page.
 1755          */
 1756         mem_cgroup_end_migration(memcg, page, new_page, true);
 1757         spin_unlock(&mm->page_table_lock);
 1758 
 1759         unlock_page(new_page);
 1760         unlock_page(page);
 1761         put_page(page);                 /* Drop the rmap reference */
 1762         put_page(page);                 /* Drop the LRU isolation reference */
 1763 
 1764         count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 1765         count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
 1766 
 1767 out:
 1768         mod_zone_page_state(page_zone(page),
 1769                         NR_ISOLATED_ANON + page_lru,
 1770                         -HPAGE_PMD_NR);
 1771         return isolated;
 1772 
 1773 out_dropref:
 1774         put_page(page);
 1775 out_keep_locked:
 1776         return 0;
 1777 }
 1778 #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE */
 1779 
 1780 #endif /* CONFIG_NUMA */
