FreeBSD/Linux Kernel Cross Reference
sys/mm/hugetlb.c


    1 /*
    2  * Generic hugetlb support.
    3  * (C) Nadia Yvette Chambers, April 2004
    4  */
    5 #include <linux/list.h>
    6 #include <linux/init.h>
    7 #include <linux/module.h>
    8 #include <linux/mm.h>
    9 #include <linux/seq_file.h>
   10 #include <linux/sysctl.h>
   11 #include <linux/highmem.h>
   12 #include <linux/mmu_notifier.h>
   13 #include <linux/nodemask.h>
   14 #include <linux/pagemap.h>
   15 #include <linux/mempolicy.h>
   16 #include <linux/cpuset.h>
   17 #include <linux/mutex.h>
   18 #include <linux/bootmem.h>
   19 #include <linux/sysfs.h>
   20 #include <linux/slab.h>
   21 #include <linux/rmap.h>
   22 #include <linux/swap.h>
   23 #include <linux/swapops.h>
   24 
   25 #include <asm/page.h>
   26 #include <asm/pgtable.h>
   27 #include <asm/tlb.h>
   28 
   29 #include <linux/io.h>
   30 #include <linux/hugetlb.h>
   31 #include <linux/hugetlb_cgroup.h>
   32 #include <linux/node.h>
   33 #include "internal.h"
   34 
   35 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
   36 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
   37 unsigned long hugepages_treat_as_movable;
   38 
   39 int hugetlb_max_hstate __read_mostly;
   40 unsigned int default_hstate_idx;
   41 struct hstate hstates[HUGE_MAX_HSTATE];
   42 
   43 __initdata LIST_HEAD(huge_boot_pages);
   44 
   45 /* for command line parsing */
   46 static struct hstate * __initdata parsed_hstate;
   47 static unsigned long __initdata default_hstate_max_huge_pages;
   48 static unsigned long __initdata default_hstate_size;
   49 
   50 /*
   51  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
   52  */
   53 DEFINE_SPINLOCK(hugetlb_lock);
   54 
   55 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
   56 {
   57         bool free = (spool->count == 0) && (spool->used_hpages == 0);
   58 
   59         spin_unlock(&spool->lock);
   60 
   61         /* If no pages are used, and no other handles to the subpool
   62          * remain, free the subpool. */
   63         if (free)
   64                 kfree(spool);
   65 }
   66 
   67 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
   68 {
   69         struct hugepage_subpool *spool;
   70 
   71         spool = kmalloc(sizeof(*spool), GFP_KERNEL);
   72         if (!spool)
   73                 return NULL;
   74 
   75         spin_lock_init(&spool->lock);
   76         spool->count = 1;
   77         spool->max_hpages = nr_blocks;
   78         spool->used_hpages = 0;
   79 
   80         return spool;
   81 }
   82 
   83 void hugepage_put_subpool(struct hugepage_subpool *spool)
   84 {
   85         spin_lock(&spool->lock);
   86         BUG_ON(!spool->count);
   87         spool->count--;
   88         unlock_or_release_subpool(spool);
   89 }
   90 
   91 static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
   92                                       long delta)
   93 {
   94         int ret = 0;
   95 
   96         if (!spool)
   97                 return 0;
   98 
   99         spin_lock(&spool->lock);
  100         if ((spool->used_hpages + delta) <= spool->max_hpages) {
  101                 spool->used_hpages += delta;
  102         } else {
  103                 ret = -ENOMEM;
  104         }
  105         spin_unlock(&spool->lock);
  106 
  107         return ret;
  108 }
  109 
  110 static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
  111                                        long delta)
  112 {
  113         if (!spool)
  114                 return;
  115 
  116         spin_lock(&spool->lock);
  117         spool->used_hpages -= delta;
  118         /* If hugetlbfs_put_super couldn't free spool due to
  119          * an outstanding quota reference, free it now. */
  120         unlock_or_release_subpool(spool);
  121 }
  122 
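The subpool used above is essentially a per-hugetlbfs-mount quota counter, charged when a huge page is handed out and credited when the page is freed. A minimal userspace sketch of that accounting (illustrative only; names invented, locking omitted):

#include <errno.h>
#include <stdio.h>

struct toy_subpool { long max_hpages; long used_hpages; };

/* Like hugepage_subpool_get_pages(): fail once the quota would be exceeded. */
static int toy_get_pages(struct toy_subpool *sp, long delta)
{
        if (sp->used_hpages + delta > sp->max_hpages)
                return -ENOMEM;
        sp->used_hpages += delta;
        return 0;
}

/* Like hugepage_subpool_put_pages(): give quota back. */
static void toy_put_pages(struct toy_subpool *sp, long delta)
{
        sp->used_hpages -= delta;
}

int main(void)
{
        struct toy_subpool sp = { .max_hpages = 2, .used_hpages = 0 };

        printf("%d\n", toy_get_pages(&sp, 2));  /* 0: fits within the quota */
        printf("%d\n", toy_get_pages(&sp, 1));  /* -ENOMEM: quota exhausted */
        toy_put_pages(&sp, 2);                  /* pages freed, quota returned */
        printf("%d\n", toy_get_pages(&sp, 1));  /* 0: quota available again */
        return 0;
}
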
  123 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
  124 {
  125         return HUGETLBFS_SB(inode->i_sb)->spool;
  126 }
  127 
  128 static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
  129 {
  130         return subpool_inode(vma->vm_file->f_dentry->d_inode);
  131 }
  132 
  133 /*
  134  * Region tracking -- allows tracking of reservations and instantiated pages
  135  *                    across the pages in a mapping.
  136  *
  137  * The region data structures are protected by a combination of the mmap_sem
  138  * and the hugetlb_instantiation_mutex.  To access or modify a region the caller
  139  * must either hold the mmap_sem for write, or the mmap_sem for read and
  140  * the hugetlb_instantiation mutex:
  141  *
  142  *      down_write(&mm->mmap_sem);
  143  * or
  144  *      down_read(&mm->mmap_sem);
  145  *      mutex_lock(&hugetlb_instantiation_mutex);
  146  */
  147 struct file_region {
  148         struct list_head link;
  149         long from;
  150         long to;
  151 };
  152 
  153 static long region_add(struct list_head *head, long f, long t)
  154 {
  155         struct file_region *rg, *nrg, *trg;
  156 
  157         /* Locate the region we are either in or before. */
  158         list_for_each_entry(rg, head, link)
  159                 if (f <= rg->to)
  160                         break;
  161 
  162         /* Round our left edge to the current segment if it encloses us. */
  163         if (f > rg->from)
  164                 f = rg->from;
  165 
  166         /* Check for and consume any regions we now overlap with. */
  167         nrg = rg;
  168         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
  169                 if (&rg->link == head)
  170                         break;
  171                 if (rg->from > t)
  172                         break;
  173 
  174                 /* If this area reaches higher then extend our area to
  175                  * include it completely.  If this is not the first area
  176                  * which we intend to reuse, free it. */
  177                 if (rg->to > t)
  178                         t = rg->to;
  179                 if (rg != nrg) {
  180                         list_del(&rg->link);
  181                         kfree(rg);
  182                 }
  183         }
  184         nrg->from = f;
  185         nrg->to = t;
  186         return 0;
  187 }
  188 
  189 static long region_chg(struct list_head *head, long f, long t)
  190 {
  191         struct file_region *rg, *nrg;
  192         long chg = 0;
  193 
  194         /* Locate the region we are before or in. */
  195         list_for_each_entry(rg, head, link)
  196                 if (f <= rg->to)
  197                         break;
  198 
  199         /* If we are below the current region then a new region is required.
  200          * This is subtle: allocate a new region at the position, but make
  201          * it zero size so that we can guarantee to record the reservation. */
  202         if (&rg->link == head || t < rg->from) {
  203                 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
  204                 if (!nrg)
  205                         return -ENOMEM;
  206                 nrg->from = f;
  207                 nrg->to   = f;
  208                 INIT_LIST_HEAD(&nrg->link);
  209                 list_add(&nrg->link, rg->link.prev);
  210 
  211                 return t - f;
  212         }
  213 
  214         /* Round our left edge to the current segment if it encloses us. */
  215         if (f > rg->from)
  216                 f = rg->from;
  217         chg = t - f;
  218 
  219         /* Check for and consume any regions we now overlap with. */
  220         list_for_each_entry(rg, rg->link.prev, link) {
  221                 if (&rg->link == head)
  222                         break;
  223                 if (rg->from > t)
  224                         return chg;
  225 
  226                 /* We overlap with this area, if it extends further than
  227                  * us then we must extend ourselves.  Account for its
  228                  * existing reservation. */
  229                 if (rg->to > t) {
  230                         chg += rg->to - t;
  231                         t = rg->to;
  232                 }
  233                 chg -= rg->to - rg->from;
  234         }
  235         return chg;
  236 }
  237 
  238 static long region_truncate(struct list_head *head, long end)
  239 {
  240         struct file_region *rg, *trg;
  241         long chg = 0;
  242 
  243         /* Locate the region we are either in or before. */
  244         list_for_each_entry(rg, head, link)
  245                 if (end <= rg->to)
  246                         break;
  247         if (&rg->link == head)
  248                 return 0;
  249 
  250         /* If we are in the middle of a region then adjust it. */
  251         if (end > rg->from) {
  252                 chg = rg->to - end;
  253                 rg->to = end;
  254                 rg = list_entry(rg->link.next, typeof(*rg), link);
  255         }
  256 
  257         /* Drop any remaining regions. */
  258         list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
  259                 if (&rg->link == head)
  260                         break;
  261                 chg += rg->to - rg->from;
  262                 list_del(&rg->link);
  263                 kfree(rg);
  264         }
  265         return chg;
  266 }
  267 
  268 static long region_count(struct list_head *head, long f, long t)
  269 {
  270         struct file_region *rg;
  271         long chg = 0;
  272 
  273         /* Locate each segment we overlap with, and count that overlap. */
  274         list_for_each_entry(rg, head, link) {
  275                 long seg_from;
  276                 long seg_to;
  277 
  278                 if (rg->to <= f)
  279                         continue;
  280                 if (rg->from >= t)
  281                         break;
  282 
  283                 seg_from = max(rg->from, f);
  284                 seg_to = min(rg->to, t);
  285 
  286                 chg += seg_to - seg_from;
  287         }
  288 
  289         return chg;
  290 }
  291 
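The region_* helpers above keep a sorted list of disjoint [from, to) intervals recording which huge-page offsets of a file have a reservation: region_chg() reports how many offsets in a range are not yet covered, and region_add() then merges the range in. A rough userspace sketch of the same accounting, using a bitmap instead of the kernel's interval list (illustrative only):

#include <stdio.h>
#include <string.h>

#define NPAGES 64
static unsigned char reserved[NPAGES];  /* 1 = this huge-page offset is reserved */

/* Like region_chg(): how many offsets in [f, t) still need a reservation? */
static long model_region_chg(long f, long t)
{
        long chg = 0;
        for (long i = f; i < t; i++)
                if (!reserved[i])
                        chg++;
        return chg;
}

/* Like region_add(): record that [f, t) is now reserved. */
static void model_region_add(long f, long t)
{
        for (long i = f; i < t; i++)
                reserved[i] = 1;
}

int main(void)
{
        memset(reserved, 0, sizeof(reserved));
        printf("chg = %ld\n", model_region_chg(0, 4));  /* 4: nothing reserved yet */
        model_region_add(0, 4);
        printf("chg = %ld\n", model_region_chg(2, 6));  /* 2: only offsets 4 and 5 */
        model_region_add(2, 6);
        return 0;
}
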
  292 /*
  293  * Convert the address within this vma to the page offset within
  294  * the mapping, in pagecache page units; huge pages here.
  295  */
  296 static pgoff_t vma_hugecache_offset(struct hstate *h,
  297                         struct vm_area_struct *vma, unsigned long address)
  298 {
  299         return ((address - vma->vm_start) >> huge_page_shift(h)) +
  300                         (vma->vm_pgoff >> huge_page_order(h));
  301 }
  302 
  303 pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
  304                                      unsigned long address)
  305 {
  306         return vma_hugecache_offset(hstate_vma(vma), vma, address);
  307 }
  308 
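To make the arithmetic in vma_hugecache_offset() concrete, here is a small worked example (editor's illustration; a 2 MB huge page size and a mapping that starts at file offset zero are assumed):

#include <stdio.h>

int main(void)
{
        unsigned long huge_page_shift = 21;               /* 2 MB huge pages */
        unsigned long page_shift = 12;                    /* 4 KB base pages */
        unsigned long vm_start = 0x7f0000000000UL;
        unsigned long vm_pgoff = 0;                       /* file offset of vm_start, in base pages */
        unsigned long address = vm_start + (4UL << 20);   /* fault 4 MB into the mapping */

        unsigned long huge_page_order = huge_page_shift - page_shift;
        unsigned long idx = ((address - vm_start) >> huge_page_shift) +
                            (vm_pgoff >> huge_page_order);

        printf("huge page index = %lu\n", idx);           /* prints 2 */
        return 0;
}
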
  309 /*
  310  * Return the size of the pages allocated when backing a VMA. In the majority
  311  * of cases this will be the same size as used by the page table entries.
  312  */
  313 unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
  314 {
  315         struct hstate *hstate;
  316 
  317         if (!is_vm_hugetlb_page(vma))
  318                 return PAGE_SIZE;
  319 
  320         hstate = hstate_vma(vma);
  321 
  322         return 1UL << (hstate->order + PAGE_SHIFT);
  323 }
  324 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
  325 
  326 /*
  327  * Return the page size being used by the MMU to back a VMA. In the majority
  328  * of cases, the page size used by the kernel matches the MMU size. On
  329  * architectures where it differs, an architecture-specific version of this
  330  * function is required.
  331  */
  332 #ifndef vma_mmu_pagesize
  333 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
  334 {
  335         return vma_kernel_pagesize(vma);
  336 }
  337 #endif
  338 
  339 /*
  340  * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
  341  * bits of the reservation map pointer, which are always clear due to
  342  * alignment.
  343  */
  344 #define HPAGE_RESV_OWNER    (1UL << 0)
  345 #define HPAGE_RESV_UNMAPPED (1UL << 1)
  346 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
  347 
  348 /*
  349  * These helpers are used to track how many pages are reserved for
  350  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
  351  * is guaranteed to have its future faults succeed.
  352  *
  353  * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
  354  * the reserve counters are updated with the hugetlb_lock held. It is safe
  355  * to reset the VMA at fork() time as it is not in use yet and there is no
  356  * chance of the global counters getting corrupted as a result of the values.
  357  *
  358  * The private mapping reservation is represented in a subtly different
  359  * manner to a shared mapping.  A shared mapping has a region map associated
  360  * with the underlying file; this region map represents the backing file
  361  * pages which have ever had a reservation assigned, and it persists even
  362  * after the page is instantiated.  A private mapping has a region map
  363  * associated with the original mmap which is attached to all VMAs which
  364  * reference it; this region map represents those offsets which have consumed
  365  * a reservation, i.e. where pages have been instantiated.
  366  */
  367 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
  368 {
  369         return (unsigned long)vma->vm_private_data;
  370 }
  371 
  372 static void set_vma_private_data(struct vm_area_struct *vma,
  373                                                         unsigned long value)
  374 {
  375         vma->vm_private_data = (void *)value;
  376 }
  377 
  378 struct resv_map {
  379         struct kref refs;
  380         struct list_head regions;
  381 };
  382 
  383 static struct resv_map *resv_map_alloc(void)
  384 {
  385         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
  386         if (!resv_map)
  387                 return NULL;
  388 
  389         kref_init(&resv_map->refs);
  390         INIT_LIST_HEAD(&resv_map->regions);
  391 
  392         return resv_map;
  393 }
  394 
  395 static void resv_map_release(struct kref *ref)
  396 {
  397         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
  398 
  399         /* Clear out any active regions before we release the map. */
  400         region_truncate(&resv_map->regions, 0);
  401         kfree(resv_map);
  402 }
  403 
  404 static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
  405 {
  406         VM_BUG_ON(!is_vm_hugetlb_page(vma));
  407         if (!(vma->vm_flags & VM_MAYSHARE))
  408                 return (struct resv_map *)(get_vma_private_data(vma) &
  409                                                         ~HPAGE_RESV_MASK);
  410         return NULL;
  411 }
  412 
  413 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
  414 {
  415         VM_BUG_ON(!is_vm_hugetlb_page(vma));
  416         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
  417 
  418         set_vma_private_data(vma, (get_vma_private_data(vma) &
  419                                 HPAGE_RESV_MASK) | (unsigned long)map);
  420 }
  421 
  422 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
  423 {
  424         VM_BUG_ON(!is_vm_hugetlb_page(vma));
  425         VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
  426 
  427         set_vma_private_data(vma, get_vma_private_data(vma) | flags);
  428 }
  429 
  430 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
  431 {
  432         VM_BUG_ON(!is_vm_hugetlb_page(vma));
  433 
  434         return (get_vma_private_data(vma) & flag) != 0;
  435 }
  436 
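The HPAGE_RESV_* flags above and the resv_map pointer share vma->vm_private_data: because a kmalloc()'d map is at least word aligned, its two low bits are always clear and can carry the flags. A userspace sketch of the same pointer-tagging trick (illustrative only, not kernel code):

#include <stdio.h>
#include <stdlib.h>

#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK     (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

int main(void)
{
        void *map = malloc(64);                  /* stands in for resv_map_alloc() */
        unsigned long priv = (unsigned long)map; /* low bits are clear (alignment) */

        priv |= HPAGE_RESV_OWNER;                /* like set_vma_resv_flags() */
        printf("owner flag set: %d\n", (priv & HPAGE_RESV_OWNER) != 0);
        printf("map recovered:  %s\n",
               (void *)(priv & ~HPAGE_RESV_MASK) == map ? "yes" : "no");

        free(map);
        return 0;
}
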
  437 /* Decrement the reserved pages in the hugepage pool by one */
  438 static void decrement_hugepage_resv_vma(struct hstate *h,
  439                         struct vm_area_struct *vma)
  440 {
  441         if (vma->vm_flags & VM_NORESERVE)
  442                 return;
  443 
  444         if (vma->vm_flags & VM_MAYSHARE) {
  445                 /* Shared mappings always use reserves */
  446                 h->resv_huge_pages--;
  447         } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
  448                 /*
  449                  * Only the process that called mmap() has reserves for
  450                  * private mappings.
  451                  */
  452                 h->resv_huge_pages--;
  453         }
  454 }
  455 
  456 /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
  457 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
  458 {
  459         VM_BUG_ON(!is_vm_hugetlb_page(vma));
  460         if (!(vma->vm_flags & VM_MAYSHARE))
  461                 vma->vm_private_data = (void *)0;
  462 }
  463 
  464 /* Returns true if the VMA has associated reserve pages */
  465 static int vma_has_reserves(struct vm_area_struct *vma)
  466 {
  467         if (vma->vm_flags & VM_MAYSHARE)
  468                 return 1;
  469         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
  470                 return 1;
  471         return 0;
  472 }
  473 
  474 static void copy_gigantic_page(struct page *dst, struct page *src)
  475 {
  476         int i;
  477         struct hstate *h = page_hstate(src);
  478         struct page *dst_base = dst;
  479         struct page *src_base = src;
  480 
  481         for (i = 0; i < pages_per_huge_page(h); ) {
  482                 cond_resched();
  483                 copy_highpage(dst, src);
  484 
  485                 i++;
  486                 dst = mem_map_next(dst, dst_base, i);
  487                 src = mem_map_next(src, src_base, i);
  488         }
  489 }
  490 
  491 void copy_huge_page(struct page *dst, struct page *src)
  492 {
  493         int i;
  494         struct hstate *h = page_hstate(src);
  495 
  496         if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
  497                 copy_gigantic_page(dst, src);
  498                 return;
  499         }
  500 
  501         might_sleep();
  502         for (i = 0; i < pages_per_huge_page(h); i++) {
  503                 cond_resched();
  504                 copy_highpage(dst + i, src + i);
  505         }
  506 }
  507 
  508 static void enqueue_huge_page(struct hstate *h, struct page *page)
  509 {
  510         int nid = page_to_nid(page);
  511         list_move(&page->lru, &h->hugepage_freelists[nid]);
  512         h->free_huge_pages++;
  513         h->free_huge_pages_node[nid]++;
  514 }
  515 
  516 static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
  517 {
  518         struct page *page;
  519 
  520         if (list_empty(&h->hugepage_freelists[nid]))
  521                 return NULL;
  522         page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
  523         list_move(&page->lru, &h->hugepage_activelist);
  524         set_page_refcounted(page);
  525         h->free_huge_pages--;
  526         h->free_huge_pages_node[nid]--;
  527         return page;
  528 }
  529 
  530 static struct page *dequeue_huge_page_vma(struct hstate *h,
  531                                 struct vm_area_struct *vma,
  532                                 unsigned long address, int avoid_reserve)
  533 {
  534         struct page *page = NULL;
  535         struct mempolicy *mpol;
  536         nodemask_t *nodemask;
  537         struct zonelist *zonelist;
  538         struct zone *zone;
  539         struct zoneref *z;
  540         unsigned int cpuset_mems_cookie;
  541 
  542 retry_cpuset:
  543         cpuset_mems_cookie = get_mems_allowed();
  544         zonelist = huge_zonelist(vma, address,
  545                                         htlb_alloc_mask, &mpol, &nodemask);
  546         /*
  547          * A child process with MAP_PRIVATE mappings created by its parent
  548          * has no page reserves. This check ensures that reservations are
  549          * not "stolen". The child may still get SIGKILLed.
  550          */
  551         if (!vma_has_reserves(vma) &&
  552                         h->free_huge_pages - h->resv_huge_pages == 0)
  553                 goto err;
  554 
  555         /* If reserves cannot be used, ensure enough pages are in the pool */
  556         if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
  557                 goto err;
  558 
  559         for_each_zone_zonelist_nodemask(zone, z, zonelist,
  560                                                 MAX_NR_ZONES - 1, nodemask) {
  561                 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
  562                         page = dequeue_huge_page_node(h, zone_to_nid(zone));
  563                         if (page) {
  564                                 if (!avoid_reserve)
  565                                         decrement_hugepage_resv_vma(h, vma);
  566                                 break;
  567                         }
  568                 }
  569         }
  570 
  571         mpol_cond_put(mpol);
  572         if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
  573                 goto retry_cpuset;
  574         return page;
  575 
  576 err:
  577         mpol_cond_put(mpol);
  578         return NULL;
  579 }
  580 
  581 static void update_and_free_page(struct hstate *h, struct page *page)
  582 {
  583         int i;
  584 
  585         VM_BUG_ON(h->order >= MAX_ORDER);
  586 
  587         h->nr_huge_pages--;
  588         h->nr_huge_pages_node[page_to_nid(page)]--;
  589         for (i = 0; i < pages_per_huge_page(h); i++) {
  590                 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
  591                                 1 << PG_referenced | 1 << PG_dirty |
  592                                 1 << PG_active | 1 << PG_reserved |
  593                                 1 << PG_private | 1 << PG_writeback);
  594         }
  595         VM_BUG_ON(hugetlb_cgroup_from_page(page));
  596         set_compound_page_dtor(page, NULL);
  597         set_page_refcounted(page);
  598         arch_release_hugepage(page);
  599         __free_pages(page, huge_page_order(h));
  600 }
  601 
  602 struct hstate *size_to_hstate(unsigned long size)
  603 {
  604         struct hstate *h;
  605 
  606         for_each_hstate(h) {
  607                 if (huge_page_size(h) == size)
  608                         return h;
  609         }
  610         return NULL;
  611 }
  612 
  613 static void free_huge_page(struct page *page)
  614 {
  615         /*
  616          * Can't pass hstate in here because it is called from the
  617          * compound page destructor.
  618          */
  619         struct hstate *h = page_hstate(page);
  620         int nid = page_to_nid(page);
  621         struct hugepage_subpool *spool =
  622                 (struct hugepage_subpool *)page_private(page);
  623 
  624         set_page_private(page, 0);
  625         page->mapping = NULL;
  626         BUG_ON(page_count(page));
  627         BUG_ON(page_mapcount(page));
  628 
  629         spin_lock(&hugetlb_lock);
  630         hugetlb_cgroup_uncharge_page(hstate_index(h),
  631                                      pages_per_huge_page(h), page);
  632         if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
  633                 /* remove the page from active list */
  634                 list_del(&page->lru);
  635                 update_and_free_page(h, page);
  636                 h->surplus_huge_pages--;
  637                 h->surplus_huge_pages_node[nid]--;
  638         } else {
  639                 arch_clear_hugepage_flags(page);
  640                 enqueue_huge_page(h, page);
  641         }
  642         spin_unlock(&hugetlb_lock);
  643         hugepage_subpool_put_pages(spool, 1);
  644 }
  645 
  646 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
  647 {
  648         INIT_LIST_HEAD(&page->lru);
  649         set_compound_page_dtor(page, free_huge_page);
  650         spin_lock(&hugetlb_lock);
  651         set_hugetlb_cgroup(page, NULL);
  652         h->nr_huge_pages++;
  653         h->nr_huge_pages_node[nid]++;
  654         spin_unlock(&hugetlb_lock);
  655         put_page(page); /* free it into the hugepage allocator */
  656 }
  657 
  658 static void prep_compound_gigantic_page(struct page *page, unsigned long order)
  659 {
  660         int i;
  661         int nr_pages = 1 << order;
  662         struct page *p = page + 1;
  663 
  664         /* we rely on prep_new_huge_page to set the destructor */
  665         set_compound_order(page, order);
  666         __SetPageHead(page);
  667         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
  668                 __SetPageTail(p);
  669                 set_page_count(p, 0);
  670                 p->first_page = page;
  671         }
  672 }
  673 
  674 /*
  675  * PageHuge() only returns true for hugetlbfs pages, but not for normal or
  676  * transparent huge pages.  See the PageTransHuge() documentation for more
  677  * details.
  678  */
  679 int PageHuge(struct page *page)
  680 {
  681         compound_page_dtor *dtor;
  682 
  683         if (!PageCompound(page))
  684                 return 0;
  685 
  686         page = compound_head(page);
  687         dtor = get_compound_page_dtor(page);
  688 
  689         return dtor == free_huge_page;
  690 }
  691 EXPORT_SYMBOL_GPL(PageHuge);
  692 
  693 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  694 {
  695         struct page *page;
  696 
  697         if (h->order >= MAX_ORDER)
  698                 return NULL;
  699 
  700         page = alloc_pages_exact_node(nid,
  701                 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  702                                                 __GFP_REPEAT|__GFP_NOWARN,
  703                 huge_page_order(h));
  704         if (page) {
  705                 if (arch_prepare_hugepage(page)) {
  706                         __free_pages(page, huge_page_order(h));
  707                         return NULL;
  708                 }
  709                 prep_new_huge_page(h, page, nid);
  710         }
  711 
  712         return page;
  713 }
  714 
  715 /*
  716  * common helper functions for hstate_next_node_to_{alloc|free}.
  717  * We may have allocated or freed a huge page based on a different
  718  * nodes_allowed previously, so h->next_node_to_{alloc|free} might
  719  * be outside of *nodes_allowed.  Ensure that we use an allowed
  720  * node for alloc or free.
  721  */
  722 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
  723 {
  724         nid = next_node(nid, *nodes_allowed);
  725         if (nid == MAX_NUMNODES)
  726                 nid = first_node(*nodes_allowed);
  727         VM_BUG_ON(nid >= MAX_NUMNODES);
  728 
  729         return nid;
  730 }
  731 
  732 static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
  733 {
  734         if (!node_isset(nid, *nodes_allowed))
  735                 nid = next_node_allowed(nid, nodes_allowed);
  736         return nid;
  737 }
  738 
  739 /*
  740  * returns the previously saved node ["this node"] from which to
  741  * allocate a persistent huge page for the pool and advance the
  742  * next node from which to allocate, handling wrap at end of node
  743  * mask.
  744  */
  745 static int hstate_next_node_to_alloc(struct hstate *h,
  746                                         nodemask_t *nodes_allowed)
  747 {
  748         int nid;
  749 
  750         VM_BUG_ON(!nodes_allowed);
  751 
  752         nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
  753         h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
  754 
  755         return nid;
  756 }
  757 
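hstate_next_node_to_alloc() above (and hstate_next_node_to_free() further down) spread work round-robin over the allowed nodes, remembering where they left off and wrapping at the end of the mask. Stripped of the nodemask machinery, the pattern looks like this (illustrative userspace code; node ids invented):

#include <stdio.h>

static int allowed[] = { 0, 2, 3 };     /* hypothetical set of allowed nodes */
static int nallowed = 3;
static int next_idx;                    /* models h->next_nid_to_alloc */

static int next_node_to_alloc(void)
{
        int nid = allowed[next_idx];
        next_idx = (next_idx + 1) % nallowed;   /* advance, wrap at end of mask */
        return nid;
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                printf("%d ", next_node_to_alloc());    /* prints: 0 2 3 0 2 */
        printf("\n");
        return 0;
}
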
  758 static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
  759 {
  760         struct page *page;
  761         int start_nid;
  762         int next_nid;
  763         int ret = 0;
  764 
  765         start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
  766         next_nid = start_nid;
  767 
  768         do {
  769                 page = alloc_fresh_huge_page_node(h, next_nid);
  770                 if (page) {
  771                         ret = 1;
  772                         break;
  773                 }
  774                 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
  775         } while (next_nid != start_nid);
  776 
  777         if (ret)
  778                 count_vm_event(HTLB_BUDDY_PGALLOC);
  779         else
  780                 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
  781 
  782         return ret;
  783 }
  784 
  785 /*
  786  * helper for free_pool_huge_page() - return the previously saved
  787  * node ["this node"] from which to free a huge page.  Advance the
  788  * next node id whether or not we find a free huge page to free so
  789  * that the next attempt to free addresses the next node.
  790  */
  791 static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
  792 {
  793         int nid;
  794 
  795         VM_BUG_ON(!nodes_allowed);
  796 
  797         nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
  798         h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
  799 
  800         return nid;
  801 }
  802 
  803 /*
  804  * Free huge page from pool from next node to free.
  805  * Attempt to keep persistent huge pages more or less
  806  * balanced over allowed nodes.
  807  * Called with hugetlb_lock locked.
  808  */
  809 static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  810                                                          bool acct_surplus)
  811 {
  812         int start_nid;
  813         int next_nid;
  814         int ret = 0;
  815 
  816         start_nid = hstate_next_node_to_free(h, nodes_allowed);
  817         next_nid = start_nid;
  818 
  819         do {
  820                 /*
  821                  * If we're returning unused surplus pages, only examine
  822                  * nodes with surplus pages.
  823                  */
  824                 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
  825                     !list_empty(&h->hugepage_freelists[next_nid])) {
  826                         struct page *page =
  827                                 list_entry(h->hugepage_freelists[next_nid].next,
  828                                           struct page, lru);
  829                         list_del(&page->lru);
  830                         h->free_huge_pages--;
  831                         h->free_huge_pages_node[next_nid]--;
  832                         if (acct_surplus) {
  833                                 h->surplus_huge_pages--;
  834                                 h->surplus_huge_pages_node[next_nid]--;
  835                         }
  836                         update_and_free_page(h, page);
  837                         ret = 1;
  838                         break;
  839                 }
  840                 next_nid = hstate_next_node_to_free(h, nodes_allowed);
  841         } while (next_nid != start_nid);
  842 
  843         return ret;
  844 }
  845 
  846 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
  847 {
  848         struct page *page;
  849         unsigned int r_nid;
  850 
  851         if (h->order >= MAX_ORDER)
  852                 return NULL;
  853 
  854         /*
  855          * Assume we will successfully allocate the surplus page to
  856          * prevent racing processes from causing the surplus to exceed
  857          * overcommit
  858          *
  859          * This however introduces a different race, where a process B
  860          * tries to grow the static hugepage pool while alloc_pages() is
  861          * called by process A. B will only examine the per-node
  862          * counters in determining if surplus huge pages can be
  863          * converted to normal huge pages in adjust_pool_surplus(). A
  864          * won't be able to increment the per-node counter, until the
  865          * lock is dropped by B, but B doesn't drop hugetlb_lock until
  866          * no more huge pages can be converted from surplus to normal
  867          * state (and doesn't try to convert again). Thus, we have a
  868          * case where a surplus huge page exists, the pool is grown, and
  869          * the surplus huge page still exists after, even though it
  870          * should just have been converted to a normal huge page. This
  871          * does not leak memory, though, as the hugepage will be freed
  872          * once it is out of use. It also does not allow the counters to
  873          * go out of whack in adjust_pool_surplus() as we don't modify
  874          * the node values until we've gotten the hugepage and only the
  875          * per-node value is checked there.
  876          */
  877         spin_lock(&hugetlb_lock);
  878         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
  879                 spin_unlock(&hugetlb_lock);
  880                 return NULL;
  881         } else {
  882                 h->nr_huge_pages++;
  883                 h->surplus_huge_pages++;
  884         }
  885         spin_unlock(&hugetlb_lock);
  886 
  887         if (nid == NUMA_NO_NODE)
  888                 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
  889                                    __GFP_REPEAT|__GFP_NOWARN,
  890                                    huge_page_order(h));
  891         else
  892                 page = alloc_pages_exact_node(nid,
  893                         htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
  894                         __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
  895 
  896         if (page && arch_prepare_hugepage(page)) {
  897                 __free_pages(page, huge_page_order(h));
  898                 page = NULL;
  899         }
  900 
  901         spin_lock(&hugetlb_lock);
  902         if (page) {
  903                 INIT_LIST_HEAD(&page->lru);
  904                 r_nid = page_to_nid(page);
  905                 set_compound_page_dtor(page, free_huge_page);
  906                 set_hugetlb_cgroup(page, NULL);
  907                 /*
  908                  * We incremented the global counters already
  909                  */
  910                 h->nr_huge_pages_node[r_nid]++;
  911                 h->surplus_huge_pages_node[r_nid]++;
  912                 __count_vm_event(HTLB_BUDDY_PGALLOC);
  913         } else {
  914                 h->nr_huge_pages--;
  915                 h->surplus_huge_pages--;
  916                 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
  917         }
  918         spin_unlock(&hugetlb_lock);
  919 
  920         return page;
  921 }
  922 
  923 /*
  924  * This allocation function is useful in the context where vma is irrelevant.
  925  * E.g. soft-offlining uses this function because it only cares about the
  926  * physical address of the error page.
  927  */
  928 struct page *alloc_huge_page_node(struct hstate *h, int nid)
  929 {
  930         struct page *page;
  931 
  932         spin_lock(&hugetlb_lock);
  933         page = dequeue_huge_page_node(h, nid);
  934         spin_unlock(&hugetlb_lock);
  935 
  936         if (!page)
  937                 page = alloc_buddy_huge_page(h, nid);
  938 
  939         return page;
  940 }
  941 
  942 /*
  943  * Increase the hugetlb pool such that it can accommodate a reservation
  944  * of size 'delta'.
  945  */
  946 static int gather_surplus_pages(struct hstate *h, int delta)
  947 {
  948         struct list_head surplus_list;
  949         struct page *page, *tmp;
  950         int ret, i;
  951         int needed, allocated;
  952         bool alloc_ok = true;
  953 
  954         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
  955         if (needed <= 0) {
  956                 h->resv_huge_pages += delta;
  957                 return 0;
  958         }
  959 
  960         allocated = 0;
  961         INIT_LIST_HEAD(&surplus_list);
  962 
  963         ret = -ENOMEM;
  964 retry:
  965         spin_unlock(&hugetlb_lock);
  966         for (i = 0; i < needed; i++) {
  967                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
  968                 if (!page) {
  969                         alloc_ok = false;
  970                         break;
  971                 }
  972                 list_add(&page->lru, &surplus_list);
  973         }
  974         allocated += i;
  975 
  976         /*
  977          * After retaking hugetlb_lock, we need to recalculate 'needed'
  978          * because either resv_huge_pages or free_huge_pages may have changed.
  979          */
  980         spin_lock(&hugetlb_lock);
  981         needed = (h->resv_huge_pages + delta) -
  982                         (h->free_huge_pages + allocated);
  983         if (needed > 0) {
  984                 if (alloc_ok)
  985                         goto retry;
  986                 /*
  987                  * We were not able to allocate enough pages to
  988                  * satisfy the entire reservation so we free what
  989                  * we've allocated so far.
  990                  */
  991                 goto free;
  992         }
  993         /*
  994          * The surplus_list now contains _at_least_ the number of extra pages
  995          * needed to accommodate the reservation.  Add the appropriate number
  996          * of pages to the hugetlb pool and free the extras back to the buddy
  997          * allocator.  Commit the entire reservation here to prevent another
  998          * process from stealing the pages as they are added to the pool but
  999          * before they are reserved.
 1000          */
 1001         needed += allocated;
 1002         h->resv_huge_pages += delta;
 1003         ret = 0;
 1004 
 1005         /* Free the needed pages to the hugetlb pool */
 1006         list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 1007                 if ((--needed) < 0)
 1008                         break;
 1009                 /*
 1010                  * This page is now managed by the hugetlb allocator and has
 1011                  * no users -- drop the buddy allocator's reference.
 1012                  */
 1013                 put_page_testzero(page);
 1014                 VM_BUG_ON(page_count(page));
 1015                 enqueue_huge_page(h, page);
 1016         }
 1017 free:
 1018         spin_unlock(&hugetlb_lock);
 1019 
 1020         /* Free unnecessary surplus pages to the buddy allocator */
 1021         if (!list_empty(&surplus_list)) {
 1022                 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 1023                         put_page(page);
 1024                 }
 1025         }
 1026         spin_lock(&hugetlb_lock);
 1027 
 1028         return ret;
 1029 }
 1030 
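The key quantity in gather_surplus_pages() is needed = (resv_huge_pages + delta) - free_huge_pages: how many extra pages must be pulled from the buddy allocator so that every outstanding reservation, including the new one, is backed by a free huge page. A small numeric sketch (values invented):

#include <stdio.h>

int main(void)
{
        long free_huge_pages = 10;      /* pages sitting on the free lists */
        long resv_huge_pages = 8;       /* already promised to reservations */
        long delta = 5;                 /* new reservation being requested */

        long needed = (resv_huge_pages + delta) - free_huge_pages;
        if (needed <= 0)
                printf("existing free pages cover the reservation\n");
        else
                printf("need %ld surplus pages from the buddy allocator\n", needed); /* 3 */
        return 0;
}
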
 1031 /*
 1032  * When releasing a hugetlb pool reservation, any surplus pages that were
 1033  * allocated to satisfy the reservation must be explicitly freed if they were
 1034  * never used.
 1035  * Called with hugetlb_lock held.
 1036  */
 1037 static void return_unused_surplus_pages(struct hstate *h,
 1038                                         unsigned long unused_resv_pages)
 1039 {
 1040         unsigned long nr_pages;
 1041 
 1042         /* Uncommit the reservation */
 1043         h->resv_huge_pages -= unused_resv_pages;
 1044 
 1045         /* Cannot return gigantic pages currently */
 1046         if (h->order >= MAX_ORDER)
 1047                 return;
 1048 
 1049         nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 1050 
 1051         /*
 1052          * We want to release as many surplus pages as possible, spread
 1053          * evenly across all nodes with memory. Iterate across these nodes
 1054          * until we can no longer free unreserved surplus pages. This occurs
 1055          * when the nodes with surplus pages have no free pages.
 1056          * free_pool_huge_page() will balance the freed pages across the
 1057          * on-line nodes with memory and will handle the hstate accounting.
 1058          */
 1059         while (nr_pages--) {
 1060                 if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
 1061                         break;
 1062         }
 1063 }
 1064 
 1065 /*
 1066  * Determine if the huge page at addr within the vma has an associated
 1067  * reservation.  Where it does not, we will need to logically increase the
 1068  * reservation and actually increase subpool usage before an allocation
 1069  * can occur.  Where any new reservation would be required the
 1070  * reservation change is prepared, but not committed.  Once the page
 1071  * has been allocated from the subpool and instantiated the change should
 1072  * be committed via vma_commit_reservation.  No action is required on
 1073  * failure.
 1074  */
 1075 static long vma_needs_reservation(struct hstate *h,
 1076                         struct vm_area_struct *vma, unsigned long addr)
 1077 {
 1078         struct address_space *mapping = vma->vm_file->f_mapping;
 1079         struct inode *inode = mapping->host;
 1080 
 1081         if (vma->vm_flags & VM_MAYSHARE) {
 1082                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 1083                 return region_chg(&inode->i_mapping->private_list,
 1084                                                         idx, idx + 1);
 1085 
 1086         } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 1087                 return 1;
 1088 
 1089         } else  {
 1090                 long err;
 1091                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 1092                 struct resv_map *reservations = vma_resv_map(vma);
 1093 
 1094                 err = region_chg(&reservations->regions, idx, idx + 1);
 1095                 if (err < 0)
 1096                         return err;
 1097                 return 0;
 1098         }
 1099 }
 1100 static void vma_commit_reservation(struct hstate *h,
 1101                         struct vm_area_struct *vma, unsigned long addr)
 1102 {
 1103         struct address_space *mapping = vma->vm_file->f_mapping;
 1104         struct inode *inode = mapping->host;
 1105 
 1106         if (vma->vm_flags & VM_MAYSHARE) {
 1107                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 1108                 region_add(&inode->i_mapping->private_list, idx, idx + 1);
 1109 
 1110         } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 1111                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 1112                 struct resv_map *reservations = vma_resv_map(vma);
 1113 
 1114                 /* Mark this page used in the map. */
 1115                 region_add(&reservations->regions, idx, idx + 1);
 1116         }
 1117 }
 1118 
 1119 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 1120                                     unsigned long addr, int avoid_reserve)
 1121 {
 1122         struct hugepage_subpool *spool = subpool_vma(vma);
 1123         struct hstate *h = hstate_vma(vma);
 1124         struct page *page;
 1125         long chg;
 1126         int ret, idx;
 1127         struct hugetlb_cgroup *h_cg;
 1128 
 1129         idx = hstate_index(h);
 1130         /*
 1131          * Processes that did not create the mapping will have no
 1132          * reserves and will not have accounted against subpool
 1133          * limit. Check that the subpool limit can be made before
 1134          * satisfying the allocation. MAP_NORESERVE mappings may also
 1135          * need pages and subpool limit allocated if no reserve
 1136          * mapping overlaps.
 1137          */
 1138         chg = vma_needs_reservation(h, vma, addr);
 1139         if (chg < 0)
 1140                 return ERR_PTR(-ENOMEM);
 1141         if (chg)
 1142                 if (hugepage_subpool_get_pages(spool, chg))
 1143                         return ERR_PTR(-ENOSPC);
 1144 
 1145         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
 1146         if (ret) {
 1147                 hugepage_subpool_put_pages(spool, chg);
 1148                 return ERR_PTR(-ENOSPC);
 1149         }
 1150         spin_lock(&hugetlb_lock);
 1151         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 1152         if (page) {
 1153                 /* update page cgroup details */
 1154                 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
 1155                                              h_cg, page);
 1156                 spin_unlock(&hugetlb_lock);
 1157         } else {
 1158                 spin_unlock(&hugetlb_lock);
 1159                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 1160                 if (!page) {
 1161                         hugetlb_cgroup_uncharge_cgroup(idx,
 1162                                                        pages_per_huge_page(h),
 1163                                                        h_cg);
 1164                         hugepage_subpool_put_pages(spool, chg);
 1165                         return ERR_PTR(-ENOSPC);
 1166                 }
 1167                 spin_lock(&hugetlb_lock);
 1168                 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
 1169                                              h_cg, page);
 1170                 list_move(&page->lru, &h->hugepage_activelist);
 1171                 spin_unlock(&hugetlb_lock);
 1172         }
 1173 
 1174         set_page_private(page, (unsigned long)spool);
 1175 
 1176         vma_commit_reservation(h, vma, addr);
 1177         return page;
 1178 }
 1179 
 1180 int __weak alloc_bootmem_huge_page(struct hstate *h)
 1181 {
 1182         struct huge_bootmem_page *m;
 1183         int nr_nodes = nodes_weight(node_states[N_MEMORY]);
 1184 
 1185         while (nr_nodes) {
 1186                 void *addr;
 1187 
 1188                 addr = __alloc_bootmem_node_nopanic(
 1189                                 NODE_DATA(hstate_next_node_to_alloc(h,
 1190                                                 &node_states[N_MEMORY])),
 1191                                 huge_page_size(h), huge_page_size(h), 0);
 1192 
 1193                 if (addr) {
 1194                         /*
 1195                          * Use the beginning of the huge page to store the
 1196                          * huge_bootmem_page struct (until gather_bootmem
 1197                          * puts them into the mem_map).
 1198                          */
 1199                         m = addr;
 1200                         goto found;
 1201                 }
 1202                 nr_nodes--;
 1203         }
 1204         return 0;
 1205 
 1206 found:
 1207         BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
 1208         /* Put them into a private list first because mem_map is not up yet */
 1209         list_add(&m->list, &huge_boot_pages);
 1210         m->hstate = h;
 1211         return 1;
 1212 }
 1213 
 1214 static void prep_compound_huge_page(struct page *page, int order)
 1215 {
 1216         if (unlikely(order > (MAX_ORDER - 1)))
 1217                 prep_compound_gigantic_page(page, order);
 1218         else
 1219                 prep_compound_page(page, order);
 1220 }
 1221 
 1222 /* Put bootmem huge pages into the standard lists after mem_map is up */
 1223 static void __init gather_bootmem_prealloc(void)
 1224 {
 1225         struct huge_bootmem_page *m;
 1226 
 1227         list_for_each_entry(m, &huge_boot_pages, list) {
 1228                 struct hstate *h = m->hstate;
 1229                 struct page *page;
 1230 
 1231 #ifdef CONFIG_HIGHMEM
 1232                 page = pfn_to_page(m->phys >> PAGE_SHIFT);
 1233                 free_bootmem_late((unsigned long)m,
 1234                                   sizeof(struct huge_bootmem_page));
 1235 #else
 1236                 page = virt_to_page(m);
 1237 #endif
 1238                 __ClearPageReserved(page);
 1239                 WARN_ON(page_count(page) != 1);
 1240                 prep_compound_huge_page(page, h->order);
 1241                 prep_new_huge_page(h, page, page_to_nid(page));
 1242                 /*
 1243                  * If we had gigantic hugepages allocated at boot time, we need
 1244                  * to restore the 'stolen' pages to totalram_pages in order to
 1245                  * fix confusing memory reports from free(1) and other
 1246                  * side-effects, like CommitLimit going negative.
 1247                  */
 1248                 if (h->order > (MAX_ORDER - 1))
 1249                         totalram_pages += 1 << h->order;
 1250         }
 1251 }
 1252 
 1253 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 1254 {
 1255         unsigned long i;
 1256 
 1257         for (i = 0; i < h->max_huge_pages; ++i) {
 1258                 if (h->order >= MAX_ORDER) {
 1259                         if (!alloc_bootmem_huge_page(h))
 1260                                 break;
 1261                 } else if (!alloc_fresh_huge_page(h,
 1262                                          &node_states[N_MEMORY]))
 1263                         break;
 1264         }
 1265         h->max_huge_pages = i;
 1266 }
 1267 
 1268 static void __init hugetlb_init_hstates(void)
 1269 {
 1270         struct hstate *h;
 1271 
 1272         for_each_hstate(h) {
 1273                 /* oversize hugepages were init'ed in early boot */
 1274                 if (h->order < MAX_ORDER)
 1275                         hugetlb_hstate_alloc_pages(h);
 1276         }
 1277 }
 1278 
 1279 static char * __init memfmt(char *buf, unsigned long n)
 1280 {
 1281         if (n >= (1UL << 30))
 1282                 sprintf(buf, "%lu GB", n >> 30);
 1283         else if (n >= (1UL << 20))
 1284                 sprintf(buf, "%lu MB", n >> 20);
 1285         else
 1286                 sprintf(buf, "%lu KB", n >> 10);
 1287         return buf;
 1288 }
 1289 
 1290 static void __init report_hugepages(void)
 1291 {
 1292         struct hstate *h;
 1293 
 1294         for_each_hstate(h) {
 1295                 char buf[32];
 1296                 printk(KERN_INFO "HugeTLB registered %s page size, "
 1297                                  "pre-allocated %ld pages\n",
 1298                         memfmt(buf, huge_page_size(h)),
 1299                         h->free_huge_pages);
 1300         }
 1301 }
 1302 
 1303 #ifdef CONFIG_HIGHMEM
 1304 static void try_to_free_low(struct hstate *h, unsigned long count,
 1305                                                 nodemask_t *nodes_allowed)
 1306 {
 1307         int i;
 1308 
 1309         if (h->order >= MAX_ORDER)
 1310                 return;
 1311 
 1312         for_each_node_mask(i, *nodes_allowed) {
 1313                 struct page *page, *next;
 1314                 struct list_head *freel = &h->hugepage_freelists[i];
 1315                 list_for_each_entry_safe(page, next, freel, lru) {
 1316                         if (count >= h->nr_huge_pages)
 1317                                 return;
 1318                         if (PageHighMem(page))
 1319                                 continue;
 1320                         list_del(&page->lru);
 1321                         update_and_free_page(h, page);
 1322                         h->free_huge_pages--;
 1323                         h->free_huge_pages_node[page_to_nid(page)]--;
 1324                 }
 1325         }
 1326 }
 1327 #else
 1328 static inline void try_to_free_low(struct hstate *h, unsigned long count,
 1329                                                 nodemask_t *nodes_allowed)
 1330 {
 1331 }
 1332 #endif
 1333 
 1334 /*
 1335  * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 1336  * balanced by operating on them in a round-robin fashion.
 1337  * Returns 1 if an adjustment was made.
 1338  */
 1339 static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 1340                                 int delta)
 1341 {
 1342         int start_nid, next_nid;
 1343         int ret = 0;
 1344 
 1345         VM_BUG_ON(delta != -1 && delta != 1);
 1346 
 1347         if (delta < 0)
 1348                 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
 1349         else
 1350                 start_nid = hstate_next_node_to_free(h, nodes_allowed);
 1351         next_nid = start_nid;
 1352 
 1353         do {
 1354                 int nid = next_nid;
 1355                 if (delta < 0)  {
 1356                         /*
 1357                          * To shrink on this node, there must be a surplus page
 1358                          */
 1359                         if (!h->surplus_huge_pages_node[nid]) {
 1360                                 next_nid = hstate_next_node_to_alloc(h,
 1361                                                                 nodes_allowed);
 1362                                 continue;
 1363                         }
 1364                 }
 1365                 if (delta > 0) {
 1366                         /*
 1367                          * Surplus cannot exceed the total number of pages
 1368                          */
 1369                         if (h->surplus_huge_pages_node[nid] >=
 1370                                                 h->nr_huge_pages_node[nid]) {
 1371                                 next_nid = hstate_next_node_to_free(h,
 1372                                                                 nodes_allowed);
 1373                                 continue;
 1374                         }
 1375                 }
 1376 
 1377                 h->surplus_huge_pages += delta;
 1378                 h->surplus_huge_pages_node[nid] += delta;
 1379                 ret = 1;
 1380                 break;
 1381         } while (next_nid != start_nid);
 1382 
 1383         return ret;
 1384 }
 1385 
 1386 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
 1387 static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 1388                                                 nodemask_t *nodes_allowed)
 1389 {
 1390         unsigned long min_count, ret;
 1391 
 1392         if (h->order >= MAX_ORDER)
 1393                 return h->max_huge_pages;
 1394 
 1395         /*
 1396          * Increase the pool size
 1397          * First take pages out of surplus state.  Then make up the
 1398          * remaining difference by allocating fresh huge pages.
 1399          *
 1400          * We might race with alloc_buddy_huge_page() here and be unable
 1401          * to convert a surplus huge page to a normal huge page. That is
 1402          * not critical, though, it just means the overall size of the
 1403          * pool might be one hugepage larger than it needs to be, but
 1404          * within all the constraints specified by the sysctls.
 1405          */
 1406         spin_lock(&hugetlb_lock);
 1407         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
 1408                 if (!adjust_pool_surplus(h, nodes_allowed, -1))
 1409                         break;
 1410         }
 1411 
 1412         while (count > persistent_huge_pages(h)) {
 1413                 /*
 1414                  * If this allocation races such that we no longer need the
 1415                  * page, free_huge_page will handle it by freeing the page
 1416                  * and reducing the surplus.
 1417                  */
 1418                 spin_unlock(&hugetlb_lock);
 1419                 ret = alloc_fresh_huge_page(h, nodes_allowed);
 1420                 spin_lock(&hugetlb_lock);
 1421                 if (!ret)
 1422                         goto out;
 1423 
 1424                 /* Bail for signals. Probably ctrl-c from user */
 1425                 if (signal_pending(current))
 1426                         goto out;
 1427         }
 1428 
 1429         /*
 1430          * Decrease the pool size
 1431          * First return free pages to the buddy allocator (being careful
 1432          * to keep enough around to satisfy reservations).  Then place
 1433          * pages into surplus state as needed so the pool will shrink
 1434          * to the desired size as pages become free.
 1435          *
 1436          * By placing pages into the surplus state independent of the
 1437          * overcommit value, we are allowing the surplus pool size to
 1438          * exceed overcommit. There are few sane options here. Since
 1439          * alloc_buddy_huge_page() is checking the global counter,
 1440          * though, we'll note that we're not allowed to exceed surplus
 1441          * and won't grow the pool anywhere else. Not until one of the
 1442          * sysctls is changed, or the surplus pages go out of use.
 1443          */
 1444         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 1445         min_count = max(count, min_count);
 1446         try_to_free_low(h, min_count, nodes_allowed);
 1447         while (min_count < persistent_huge_pages(h)) {
 1448                 if (!free_pool_huge_page(h, nodes_allowed, 0))
 1449                         break;
 1450         }
 1451         while (count < persistent_huge_pages(h)) {
 1452                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
 1453                         break;
 1454         }
 1455 out:
 1456         ret = persistent_huge_pages(h);
 1457         spin_unlock(&hugetlb_lock);
 1458         return ret;
 1459 }
 1460 
 1461 #define HSTATE_ATTR_RO(_name) \
 1462         static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 1463 
 1464 #define HSTATE_ATTR(_name) \
 1465         static struct kobj_attribute _name##_attr = \
 1466                 __ATTR(_name, 0644, _name##_show, _name##_store)
 1467 
 1468 static struct kobject *hugepages_kobj;
 1469 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
 1470 
 1471 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
 1472 
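      /*
       * Map a hugepages sysfs kobject back to its hstate.  Global attribute
       * kobjects report NUMA_NO_NODE via *nidp; otherwise fall through to the
       * per-node lookup in kobj_to_node_hstate().
       */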
 1473 static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
 1474 {
 1475         int i;
 1476 
 1477         for (i = 0; i < HUGE_MAX_HSTATE; i++)
 1478                 if (hstate_kobjs[i] == kobj) {
 1479                         if (nidp)
 1480                                 *nidp = NUMA_NO_NODE;
 1481                         return &hstates[i];
 1482                 }
 1483 
 1484         return kobj_to_node_hstate(kobj, nidp);
 1485 }
 1486 
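      /* Show the huge page count of this hstate, globally or for one node. */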
 1487 static ssize_t nr_hugepages_show_common(struct kobject *kobj,
 1488                                         struct kobj_attribute *attr, char *buf)
 1489 {
 1490         struct hstate *h;
 1491         unsigned long nr_huge_pages;
 1492         int nid;
 1493 
 1494         h = kobj_to_hstate(kobj, &nid);
 1495         if (nid == NUMA_NO_NODE)
 1496                 nr_huge_pages = h->nr_huge_pages;
 1497         else
 1498                 nr_huge_pages = h->nr_huge_pages_node[nid];
 1499 
 1500         return sprintf(buf, "%lu\n", nr_huge_pages);
 1501 }
 1502 
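      /*
       * Parse a new pool size and resize the persistent pool.  A global
       * attribute uses the task mempolicy (when obey_mempolicy) or all memory
       * nodes; a per-node attribute converts the request to a global target
       * restricted to that node.
       */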
 1503 static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
 1504                         struct kobject *kobj, struct kobj_attribute *attr,
 1505                         const char *buf, size_t len)
 1506 {
 1507         int err;
 1508         int nid;
 1509         unsigned long count;
 1510         struct hstate *h;
 1511         NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
 1512 
 1513         err = strict_strtoul(buf, 10, &count);
 1514         if (err)
 1515                 goto out;
 1516 
 1517         h = kobj_to_hstate(kobj, &nid);
 1518         if (h->order >= MAX_ORDER) {
 1519                 err = -EINVAL;
 1520                 goto out;
 1521         }
 1522 
 1523         if (nid == NUMA_NO_NODE) {
 1524                 /*
 1525                  * global hstate attribute
 1526                  */
 1527                 if (!(obey_mempolicy &&
 1528                                 init_nodemask_of_mempolicy(nodes_allowed))) {
 1529                         NODEMASK_FREE(nodes_allowed);
 1530                         nodes_allowed = &node_states[N_MEMORY];
 1531                 }
 1532         } else if (nodes_allowed) {
 1533                 /*
 1534                  * per node hstate attribute: adjust count to global,
 1535                  * but restrict alloc/free to the specified node.
 1536                  */
 1537                 count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
 1538                 init_nodemask_of_node(nodes_allowed, nid);
 1539         } else
 1540                 nodes_allowed = &node_states[N_MEMORY];
 1541 
 1542         h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
 1543 
 1544         if (nodes_allowed != &node_states[N_MEMORY])
 1545                 NODEMASK_FREE(nodes_allowed);
 1546 
 1547         return len;
 1548 out:
 1549         NODEMASK_FREE(nodes_allowed);
 1550         return err;
 1551 }
 1552 
 1553 static ssize_t nr_hugepages_show(struct kobject *kobj,
 1554                                        struct kobj_attribute *attr, char *buf)
 1555 {
 1556         return nr_hugepages_show_common(kobj, attr, buf);
 1557 }
 1558 
 1559 static ssize_t nr_hugepages_store(struct kobject *kobj,
 1560                struct kobj_attribute *attr, const char *buf, size_t len)
 1561 {
 1562         return nr_hugepages_store_common(false, kobj, attr, buf, len);
 1563 }
 1564 HSTATE_ATTR(nr_hugepages);
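      /*
       * Illustrative example: for 2 MB pages the attributes defined here show
       * up as /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages and
       * friends; writing a count to nr_hugepages resizes the persistent pool.
       */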
 1565 
 1566 #ifdef CONFIG_NUMA
 1567 
 1568 /*
 1569  * hstate attribute for optionally mempolicy-based constraint on persistent
 1570  * huge page alloc/free.
 1571  */
 1572 static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
 1573                                        struct kobj_attribute *attr, char *buf)
 1574 {
 1575         return nr_hugepages_show_common(kobj, attr, buf);
 1576 }
 1577 
 1578 static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
 1579                struct kobj_attribute *attr, const char *buf, size_t len)
 1580 {
 1581         return nr_hugepages_store_common(true, kobj, attr, buf, len);
 1582 }
 1583 HSTATE_ATTR(nr_hugepages_mempolicy);
 1584 #endif
 1585 
 1586 
 1587 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
 1588                                         struct kobj_attribute *attr, char *buf)
 1589 {
 1590         struct hstate *h = kobj_to_hstate(kobj, NULL);
 1591         return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 1592 }
 1593 
 1594 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 1595                 struct kobj_attribute *attr, const char *buf, size_t count)
 1596 {
 1597         int err;
 1598         unsigned long input;
 1599         struct hstate *h = kobj_to_hstate(kobj, NULL);
 1600 
 1601         if (h->order >= MAX_ORDER)
 1602                 return -EINVAL;
 1603 
 1604         err = strict_strtoul(buf, 10, &input);
 1605         if (err)
 1606                 return err;
 1607 
 1608         spin_lock(&hugetlb_lock);
 1609         h->nr_overcommit_huge_pages = input;
 1610         spin_unlock(&hugetlb_lock);
 1611 
 1612         return count;
 1613 }
 1614 HSTATE_ATTR(nr_overcommit_hugepages);
 1615 
 1616 static ssize_t free_hugepages_show(struct kobject *kobj,
 1617                                         struct kobj_attribute *attr, char *buf)
 1618 {
 1619         struct hstate *h;
 1620         unsigned long free_huge_pages;
 1621         int nid;
 1622 
 1623         h = kobj_to_hstate(kobj, &nid);
 1624         if (nid == NUMA_NO_NODE)
 1625                 free_huge_pages = h->free_huge_pages;
 1626         else
 1627                 free_huge_pages = h->free_huge_pages_node[nid];
 1628 
 1629         return sprintf(buf, "%lu\n", free_huge_pages);
 1630 }
 1631 HSTATE_ATTR_RO(free_hugepages);
 1632 
 1633 static ssize_t resv_hugepages_show(struct kobject *kobj,
 1634                                         struct kobj_attribute *attr, char *buf)
 1635 {
 1636         struct hstate *h = kobj_to_hstate(kobj, NULL);
 1637         return sprintf(buf, "%lu\n", h->resv_huge_pages);
 1638 }
 1639 HSTATE_ATTR_RO(resv_hugepages);
 1640 
 1641 static ssize_t surplus_hugepages_show(struct kobject *kobj,
 1642                                         struct kobj_attribute *attr, char *buf)
 1643 {
 1644         struct hstate *h;
 1645         unsigned long surplus_huge_pages;
 1646         int nid;
 1647 
 1648         h = kobj_to_hstate(kobj, &nid);
 1649         if (nid == NUMA_NO_NODE)
 1650                 surplus_huge_pages = h->surplus_huge_pages;
 1651         else
 1652                 surplus_huge_pages = h->surplus_huge_pages_node[nid];
 1653 
 1654         return sprintf(buf, "%lu\n", surplus_huge_pages);
 1655 }
 1656 HSTATE_ATTR_RO(surplus_hugepages);
 1657 
 1658 static struct attribute *hstate_attrs[] = {
 1659         &nr_hugepages_attr.attr,
 1660         &nr_overcommit_hugepages_attr.attr,
 1661         &free_hugepages_attr.attr,
 1662         &resv_hugepages_attr.attr,
 1663         &surplus_hugepages_attr.attr,
 1664 #ifdef CONFIG_NUMA
 1665         &nr_hugepages_mempolicy_attr.attr,
 1666 #endif
 1667         NULL,
 1668 };
 1669 
 1670 static struct attribute_group hstate_attr_group = {
 1671         .attrs = hstate_attrs,
 1672 };
 1673 
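      /*
       * Create the "hugepages-<size>kB" kobject for @h under @parent and
       * attach the given attribute group to it.
       */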
 1674 static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 1675                                     struct kobject **hstate_kobjs,
 1676                                     struct attribute_group *hstate_attr_group)
 1677 {
 1678         int retval;
 1679         int hi = hstate_index(h);
 1680 
 1681         hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 1682         if (!hstate_kobjs[hi])
 1683                 return -ENOMEM;
 1684 
 1685         retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
 1686         if (retval)
 1687                 kobject_put(hstate_kobjs[hi]);
 1688 
 1689         return retval;
 1690 }
 1691 
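      /*
       * Create /sys/kernel/mm/hugepages and populate it with one subdirectory
       * per hstate carrying the global attributes defined above.
       */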
 1692 static void __init hugetlb_sysfs_init(void)
 1693 {
 1694         struct hstate *h;
 1695         int err;
 1696 
 1697         hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
 1698         if (!hugepages_kobj)
 1699                 return;
 1700 
 1701         for_each_hstate(h) {
 1702                 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
 1703                                          hstate_kobjs, &hstate_attr_group);
 1704                 if (err)
 1705                         printk(KERN_ERR "Hugetlb: Unable to add hstate %s\n",
 1706                                                                 h->name);
 1707         }
 1708 }
 1709 
 1710 #ifdef CONFIG_NUMA
 1711 
 1712 /*
 1713  * node_hstate/s - associate per node hstate attributes, via their kobjects,
 1714  * with node devices in node_devices[] using a parallel array.  The array
 1715  * index of a node device or _hstate == node id.
 1716  * This is here to avoid any static dependency of the node device driver, in
 1717  * the base kernel, on the hugetlb module.
 1718  */
 1719 struct node_hstate {
 1720         struct kobject          *hugepages_kobj;
 1721         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
 1722 };
 1723 struct node_hstate node_hstates[MAX_NUMNODES];
 1724 
 1725 /*
 1726  * A subset of global hstate attributes for node devices
 1727  */
 1728 static struct attribute *per_node_hstate_attrs[] = {
 1729         &nr_hugepages_attr.attr,
 1730         &free_hugepages_attr.attr,
 1731         &surplus_hugepages_attr.attr,
 1732         NULL,
 1733 };
 1734 
 1735 static struct attribute_group per_node_hstate_attr_group = {
 1736         .attrs = per_node_hstate_attrs,
 1737 };
 1738 
 1739 /*
 1740  * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
 1741  * Returns node id via non-NULL nidp.
 1742  */
 1743 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
 1744 {
 1745         int nid;
 1746 
 1747         for (nid = 0; nid < nr_node_ids; nid++) {
 1748                 struct node_hstate *nhs = &node_hstates[nid];
 1749                 int i;
 1750                 for (i = 0; i < HUGE_MAX_HSTATE; i++)
 1751                         if (nhs->hstate_kobjs[i] == kobj) {
 1752                                 if (nidp)
 1753                                         *nidp = nid;
 1754                                 return &hstates[i];
 1755                         }
 1756         }
 1757 
 1758         BUG();
 1759         return NULL;
 1760 }
 1761 
 1762 /*
 1763  * Unregister hstate attributes from a single node device.
 1764  * No-op if no hstate attributes attached.
 1765  */
 1766 void hugetlb_unregister_node(struct node *node)
 1767 {
 1768         struct hstate *h;
 1769         struct node_hstate *nhs = &node_hstates[node->dev.id];
 1770 
 1771         if (!nhs->hugepages_kobj)
 1772                 return;         /* no hstate attributes */
 1773 
 1774         for_each_hstate(h) {
 1775                 int idx = hstate_index(h);
 1776                 if (nhs->hstate_kobjs[idx]) {
 1777                         kobject_put(nhs->hstate_kobjs[idx]);
 1778                         nhs->hstate_kobjs[idx] = NULL;
 1779                 }
 1780         }
 1781 
 1782         kobject_put(nhs->hugepages_kobj);
 1783         nhs->hugepages_kobj = NULL;
 1784 }
 1785 
 1786 /*
 1787  * hugetlb module exit:  unregister hstate attributes from node devices
 1788  * that have them.
 1789  */
 1790 static void hugetlb_unregister_all_nodes(void)
 1791 {
 1792         int nid;
 1793 
 1794         /*
 1795          * disable node device registrations.
 1796          */
 1797         register_hugetlbfs_with_node(NULL, NULL);
 1798 
 1799         /*
 1800          * remove hstate attributes from any nodes that have them.
 1801          */
 1802         for (nid = 0; nid < nr_node_ids; nid++)
 1803                 hugetlb_unregister_node(node_devices[nid]);
 1804 }
 1805 
 1806 /*
 1807  * Register hstate attributes for a single node device.
 1808  * No-op if attributes already registered.
 1809  */
 1810 void hugetlb_register_node(struct node *node)
 1811 {
 1812         struct hstate *h;
 1813         struct node_hstate *nhs = &node_hstates[node->dev.id];
 1814         int err;
 1815 
 1816         if (nhs->hugepages_kobj)
 1817                 return;         /* already allocated */
 1818 
 1819         nhs->hugepages_kobj = kobject_create_and_add("hugepages",
 1820                                                         &node->dev.kobj);
 1821         if (!nhs->hugepages_kobj)
 1822                 return;
 1823 
 1824         for_each_hstate(h) {
 1825                 err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
 1826                                                 nhs->hstate_kobjs,
 1827                                                 &per_node_hstate_attr_group);
 1828                 if (err) {
 1829                         printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
 1830                                         " for node %d\n",
 1831                                                 h->name, node->dev.id);
 1832                         hugetlb_unregister_node(node);
 1833                         break;
 1834                 }
 1835         }
 1836 }
 1837 
 1838 /*
 1839  * hugetlb init time:  register hstate attributes for all registered node
 1840  * devices of nodes that have memory.  All on-line nodes should have
 1841  * registered their associated device by this time.
 1842  */
 1843 static void hugetlb_register_all_nodes(void)
 1844 {
 1845         int nid;
 1846 
 1847         for_each_node_state(nid, N_MEMORY) {
 1848                 struct node *node = node_devices[nid];
 1849                 if (node->dev.id == nid)
 1850                         hugetlb_register_node(node);
 1851         }
 1852 
 1853         /*
 1854          * Let the node device driver know we're here so it can
 1855          * [un]register hstate attributes on node hotplug.
 1856          */
 1857         register_hugetlbfs_with_node(hugetlb_register_node,
 1858                                      hugetlb_unregister_node);
 1859 }
 1860 #else   /* !CONFIG_NUMA */
 1861 
 1862 static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
 1863 {
 1864         BUG();
 1865         if (nidp)
 1866                 *nidp = -1;
 1867         return NULL;
 1868 }
 1869 
 1870 static void hugetlb_unregister_all_nodes(void) { }
 1871 
 1872 static void hugetlb_register_all_nodes(void) { }
 1873 
 1874 #endif
 1875 
 1876 static void __exit hugetlb_exit(void)
 1877 {
 1878         struct hstate *h;
 1879 
 1880         hugetlb_unregister_all_nodes();
 1881 
 1882         for_each_hstate(h) {
 1883                 kobject_put(hstate_kobjs[hstate_index(h)]);
 1884         }
 1885 
 1886         kobject_put(hugepages_kobj);
 1887 }
 1888 module_exit(hugetlb_exit);
 1889 
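      /*
       * Late boot initialization: pick the default hstate, preallocate any
       * pools requested on the command line, and register the sysfs, per-node,
       * and cgroup interfaces.
       */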
 1890 static int __init hugetlb_init(void)
 1891 {
 1892         /* Some platforms decide whether they support huge pages at boot
 1893          * time. On those, such as powerpc, HPAGE_SHIFT is set to 0 when
 1894          * there is no such support.
 1895          */
 1896         if (HPAGE_SHIFT == 0)
 1897                 return 0;
 1898 
 1899         if (!size_to_hstate(default_hstate_size)) {
 1900                 default_hstate_size = HPAGE_SIZE;
 1901                 if (!size_to_hstate(default_hstate_size))
 1902                         hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 1903         }
 1904         default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 1905         if (default_hstate_max_huge_pages)
 1906                 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 1907 
 1908         hugetlb_init_hstates();
 1909         gather_bootmem_prealloc();
 1910         report_hugepages();
 1911 
 1912         hugetlb_sysfs_init();
 1913         hugetlb_register_all_nodes();
 1914         hugetlb_cgroup_file_init();
 1915 
 1916         return 0;
 1917 }
 1918 module_init(hugetlb_init);
 1919 
 1920 /* Should be called on processing a hugepagesz=... option */
 1921 void __init hugetlb_add_hstate(unsigned order)
 1922 {
 1923         struct hstate *h;
 1924         unsigned long i;
 1925 
 1926         if (size_to_hstate(PAGE_SIZE << order)) {
 1927                 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 1928                 return;
 1929         }
 1930         BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 1931         BUG_ON(order == 0);
 1932         h = &hstates[hugetlb_max_hstate++];
 1933         h->order = order;
 1934         h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 1935         h->nr_huge_pages = 0;
 1936         h->free_huge_pages = 0;
 1937         for (i = 0; i < MAX_NUMNODES; ++i)
 1938                 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
 1939         INIT_LIST_HEAD(&h->hugepage_activelist);
 1940         h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
 1941         h->next_nid_to_free = first_node(node_states[N_MEMORY]);
 1942         snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 1943                                         huge_page_size(h)/1024);
 1944 
 1945         parsed_hstate = h;
 1946 }
 1947 
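      /*
       * Handle the hugepages= boot parameter: record the requested pool size
       * for the most recently parsed hstate, or for the default hstate when
       * no hugepagesz= has been seen yet.
       */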
 1948 static int __init hugetlb_nrpages_setup(char *s)
 1949 {
 1950         unsigned long *mhp;
 1951         static unsigned long *last_mhp;
 1952 
 1953         /*
 1954          * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 1955          * so this hugepages= parameter goes to the "default hstate".
 1956          */
 1957         if (!hugetlb_max_hstate)
 1958                 mhp = &default_hstate_max_huge_pages;
 1959         else
 1960                 mhp = &parsed_hstate->max_huge_pages;
 1961 
 1962         if (mhp == last_mhp) {
 1963                 printk(KERN_WARNING "hugepages= specified twice without "
 1964                         "interleaving hugepagesz=, ignoring\n");
 1965                 return 1;
 1966         }
 1967 
 1968         if (sscanf(s, "%lu", mhp) <= 0)
 1969                 *mhp = 0;
 1970 
 1971         /*
 1972          * Global state is always initialized later in hugetlb_init.
 1973          * But we need to allocate >= MAX_ORDER hstates here early to still
 1974          * use the bootmem allocator.
 1975          */
 1976         if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 1977                 hugetlb_hstate_alloc_pages(parsed_hstate);
 1978 
 1979         last_mhp = mhp;
 1980 
 1981         return 1;
 1982 }
 1983 __setup("hugepages=", hugetlb_nrpages_setup);
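      /*
       * Illustrative example command line combining both options:
       *   default_hugepagesz=2M hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=512
       * Each hugepages= applies to the hugepagesz= that precedes it.
       */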
 1984 
 1985 static int __init hugetlb_default_setup(char *s)
 1986 {
 1987         default_hstate_size = memparse(s, &s);
 1988         return 1;
 1989 }
 1990 __setup("default_hugepagesz=", hugetlb_default_setup);
 1991 
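      /*
       * Sum the per-node counters in @array over the nodes allowed by the
       * current task's cpuset.
       */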
 1992 static unsigned int cpuset_mems_nr(unsigned int *array)
 1993 {
 1994         int node;
 1995         unsigned int nr = 0;
 1996 
 1997         for_each_node_mask(node, cpuset_current_mems_allowed)
 1998                 nr += array[node];
 1999 
 2000         return nr;
 2001 }
 2002 
 2003 #ifdef CONFIG_SYSCTL
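      /*
       * Common back end for the vm.nr_hugepages and vm.nr_hugepages_mempolicy
       * sysctls: read or resize the default hstate's persistent pool.
       */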
 2004 static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
 2005                          struct ctl_table *table, int write,
 2006                          void __user *buffer, size_t *length, loff_t *ppos)
 2007 {
 2008         struct hstate *h = &default_hstate;
 2009         unsigned long tmp;
 2010         int ret;
 2011 
 2012         tmp = h->max_huge_pages;
 2013 
 2014         if (write && h->order >= MAX_ORDER)
 2015                 return -EINVAL;
 2016 
 2017         table->data = &tmp;
 2018         table->maxlen = sizeof(unsigned long);
 2019         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
 2020         if (ret)
 2021                 goto out;
 2022 
 2023         if (write) {
 2024                 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
 2025                                                 GFP_KERNEL | __GFP_NORETRY);
 2026                 if (!(obey_mempolicy &&
 2027                                init_nodemask_of_mempolicy(nodes_allowed))) {
 2028                         NODEMASK_FREE(nodes_allowed);
 2029                         nodes_allowed = &node_states[N_MEMORY];
 2030                 }
 2031                 h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
 2032 
 2033                 if (nodes_allowed != &node_states[N_MEMORY])
 2034                         NODEMASK_FREE(nodes_allowed);
 2035         }
 2036 out:
 2037         return ret;
 2038 }
 2039 
 2040 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
 2041                           void __user *buffer, size_t *length, loff_t *ppos)
 2042 {
 2043 
 2044         return hugetlb_sysctl_handler_common(false, table, write,
 2045                                                         buffer, length, ppos);
 2046 }
 2047 
 2048 #ifdef CONFIG_NUMA
 2049 int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
 2050                           void __user *buffer, size_t *length, loff_t *ppos)
 2051 {
 2052         return hugetlb_sysctl_handler_common(true, table, write,
 2053                                                         buffer, length, ppos);
 2054 }
 2055 #endif /* CONFIG_NUMA */
 2056 
 2057 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
 2058                         void __user *buffer,
 2059                         size_t *length, loff_t *ppos)
 2060 {
 2061         proc_dointvec(table, write, buffer, length, ppos);
 2062         if (hugepages_treat_as_movable)
 2063                 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
 2064         else
 2065                 htlb_alloc_mask = GFP_HIGHUSER;
 2066         return 0;
 2067 }
 2068 
 2069 int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 2070                         void __user *buffer,
 2071                         size_t *length, loff_t *ppos)
 2072 {
 2073         struct hstate *h = &default_hstate;
 2074         unsigned long tmp;
 2075         int ret;
 2076 
 2077         tmp = h->nr_overcommit_huge_pages;
 2078 
 2079         if (write && h->order >= MAX_ORDER)
 2080                 return -EINVAL;
 2081 
 2082         table->data = &tmp;
 2083         table->maxlen = sizeof(unsigned long);
 2084         ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
 2085         if (ret)
 2086                 goto out;
 2087 
 2088         if (write) {
 2089                 spin_lock(&hugetlb_lock);
 2090                 h->nr_overcommit_huge_pages = tmp;
 2091                 spin_unlock(&hugetlb_lock);
 2092         }
 2093 out:
 2094         return ret;
 2095 }
 2096 
 2097 #endif /* CONFIG_SYSCTL */
 2098 
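      /* Emit the HugePages_* lines of /proc/meminfo for the default hstate. */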
 2099 void hugetlb_report_meminfo(struct seq_file *m)
 2100 {
 2101         struct hstate *h = &default_hstate;
 2102         seq_printf(m,
 2103                         "HugePages_Total:   %5lu\n"
 2104                         "HugePages_Free:    %5lu\n"
 2105                         "HugePages_Rsvd:    %5lu\n"
 2106                         "HugePages_Surp:    %5lu\n"
 2107                         "Hugepagesize:   %8lu kB\n",
 2108                         h->nr_huge_pages,
 2109                         h->free_huge_pages,
 2110                         h->resv_huge_pages,
 2111                         h->surplus_huge_pages,
 2112                         1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 2113 }
 2114 
 2115 int hugetlb_report_node_meminfo(int nid, char *buf)
 2116 {
 2117         struct hstate *h = &default_hstate;
 2118         return sprintf(buf,
 2119                 "Node %d HugePages_Total: %5u\n"
 2120                 "Node %d HugePages_Free:  %5u\n"
 2121                 "Node %d HugePages_Surp:  %5u\n",
 2122                 nid, h->nr_huge_pages_node[nid],
 2123                 nid, h->free_huge_pages_node[nid],
 2124                 nid, h->surplus_huge_pages_node[nid]);
 2125 }
 2126 
 2127 /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
 2128 unsigned long hugetlb_total_pages(void)
 2129 {
 2130         struct hstate *h = &default_hstate;
 2131         return h->nr_huge_pages * pages_per_huge_page(h);
 2132 }
 2133 
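      /*
       * Charge or uncharge @delta huge pages of reservation against the global
       * pool, drawing surplus pages from the buddy allocator when needed.
       */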
 2134 static int hugetlb_acct_memory(struct hstate *h, long delta)
 2135 {
 2136         int ret = -ENOMEM;
 2137 
 2138         spin_lock(&hugetlb_lock);
 2139         /*
 2140          * When cpuset is configured, it breaks the strict hugetlb page
 2141          * reservation, as the accounting is done on a global variable. Such
 2142          * a reservation is effectively meaningless in the presence of
 2143          * cpusets because the reservation is not checked against page
 2144          * availability for the current cpuset. An application can still be
 2145          * OOM-killed by the kernel for lack of free hugetlb pages in the
 2146          * cpuset the task is in. Enforcing strict accounting with cpusets
 2147          * is almost impossible (or too ugly) because cpusets are so fluid
 2148          * that tasks and memory nodes can be dynamically moved between them.
 2149          *
 2150          * The change of semantics for shared hugetlb mappings with cpusets
 2151          * is undesirable. However, in order to preserve some of the
 2152          * semantics, we fall back to checking against current free page
 2153          * availability as a best effort, hopefully minimizing the impact of
 2154          * the changed semantics that cpusets bring.
 2155          */
 2156         if (delta > 0) {
 2157                 if (gather_surplus_pages(h, delta) < 0)
 2158                         goto out;
 2159 
 2160                 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
 2161                         return_unused_surplus_pages(h, delta);
 2162                         goto out;
 2163                 }
 2164         }
 2165 
 2166         ret = 0;
 2167         if (delta < 0)
 2168                 return_unused_surplus_pages(h, (unsigned long) -delta);
 2169 
 2170 out:
 2171         spin_unlock(&hugetlb_lock);
 2172         return ret;
 2173 }
 2174 
 2175 static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 2176 {
 2177         struct resv_map *reservations = vma_resv_map(vma);
 2178 
 2179         /*
 2180          * This new VMA should share its sibling's reservation map if present.
 2181          * The VMA will only ever have a valid reservation map pointer where
 2182          * it is being copied for another still existing VMA.  As that VMA
 2183          * has a reference to the reservation map it cannot disappear until
 2184          * after this open call completes.  It is therefore safe to take a
 2185          * new reference here without additional locking.
 2186          */
 2187         if (reservations)
 2188                 kref_get(&reservations->refs);
 2189 }
 2190 
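      /* Drop the VMA's reference on its reservation map, if it has one. */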
 2191 static void resv_map_put(struct vm_area_struct *vma)
 2192 {
 2193         struct resv_map *reservations = vma_resv_map(vma);
 2194 
 2195         if (!reservations)
 2196                 return;
 2197         kref_put(&reservations->refs, resv_map_release);
 2198 }
 2199 
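      /*
       * On unmap, return any reservation the VMA took but never consumed back
       * to the subpool and the global pool.
       */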
 2200 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 2201 {
 2202         struct hstate *h = hstate_vma(vma);
 2203         struct resv_map *reservations = vma_resv_map(vma);
 2204         struct hugepage_subpool *spool = subpool_vma(vma);
 2205         unsigned long reserve;
 2206         unsigned long start;
 2207         unsigned long end;
 2208 
 2209         if (reservations) {
 2210                 start = vma_hugecache_offset(h, vma, vma->vm_start);
 2211                 end = vma_hugecache_offset(h, vma, vma->vm_end);
 2212 
 2213                 reserve = (end - start) -
 2214                         region_count(&reservations->regions, start, end);
 2215 
 2216                 resv_map_put(vma);
 2217 
 2218                 if (reserve) {
 2219                         hugetlb_acct_memory(h, -reserve);
 2220                         hugepage_subpool_put_pages(spool, reserve);
 2221                 }
 2222         }
 2223 }
 2224 
 2225 /*
 2226  * We cannot handle pagefaults against hugetlb pages at all.  They cause
 2227  * handle_mm_fault() to try to instantiate regular-sized pages in the
 2228  * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 2229  * this far.
 2230  */
 2231 static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 2232 {
 2233         BUG();
 2234         return 0;
 2235 }
 2236 
 2237 const struct vm_operations_struct hugetlb_vm_ops = {
 2238         .fault = hugetlb_vm_op_fault,
 2239         .open = hugetlb_vm_op_open,
 2240         .close = hugetlb_vm_op_close,
 2241 };
 2242 
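      /* Build the huge PTE for @page with protections derived from the VMA. */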
 2243 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
 2244                                 int writable)
 2245 {
 2246         pte_t entry;
 2247 
 2248         if (writable) {
 2249                 entry =
 2250                     pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
 2251         } else {
 2252                 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
 2253         }
 2254         entry = pte_mkyoung(entry);
 2255         entry = pte_mkhuge(entry);
 2256         entry = arch_make_huge_pte(entry, vma, page, writable);
 2257 
 2258         return entry;
 2259 }
 2260 
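      /*
       * Make an existing huge PTE writable and dirty, updating the MMU cache
       * if the entry changed.
       */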
 2261 static void set_huge_ptep_writable(struct vm_area_struct *vma,
 2262                                    unsigned long address, pte_t *ptep)
 2263 {
 2264         pte_t entry;
 2265 
 2266         entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
 2267         if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
 2268                 update_mmu_cache(vma, address, ptep);
 2269 }
 2270 
 2271 
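      /*
       * Copy @vma's hugetlb page table entries from @src to @dst at fork time,
       * sharing the underlying pages and write-protecting both sides for
       * private COW mappings.
       */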
 2272 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 2273                             struct vm_area_struct *vma)
 2274 {
 2275         pte_t *src_pte, *dst_pte, entry;
 2276         struct page *ptepage;
 2277         unsigned long addr;
 2278         int cow;
 2279         struct hstate *h = hstate_vma(vma);
 2280         unsigned long sz = huge_page_size(h);
 2281 
 2282         cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 2283 
 2284         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 2285                 src_pte = huge_pte_offset(src, addr);
 2286                 if (!src_pte)
 2287                         continue;
 2288                 dst_pte = huge_pte_alloc(dst, addr, sz);
 2289                 if (!dst_pte)
 2290                         goto nomem;
 2291 
 2292                 /* If the pagetables are shared don't copy or take references */
 2293                 if (dst_pte == src_pte)
 2294                         continue;
 2295 
 2296                 spin_lock(&dst->page_table_lock);
 2297                 spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
 2298                 if (!huge_pte_none(huge_ptep_get(src_pte))) {
 2299                         if (cow)
 2300                                 huge_ptep_set_wrprotect(src, addr, src_pte);
 2301                         entry = huge_ptep_get(src_pte);
 2302                         ptepage = pte_page(entry);
 2303                         get_page(ptepage);
 2304                         page_dup_rmap(ptepage);
 2305                         set_huge_pte_at(dst, addr, dst_pte, entry);
 2306                 }
 2307                 spin_unlock(&src->page_table_lock);
 2308                 spin_unlock(&dst->page_table_lock);
 2309         }
 2310         return 0;
 2311 
 2312 nomem:
 2313         return -ENOMEM;
 2314 }
 2315 
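      /* True if @pte is a swap entry for an in-flight huge page migration. */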
 2316 static int is_hugetlb_entry_migration(pte_t pte)
 2317 {
 2318         swp_entry_t swp;
 2319 
 2320         if (huge_pte_none(pte) || pte_present(pte))
 2321                 return 0;
 2322         swp = pte_to_swp_entry(pte);
 2323         if (non_swap_entry(swp) && is_migration_entry(swp))
 2324                 return 1;
 2325         else
 2326                 return 0;
 2327 }
 2328 
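      /* True if @pte is a swap entry marking a hardware-poisoned huge page. */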
 2329 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 2330 {
 2331         swp_entry_t swp;
 2332 
 2333         if (huge_pte_none(pte) || pte_present(pte))
 2334                 return 0;
 2335         swp = pte_to_swp_entry(pte);
 2336         if (non_swap_entry(swp) && is_hwpoison_entry(swp))
 2337                 return 1;
 2338         else
 2339                 return 0;
 2340 }
 2341 
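      /*
       * Tear down the huge PTEs in [start, end) of @vma, accumulating the pages
       * in @tlb.  If @ref_page is supplied, only that page is unmapped.
       */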
 2342 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 2343                             unsigned long start, unsigned long end,
 2344                             struct page *ref_page)
 2345 {
 2346         int force_flush = 0;
 2347         struct mm_struct *mm = vma->vm_mm;
 2348         unsigned long address;
 2349         pte_t *ptep;
 2350         pte_t pte;
 2351         struct page *page;
 2352         struct hstate *h = hstate_vma(vma);
 2353         unsigned long sz = huge_page_size(h);
 2354         const unsigned long mmun_start = start; /* For mmu_notifiers */
 2355         const unsigned long mmun_end   = end;   /* For mmu_notifiers */
 2356 
 2357         WARN_ON(!is_vm_hugetlb_page(vma));
 2358         BUG_ON(start & ~huge_page_mask(h));
 2359         BUG_ON(end & ~huge_page_mask(h));
 2360 
 2361         tlb_start_vma(tlb, vma);
 2362         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 2363 again:
 2364         spin_lock(&mm->page_table_lock);
 2365         for (address = start; address < end; address += sz) {
 2366                 ptep = huge_pte_offset(mm, address);
 2367                 if (!ptep)
 2368                         continue;
 2369 
 2370                 if (huge_pmd_unshare(mm, &address, ptep))
 2371                         continue;
 2372 
 2373                 pte = huge_ptep_get(ptep);
 2374                 if (huge_pte_none(pte))
 2375                         continue;
 2376 
 2377                 /*
 2378                  * HWPoisoned hugepage is already unmapped and dropped reference
 2379                  */
 2380                 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
 2381                         pte_clear(mm, address, ptep);
 2382                         continue;
 2383                 }
 2384 
 2385                 page = pte_page(pte);
 2386                 /*
 2387                  * If a reference page is supplied, it is because a specific
 2388                  * page is being unmapped, not a range. Ensure the page we
 2389                  * are about to unmap is the actual page of interest.
 2390                  */
 2391                 if (ref_page) {
 2392                         if (page != ref_page)
 2393                                 continue;
 2394 
 2395                         /*
 2396                          * Mark the VMA as having unmapped its page so that
 2397                          * future faults in this VMA will fail rather than
 2398                          * looking like data was lost
 2399                          */
 2400                         set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
 2401                 }
 2402 
 2403                 pte = huge_ptep_get_and_clear(mm, address, ptep);
 2404                 tlb_remove_tlb_entry(tlb, ptep, address);
 2405                 if (pte_dirty(pte))
 2406                         set_page_dirty(page);
 2407 
 2408                 page_remove_rmap(page);
 2409                 force_flush = !__tlb_remove_page(tlb, page);
 2410                 if (force_flush)
 2411                         break;
 2412                 /* Bail out after unmapping reference page if supplied */
 2413                 if (ref_page)
 2414                         break;
 2415         }
 2416         spin_unlock(&mm->page_table_lock);
 2417         /*
 2418          * mmu_gather ran out of room to batch pages; we break out of
 2419          * the PTE lock to avoid doing the potentially expensive TLB invalidate
 2420          * and page-free while holding it.
 2421          */
 2422         if (force_flush) {
 2423                 force_flush = 0;
 2424                 tlb_flush_mmu(tlb);
 2425                 if (address < end && !ref_page)
 2426                         goto again;
 2427         }
 2428         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 2429         tlb_end_vma(tlb, vma);
 2430 }
 2431 
 2432 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 2433                           struct vm_area_struct *vma, unsigned long start,
 2434                           unsigned long end, struct page *ref_page)
 2435 {
 2436         __unmap_hugepage_range(tlb, vma, start, end, ref_page);
 2437 
 2438         /*
 2439          * Clear this flag so that x86's huge_pmd_share page_table_shareable
 2440          * test will fail on a vma being torn down, and not grab a page table
 2441          * on its way out.  We're lucky that the flag has such an appropriate
 2442          * name, and can in fact be safely cleared here. We could clear it
 2443          * before the __unmap_hugepage_range above, but all that's necessary
 2444          * is to clear it before releasing the i_mmap_mutex. This works
 2445          * because in the context this is called, the VMA is about to be
 2446          * destroyed and the i_mmap_mutex is held.
 2447          */
 2448         vma->vm_flags &= ~VM_MAYSHARE;
 2449 }
 2450 
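      /* Convenience wrapper that sets up its own mmu_gather for the range. */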
 2451 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 2452                           unsigned long end, struct page *ref_page)
 2453 {
 2454         struct mm_struct *mm;
 2455         struct mmu_gather tlb;
 2456 
 2457         mm = vma->vm_mm;
 2458 
 2459         tlb_gather_mmu(&tlb, mm, 0);
 2460         __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
 2461         tlb_finish_mmu(&tlb, start, end);
 2462 }
 2463 
 2464 /*
 2465  * This is called when the original mapper is failing to COW a MAP_PRIVATE
 2466  * mapping it owns the reserve page for. The intention is to unmap the page
 2467  * from other VMAs and let the children be SIGKILLed if they are faulting the
 2468  * same region.
 2469  */
 2470 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 2471                                 struct page *page, unsigned long address)
 2472 {
 2473         struct hstate *h = hstate_vma(vma);
 2474         struct vm_area_struct *iter_vma;
 2475         struct address_space *mapping;
 2476         pgoff_t pgoff;
 2477 
 2478         /*
 2479          * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 2480          * from page cache lookup which is in HPAGE_SIZE units.
 2481          */
 2482         address = address & huge_page_mask(h);
 2483         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
 2484                         vma->vm_pgoff;
 2485         mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 2486 
 2487         /*
 2488          * Take the mapping lock for the duration of the table walk. As
 2489          * this mapping should be shared between all the VMAs,
 2490          * __unmap_hugepage_range() is called with the lock already held.
 2491          */
 2492         mutex_lock(&mapping->i_mmap_mutex);
 2493         vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
 2494                 /* Do not unmap the current VMA */
 2495                 if (iter_vma == vma)
 2496                         continue;
 2497 
 2498                 /*
 2499                  * Unmap the page from other VMAs without their own reserves.
 2500                  * They get marked to be SIGKILLed if they fault in these
 2501                  * areas. This is because a future no-page fault on this VMA
 2502                  * could insert a zeroed page instead of the data existing
 2503                  * from the time of fork. This would look like data corruption
 2504                  * from the time of fork. This would look like data corruption.
 2505                 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 2506                         unmap_hugepage_range(iter_vma, address,
 2507                                              address + huge_page_size(h), page);
 2508         }
 2509         mutex_unlock(&mapping->i_mmap_mutex);
 2510 
 2511         return 1;
 2512 }
 2513 
 2514 /*
 2515  * Hugetlb_cow() should be called with page lock of the original hugepage held.
 2516  * Called with hugetlb_instantiation_mutex held and pte_page locked so we
 2517  * cannot race with other handlers or page migration.
 2518  * Keep the pte_same checks anyway to make transition from the mutex easier.
 2519  */
 2520 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 2521                         unsigned long address, pte_t *ptep, pte_t pte,
 2522                         struct page *pagecache_page)
 2523 {
 2524         struct hstate *h = hstate_vma(vma);
 2525         struct page *old_page, *new_page;
 2526         int avoidcopy;
 2527         int outside_reserve = 0;
 2528         unsigned long mmun_start;       /* For mmu_notifiers */
 2529         unsigned long mmun_end;         /* For mmu_notifiers */
 2530 
 2531         old_page = pte_page(pte);
 2532 
 2533 retry_avoidcopy:
 2534         /* If no-one else is actually using this page, avoid the copy
 2535          * and just make the page writable */
 2536         avoidcopy = (page_mapcount(old_page) == 1);
 2537         if (avoidcopy) {
 2538                 if (PageAnon(old_page))
 2539                         page_move_anon_rmap(old_page, vma, address);
 2540                 set_huge_ptep_writable(vma, address, ptep);
 2541                 return 0;
 2542         }
 2543 
 2544         /*
 2545          * If the process that created a MAP_PRIVATE mapping is about to
 2546          * perform a COW due to a shared page count, attempt to satisfy
 2547          * the allocation without using the existing reserves. The pagecache
 2548          * page is used to determine if the reserve at this address was
 2549          * consumed or not. If reserves were used, a partial faulted mapping
 2550          * at the time of fork() could consume its reserves on COW instead
 2551          * of the full address range.
 2552          */
 2553         if (!(vma->vm_flags & VM_MAYSHARE) &&
 2554                         is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
 2555                         old_page != pagecache_page)
 2556                 outside_reserve = 1;
 2557 
 2558         page_cache_get(old_page);
 2559 
 2560         /* Drop page_table_lock as buddy allocator may be called */
 2561         spin_unlock(&mm->page_table_lock);
 2562         new_page = alloc_huge_page(vma, address, outside_reserve);
 2563 
 2564         if (IS_ERR(new_page)) {
 2565                 long err = PTR_ERR(new_page);
 2566                 page_cache_release(old_page);
 2567 
 2568                 /*
 2569                  * If a process owning a MAP_PRIVATE mapping fails to COW,
 2570                  * it is due to references held by a child and an insufficient
 2571                  * huge page pool. To guarantee the original mapper's
 2572                  * reliability, unmap the page from child processes. The child
 2573                  * may get SIGKILLed if it later faults.
 2574                  */
 2575                 if (outside_reserve) {
 2576                         BUG_ON(huge_pte_none(pte));
 2577                         if (unmap_ref_private(mm, vma, old_page, address)) {
 2578                                 BUG_ON(huge_pte_none(pte));
 2579                                 spin_lock(&mm->page_table_lock);
 2580                                 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 2581                                 if (likely(pte_same(huge_ptep_get(ptep), pte)))
 2582                                         goto retry_avoidcopy;
 2583                                 /*
 2584                                  * A race occurred while re-acquiring the
 2585                                  * page_table_lock, and our job is done.
 2586                                  */
 2587                                 return 0;
 2588                         }
 2589                         WARN_ON_ONCE(1);
 2590                 }
 2591 
 2592                 /* Caller expects lock to be held */
 2593                 spin_lock(&mm->page_table_lock);
 2594                 if (err == -ENOMEM)
 2595                         return VM_FAULT_OOM;
 2596                 else
 2597                         return VM_FAULT_SIGBUS;
 2598         }
 2599 
 2600         /*
 2601          * When the original hugepage is a shared one, it does not have
 2602          * anon_vma prepared.
 2603          */
 2604         if (unlikely(anon_vma_prepare(vma))) {
 2605                 page_cache_release(new_page);
 2606                 page_cache_release(old_page);
 2607                 /* Caller expects lock to be held */
 2608                 spin_lock(&mm->page_table_lock);
 2609                 return VM_FAULT_OOM;
 2610         }
 2611 
 2612         copy_user_huge_page(new_page, old_page, address, vma,
 2613                             pages_per_huge_page(h));
 2614         __SetPageUptodate(new_page);
 2615 
 2616         mmun_start = address & huge_page_mask(h);
 2617         mmun_end = mmun_start + huge_page_size(h);
 2618         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 2619         /*
 2620          * Retake the page_table_lock to check for racing updates
 2621          * before the page tables are altered
 2622          */
 2623         spin_lock(&mm->page_table_lock);
 2624         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 2625         if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 2626                 /* Break COW */
 2627                 huge_ptep_clear_flush(vma, address, ptep);
 2628                 set_huge_pte_at(mm, address, ptep,
 2629                                 make_huge_pte(vma, new_page, 1));
 2630                 page_remove_rmap(old_page);
 2631                 hugepage_add_new_anon_rmap(new_page, vma, address);
 2632                 /* Make the old page be freed below */
 2633                 new_page = old_page;
 2634         }
 2635         spin_unlock(&mm->page_table_lock);
 2636         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 2637         /* Caller expects lock to be held */
 2638         spin_lock(&mm->page_table_lock);
 2639         page_cache_release(new_page);
 2640         page_cache_release(old_page);
 2641         return 0;
 2642 }
 2643 
 2644 /* Return the pagecache page at a given address within a VMA */
 2645 static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 2646                         struct vm_area_struct *vma, unsigned long address)
 2647 {
 2648         struct address_space *mapping;
 2649         pgoff_t idx;
 2650 
 2651         mapping = vma->vm_file->f_mapping;
 2652         idx = vma_hugecache_offset(h, vma, address);
 2653 
 2654         return find_lock_page(mapping, idx);
 2655 }
 2656 
 2657 /*
 2658  * Return whether there is a pagecache page to back the given address within
 2659  * the VMA.  Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
 2660  */
 2661 static bool hugetlbfs_pagecache_present(struct hstate *h,
 2662                         struct vm_area_struct *vma, unsigned long address)
 2663 {
 2664         struct address_space *mapping;
 2665         pgoff_t idx;
 2666         struct page *page;
 2667 
 2668         mapping = vma->vm_file->f_mapping;
 2669         idx = vma_hugecache_offset(h, vma, address);
 2670 
 2671         page = find_get_page(mapping, idx);
 2672         if (page)
 2673                 put_page(page);
 2674         return page != NULL;
 2675 }
 2676 
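      /*
       * Handle a fault on a huge PTE that is not yet present: find or allocate
       * the backing page, add it to the page cache or anon rmap, and install
       * the new PTE.
       */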
 2677 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 2678                         unsigned long address, pte_t *ptep, unsigned int flags)
 2679 {
 2680         struct hstate *h = hstate_vma(vma);
 2681         int ret = VM_FAULT_SIGBUS;
 2682         int anon_rmap = 0;
 2683         pgoff_t idx;
 2684         unsigned long size;
 2685         struct page *page;
 2686         struct address_space *mapping;
 2687         pte_t new_pte;
 2688 
 2689         /*
 2690          * Currently, we are forced to kill the process in the event the
 2691          * original mapper has unmapped pages from the child due to a failed
 2692          * COW. Warn that such a situation has occurred, as it may not be obvious.
 2693          */
 2694         if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
 2695                 printk(KERN_WARNING
 2696                         "PID %d killed due to inadequate hugepage pool\n",
 2697                         current->pid);
 2698                 return ret;
 2699         }
 2700 
 2701         mapping = vma->vm_file->f_mapping;
 2702         idx = vma_hugecache_offset(h, vma, address);
 2703 
 2704         /*
 2705          * Use page lock to guard against racing truncation
 2706          * before we get page_table_lock.
 2707          */
 2708 retry:
 2709         page = find_lock_page(mapping, idx);
 2710         if (!page) {
 2711                 size = i_size_read(mapping->host) >> huge_page_shift(h);
 2712                 if (idx >= size)
 2713                         goto out;
 2714                 page = alloc_huge_page(vma, address, 0);
 2715                 if (IS_ERR(page)) {
 2716                         ret = PTR_ERR(page);
 2717                         if (ret == -ENOMEM)
 2718                                 ret = VM_FAULT_OOM;
 2719                         else
 2720                                 ret = VM_FAULT_SIGBUS;
 2721                         goto out;
 2722                 }
 2723                 clear_huge_page(page, address, pages_per_huge_page(h));
 2724                 __SetPageUptodate(page);
 2725 
 2726                 if (vma->vm_flags & VM_MAYSHARE) {
 2727                         int err;
 2728                         struct inode *inode = mapping->host;
 2729 
 2730                         err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
 2731                         if (err) {
 2732                                 put_page(page);
 2733                                 if (err == -EEXIST)
 2734                                         goto retry;
 2735                                 goto out;
 2736                         }
 2737 
 2738                         spin_lock(&inode->i_lock);
 2739                         inode->i_blocks += blocks_per_huge_page(h);
 2740                         spin_unlock(&inode->i_lock);
 2741                 } else {
 2742                         lock_page(page);
 2743                         if (unlikely(anon_vma_prepare(vma))) {
 2744                                 ret = VM_FAULT_OOM;
 2745                                 goto backout_unlocked;
 2746                         }
 2747                         anon_rmap = 1;
 2748                 }
 2749         } else {
 2750                 /*
 2751                  * If a memory error occurs between mmap() and fault, some processes
 2752                  * won't have a hwpoisoned swap entry for the errored virtual address.
 2753                  * So we need to block the hugepage fault with a PG_hwpoison bit check.
 2754                  */
 2755                 if (unlikely(PageHWPoison(page))) {
 2756                         ret = VM_FAULT_HWPOISON |
 2757                                 VM_FAULT_SET_HINDEX(hstate_index(h));
 2758                         goto backout_unlocked;
 2759                 }
 2760         }
 2761 
 2762         /*
 2763          * If we are going to COW a private mapping later, we examine the
 2764          * pending reservations for this page now. This will ensure that
 2765          * any allocations necessary to record that reservation occur outside
 2766          * the spinlock.
 2767          */
 2768         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
 2769                 if (vma_needs_reservation(h, vma, address) < 0) {
 2770                         ret = VM_FAULT_OOM;
 2771                         goto backout_unlocked;
 2772                 }
 2773 
 2774         spin_lock(&mm->page_table_lock);
 2775         size = i_size_read(mapping->host) >> huge_page_shift(h);
 2776         if (idx >= size)
 2777                 goto backout;
 2778 
 2779         ret = 0;
 2780         if (!huge_pte_none(huge_ptep_get(ptep)))
 2781                 goto backout;
 2782 
 2783         if (anon_rmap)
 2784                 hugepage_add_new_anon_rmap(page, vma, address);
 2785         else
 2786                 page_dup_rmap(page);
 2787         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 2788                                 && (vma->vm_flags & VM_SHARED)));
 2789         set_huge_pte_at(mm, address, ptep, new_pte);
 2790 
 2791         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
 2792                 /* Optimization, do the COW without a second fault */
 2793                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 2794         }
 2795 
 2796         spin_unlock(&mm->page_table_lock);
 2797         unlock_page(page);
 2798 out:
 2799         return ret;
 2800 
 2801 backout:
 2802         spin_unlock(&mm->page_table_lock);
 2803 backout_unlocked:
 2804         unlock_page(page);
 2805         put_page(page);
 2806         goto out;
 2807 }
 2808 
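      /*
       * Top-level hugetlb fault handler: resolves missing entries through
       * hugetlb_no_page() and performs COW for write faults on read-only
       * entries.
       */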
 2809 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 2810                         unsigned long address, unsigned int flags)
 2811 {
 2812         pte_t *ptep;
 2813         pte_t entry;
 2814         int ret;
 2815         struct page *page = NULL;
 2816         struct page *pagecache_page = NULL;
 2817         static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 2818         struct hstate *h = hstate_vma(vma);
 2819 
 2820         address &= huge_page_mask(h);
 2821 
 2822         ptep = huge_pte_offset(mm, address);
 2823         if (ptep) {
 2824                 entry = huge_ptep_get(ptep);
 2825                 if (unlikely(is_hugetlb_entry_migration(entry))) {
 2826                         migration_entry_wait(mm, (pmd_t *)ptep, address);
 2827                         return 0;
 2828                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 2829                         return VM_FAULT_HWPOISON_LARGE |
 2830                                 VM_FAULT_SET_HINDEX(hstate_index(h));
 2831         }
 2832 
 2833         ptep = huge_pte_alloc(mm, address, huge_page_size(h));
 2834         if (!ptep)
 2835                 return VM_FAULT_OOM;
 2836 
 2837         /*
 2838          * Serialize hugepage allocation and instantiation, so that we don't
 2839          * get spurious allocation failures if two CPUs race to instantiate
 2840          * the same page in the page cache.
 2841          */
 2842         mutex_lock(&hugetlb_instantiation_mutex);
 2843         entry = huge_ptep_get(ptep);
 2844         if (huge_pte_none(entry)) {
 2845                 ret = hugetlb_no_page(mm, vma, address, ptep, flags);
 2846                 goto out_mutex;
 2847         }
 2848 
 2849         ret = 0;
 2850 
 2851         /*
 2852          * If we are going to COW the mapping later, we examine the pending
 2853          * reservations for this page now. This will ensure that any
 2854          * allocations necessary to record that reservation occur outside the
 2855          * spinlock. For private mappings, we also lookup the pagecache
 2856          * page now as it is used to determine if a reservation has been
 2857          * consumed.
 2858          */
 2859         if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) {
 2860                 if (vma_needs_reservation(h, vma, address) < 0) {
 2861                         ret = VM_FAULT_OOM;
 2862                         goto out_mutex;
 2863                 }
 2864 
 2865                 if (!(vma->vm_flags & VM_MAYSHARE))
 2866                         pagecache_page = hugetlbfs_pagecache_page(h,
 2867                                                                 vma, address);
 2868         }
 2869 
 2870         /*
 2871          * hugetlb_cow() requires page locks of pte_page(entry) and
 2872          * pagecache_page, so here we need to take the former one
 2873          * when page != pagecache_page or !pagecache_page.
 2874          * Note that locking order is always pagecache_page -> page,
 2875          * so no worry about deadlock.
 2876          */
 2877         page = pte_page(entry);
 2878         get_page(page);
 2879         if (page != pagecache_page)
 2880                 lock_page(page);
 2881 
 2882         spin_lock(&mm->page_table_lock);
 2883         /* Check for a racing update before calling hugetlb_cow */
 2884         if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
 2885                 goto out_page_table_lock;
 2886 
 2887 
 2888         if (flags & FAULT_FLAG_WRITE) {
 2889                 if (!pte_write(entry)) {
 2890                         ret = hugetlb_cow(mm, vma, address, ptep, entry,
 2891                                                         pagecache_page);
 2892                         goto out_page_table_lock;
 2893                 }
 2894                 entry = pte_mkdirty(entry);
 2895         }
 2896         entry = pte_mkyoung(entry);
 2897         if (huge_ptep_set_access_flags(vma, address, ptep, entry,
 2898                                                 flags & FAULT_FLAG_WRITE))
 2899                 update_mmu_cache(vma, address, ptep);
 2900 
 2901 out_page_table_lock:
 2902         spin_unlock(&mm->page_table_lock);
 2903 
 2904         if (pagecache_page) {
 2905                 unlock_page(pagecache_page);
 2906                 put_page(pagecache_page);
 2907         }
 2908         if (page != pagecache_page)
 2909                 unlock_page(page);
 2910         put_page(page);
 2911 
 2912 out_mutex:
 2913         mutex_unlock(&hugetlb_instantiation_mutex);
 2914 
 2915         return ret;
 2916 }
 2917 
 2918 /* Can be overridden by architectures */
 2919 __attribute__((weak)) struct page *
 2920 follow_huge_pud(struct mm_struct *mm, unsigned long address,
 2921                pud_t *pud, int write)
 2922 {
 2923         BUG();
 2924         return NULL;
 2925 }
 2926 
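      /*
       * hugetlb counterpart of the get_user_pages() walk: iterate over
       * the user address range, faulting in huge pages where necessary,
       * and fill pages[] and vmas[] one base-page entry at a time.
       * Updates *position and *length to reflect the progress made and
       * returns the running entry count i, or -EFAULT if no pages could
       * be mapped.
       */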
 2927 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 2928                         struct page **pages, struct vm_area_struct **vmas,
 2929                         unsigned long *position, int *length, int i,
 2930                         unsigned int flags)
 2931 {
 2932         unsigned long pfn_offset;
 2933         unsigned long vaddr = *position;
 2934         int remainder = *length;
 2935         struct hstate *h = hstate_vma(vma);
 2936 
 2937         spin_lock(&mm->page_table_lock);
 2938         while (vaddr < vma->vm_end && remainder) {
 2939                 pte_t *pte;
 2940                 int absent;
 2941                 struct page *page;
 2942 
 2943                 /*
 2944                  * Some archs (sparc64, sh*) have multiple pte_ts for
 2945                  * each hugepage.  We have to make sure we get the
 2946                  * first, for the page indexing below to work.
 2947                  */
 2948                 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
 2949                 absent = !pte || huge_pte_none(huge_ptep_get(pte));
 2950 
 2951                 /*
 2952                  * When coredumping, it suits get_dump_page if we just return
 2953                  * an error where there's an empty slot with no huge pagecache
 2954                  * to back it.  This way, we avoid allocating a hugepage, and
 2955                  * the sparse dumpfile avoids allocating disk blocks, but its
 2956                  * huge holes still show up with zeroes where they need to be.
 2957                  */
 2958                 if (absent && (flags & FOLL_DUMP) &&
 2959                     !hugetlbfs_pagecache_present(h, vma, vaddr)) {
 2960                         remainder = 0;
 2961                         break;
 2962                 }
 2963 
 2964                 if (absent ||
 2965                     ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
 2966                         int ret;
 2967 
 2968                         spin_unlock(&mm->page_table_lock);
 2969                         ret = hugetlb_fault(mm, vma, vaddr,
 2970                                 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
 2971                         spin_lock(&mm->page_table_lock);
 2972                         if (!(ret & VM_FAULT_ERROR))
 2973                                 continue;
 2974 
 2975                         remainder = 0;
 2976                         break;
 2977                 }
 2978 
 2979                 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 2980                 page = pte_page(huge_ptep_get(pte));
 2981 same_page:
 2982                 if (pages) {
 2983                         pages[i] = mem_map_offset(page, pfn_offset);
 2984                         get_page(pages[i]);
 2985                 }
 2986 
 2987                 if (vmas)
 2988                         vmas[i] = vma;
 2989 
 2990                 vaddr += PAGE_SIZE;
 2991                 ++pfn_offset;
 2992                 --remainder;
 2993                 ++i;
 2994                 if (vaddr < vma->vm_end && remainder &&
 2995                                 pfn_offset < pages_per_huge_page(h)) {
 2996                         /*
 2997                          * We use pfn_offset to avoid touching the pageframes
 2998                          * of this compound page.
 2999                          */
 3000                         goto same_page;
 3001                 }
 3002         }
 3003         spin_unlock(&mm->page_table_lock);
 3004         *length = remainder;
 3005         *position = vaddr;
 3006 
 3007         return i ? i : -EFAULT;
 3008 }
 3009 
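      /*
       * mprotect() support for hugetlb VMAs: walk the range one huge
       * page at a time, unsharing shared PMDs and rewriting the
       * protection bits of present entries.  Returns the number of base
       * pages affected (pages << h->order).
       */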
 3010 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 3011                 unsigned long address, unsigned long end, pgprot_t newprot)
 3012 {
 3013         struct mm_struct *mm = vma->vm_mm;
 3014         unsigned long start = address;
 3015         pte_t *ptep;
 3016         pte_t pte;
 3017         struct hstate *h = hstate_vma(vma);
 3018         unsigned long pages = 0;
 3019 
 3020         BUG_ON(address >= end);
 3021         flush_cache_range(vma, address, end);
 3022 
 3023         mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
 3024         spin_lock(&mm->page_table_lock);
 3025         for (; address < end; address += huge_page_size(h)) {
 3026                 ptep = huge_pte_offset(mm, address);
 3027                 if (!ptep)
 3028                         continue;
 3029                 if (huge_pmd_unshare(mm, &address, ptep)) {
 3030                         pages++;
 3031                         continue;
 3032                 }
 3033                 if (!huge_pte_none(huge_ptep_get(ptep))) {
 3034                         pte = huge_ptep_get_and_clear(mm, address, ptep);
 3035                         pte = pte_mkhuge(pte_modify(pte, newprot));
 3036                         set_huge_pte_at(mm, address, ptep, pte);
 3037                         pages++;
 3038                 }
 3039         }
 3040         spin_unlock(&mm->page_table_lock);
 3041         /*
 3042          * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
 3043          * may have cleared our pud entry and done put_page on the page table:
 3044          * once we release i_mmap_mutex, another task can do the final put_page
 3045          * and that page table may then be reused and filled with junk.
 3046          */
 3047         flush_tlb_range(vma, start, end);
 3048         mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 3049 
 3050         return pages << h->order;
 3051 }
 3052 
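      /*
       * Reserve huge pages for a mapping (no-op for VM_NORESERVE).
       * Shared mappings charge the inode's region map for [from, to);
       * private mappings get a fresh resv_map covering the whole range.
       * The charge is taken from the inode's subpool and then accounted
       * against the global pool via hugetlb_acct_memory().
       */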
 3053 int hugetlb_reserve_pages(struct inode *inode,
 3054                                         long from, long to,
 3055                                         struct vm_area_struct *vma,
 3056                                         vm_flags_t vm_flags)
 3057 {
 3058         long ret, chg;
 3059         struct hstate *h = hstate_inode(inode);
 3060         struct hugepage_subpool *spool = subpool_inode(inode);
 3061 
 3062         /*
 3063          * Only apply hugepage reservation if asked. At fault time, an
 3064          * attempt will be made for VM_NORESERVE to allocate a page
 3065          * without using reserves
 3066          */
 3067         if (vm_flags & VM_NORESERVE)
 3068                 return 0;
 3069 
 3070         /*
 3071          * Shared mappings base their reservation on the number of pages that
 3072          * are already allocated on behalf of the file. Private mappings need
 3073          * to reserve the full area even if read-only as mprotect() may be
 3074          * called to make the mapping read-write. Assume !vma is a shm mapping
 3075          */
 3076         if (!vma || vma->vm_flags & VM_MAYSHARE)
 3077                 chg = region_chg(&inode->i_mapping->private_list, from, to);
 3078         else {
 3079                 struct resv_map *resv_map = resv_map_alloc();
 3080                 if (!resv_map)
 3081                         return -ENOMEM;
 3082 
 3083                 chg = to - from;
 3084 
 3085                 set_vma_resv_map(vma, resv_map);
 3086                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 3087         }
 3088 
 3089         if (chg < 0) {
 3090                 ret = chg;
 3091                 goto out_err;
 3092         }
 3093 
 3094         /* There must be enough pages in the subpool for the mapping */
 3095         if (hugepage_subpool_get_pages(spool, chg)) {
 3096                 ret = -ENOSPC;
 3097                 goto out_err;
 3098         }
 3099 
 3100         /*
 3101          * Check that enough hugepages are available for the reservation;
 3102          * hand the pages back to the subpool if there are not.
 3103          */
 3104         ret = hugetlb_acct_memory(h, chg);
 3105         if (ret < 0) {
 3106                 hugepage_subpool_put_pages(spool, chg);
 3107                 goto out_err;
 3108         }
 3109 
 3110         /*
 3111          * Account for the reservations made. Shared mappings record regions
 3112          * that have reservations as they are shared by multiple VMAs.
 3113          * When the last VMA disappears, the region map says how much
 3114          * the reservation was and the page cache tells how much of
 3115          * the reservation was consumed. Private mappings are per-VMA and
 3116          * only the consumed reservations are tracked. When the VMA
 3117          * disappears, the original reservation is the VMA size and the
 3118          * consumed reservations are stored in the map. Hence, nothing
 3119          * else has to be done for private mappings here
 3120          */
 3121         if (!vma || vma->vm_flags & VM_MAYSHARE)
 3122                 region_add(&inode->i_mapping->private_list, from, to);
 3123         return 0;
 3124 out_err:
 3125         if (vma)
 3126                 resv_map_put(vma);
 3127         return ret;
 3128 }
 3129 
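      /*
       * Release reservations beyond 'offset', typically when the backing
       * file is truncated or removed: drop region map entries from
       * 'offset' onward, credit back the inode block count for the
       * 'freed' pages, and return the unconsumed reservation
       * (chg - freed) to the subpool and the global pool.
       */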
 3130 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 3131 {
 3132         struct hstate *h = hstate_inode(inode);
 3133         long chg = region_truncate(&inode->i_mapping->private_list, offset);
 3134         struct hugepage_subpool *spool = subpool_inode(inode);
 3135 
 3136         spin_lock(&inode->i_lock);
 3137         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 3138         spin_unlock(&inode->i_lock);
 3139 
 3140         hugepage_subpool_put_pages(spool, (chg - freed));
 3141         hugetlb_acct_memory(h, -(chg - freed));
 3142 }
 3143 
 3144 #ifdef CONFIG_MEMORY_FAILURE
 3145 
 3146 /* Should be called with hugetlb_lock held */
 3147 static int is_hugepage_on_freelist(struct page *hpage)
 3148 {
 3149         struct page *page;
 3150         struct page *tmp;
 3151         struct hstate *h = page_hstate(hpage);
 3152         int nid = page_to_nid(hpage);
 3153 
 3154         list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
 3155                 if (page == hpage)
 3156                         return 1;
 3157         return 0;
 3158 }
 3159 
 3160 /*
 3161  * This function is called from memory failure code.
 3162  * Assume the caller holds the page lock of the head page.
 3163  */
 3164 int dequeue_hwpoisoned_huge_page(struct page *hpage)
 3165 {
 3166         struct hstate *h = page_hstate(hpage);
 3167         int nid = page_to_nid(hpage);
 3168         int ret = -EBUSY;
 3169 
 3170         spin_lock(&hugetlb_lock);
 3171         if (is_hugepage_on_freelist(hpage)) {
 3172                 /*
 3173                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
 3174                  * but dangling hpage->lru can trigger list-debug warnings
 3175                  * (this happens when we call unpoison_memory() on it),
 3176                  * so let it point to itself with list_del_init().
 3177                  */
 3178                 list_del_init(&hpage->lru);
 3179                 set_page_refcounted(hpage);
 3180                 h->free_huge_pages--;
 3181                 h->free_huge_pages_node[nid]--;
 3182                 ret = 0;
 3183         }
 3184         spin_unlock(&hugetlb_lock);
 3185         return ret;
 3186 }
 3187 #endif

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.