FreeBSD/Linux Kernel Cross Reference
sys/mm/vmalloc.c


    1 /*
    2  *  linux/mm/vmalloc.c
    3  *
    4  *  Copyright (C) 1993  Linus Torvalds
    5  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
    6  *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
    7  *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
    8  *  Numa awareness, Christoph Lameter, SGI, June 2005
    9  */
   10 
   11 #include <linux/vmalloc.h>
   12 #include <linux/mm.h>
   13 #include <linux/module.h>
   14 #include <linux/highmem.h>
   15 #include <linux/sched.h>
   16 #include <linux/slab.h>
   17 #include <linux/spinlock.h>
   18 #include <linux/interrupt.h>
   19 #include <linux/proc_fs.h>
   20 #include <linux/seq_file.h>
   21 #include <linux/debugobjects.h>
   22 #include <linux/kallsyms.h>
   23 #include <linux/list.h>
   24 #include <linux/rbtree.h>
   25 #include <linux/radix-tree.h>
   26 #include <linux/rcupdate.h>
   27 #include <linux/pfn.h>
   28 #include <linux/kmemleak.h>
   29 #include <linux/atomic.h>
   30 #include <asm/uaccess.h>
   31 #include <asm/tlbflush.h>
   32 #include <asm/shmparam.h>
   33 
   34 /*** Page table manipulation functions ***/
   35 
   36 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
   37 {
   38         pte_t *pte;
   39 
   40         pte = pte_offset_kernel(pmd, addr);
   41         do {
   42                 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
   43                 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
   44         } while (pte++, addr += PAGE_SIZE, addr != end);
   45 }
   46 
   47 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
   48 {
   49         pmd_t *pmd;
   50         unsigned long next;
   51 
   52         pmd = pmd_offset(pud, addr);
   53         do {
   54                 next = pmd_addr_end(addr, end);
   55                 if (pmd_none_or_clear_bad(pmd))
   56                         continue;
   57                 vunmap_pte_range(pmd, addr, next);
   58         } while (pmd++, addr = next, addr != end);
   59 }
   60 
   61 static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
   62 {
   63         pud_t *pud;
   64         unsigned long next;
   65 
   66         pud = pud_offset(pgd, addr);
   67         do {
   68                 next = pud_addr_end(addr, end);
   69                 if (pud_none_or_clear_bad(pud))
   70                         continue;
   71                 vunmap_pmd_range(pud, addr, next);
   72         } while (pud++, addr = next, addr != end);
   73 }
   74 
   75 static void vunmap_page_range(unsigned long addr, unsigned long end)
   76 {
   77         pgd_t *pgd;
   78         unsigned long next;
   79 
   80         BUG_ON(addr >= end);
   81         pgd = pgd_offset_k(addr);
   82         do {
   83                 next = pgd_addr_end(addr, end);
   84                 if (pgd_none_or_clear_bad(pgd))
   85                         continue;
   86                 vunmap_pud_range(pgd, addr, next);
   87         } while (pgd++, addr = next, addr != end);
   88 }
   89 
   90 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
   91                 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
   92 {
   93         pte_t *pte;
   94 
   95         /*
   96          * nr is a running index into the array which helps higher level
   97          * callers keep track of where we're up to.
   98          */
   99 
  100         pte = pte_alloc_kernel(pmd, addr);
  101         if (!pte)
  102                 return -ENOMEM;
  103         do {
  104                 struct page *page = pages[*nr];
  105 
  106                 if (WARN_ON(!pte_none(*pte)))
  107                         return -EBUSY;
  108                 if (WARN_ON(!page))
  109                         return -ENOMEM;
  110                 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
  111                 (*nr)++;
  112         } while (pte++, addr += PAGE_SIZE, addr != end);
  113         return 0;
  114 }
  115 
  116 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
  117                 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  118 {
  119         pmd_t *pmd;
  120         unsigned long next;
  121 
  122         pmd = pmd_alloc(&init_mm, pud, addr);
  123         if (!pmd)
  124                 return -ENOMEM;
  125         do {
  126                 next = pmd_addr_end(addr, end);
  127                 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
  128                         return -ENOMEM;
  129         } while (pmd++, addr = next, addr != end);
  130         return 0;
  131 }
  132 
  133 static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
  134                 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  135 {
  136         pud_t *pud;
  137         unsigned long next;
  138 
  139         pud = pud_alloc(&init_mm, pgd, addr);
  140         if (!pud)
  141                 return -ENOMEM;
  142         do {
  143                 next = pud_addr_end(addr, end);
  144                 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
  145                         return -ENOMEM;
  146         } while (pud++, addr = next, addr != end);
  147         return 0;
  148 }
  149 
  150 /*
  151  * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
  152  * will have pfns corresponding to the "pages" array.
  153  *
   154  * I.e. the pte at addr + N*PAGE_SIZE shall point to the pfn corresponding to pages[N]
  155  */
  156 static int vmap_page_range_noflush(unsigned long start, unsigned long end,
  157                                    pgprot_t prot, struct page **pages)
  158 {
  159         pgd_t *pgd;
  160         unsigned long next;
  161         unsigned long addr = start;
  162         int err = 0;
  163         int nr = 0;
  164 
  165         BUG_ON(addr >= end);
  166         pgd = pgd_offset_k(addr);
  167         do {
  168                 next = pgd_addr_end(addr, end);
  169                 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
  170                 if (err)
  171                         return err;
  172         } while (pgd++, addr = next, addr != end);
  173 
  174         return nr;
  175 }
  176 
  177 static int vmap_page_range(unsigned long start, unsigned long end,
  178                            pgprot_t prot, struct page **pages)
  179 {
  180         int ret;
  181 
  182         ret = vmap_page_range_noflush(start, end, prot, pages);
  183         flush_cache_vmap(start, end);
  184         return ret;
  185 }
  186 
  187 int is_vmalloc_or_module_addr(const void *x)
  188 {
  189         /*
  190          * ARM, x86-64 and sparc64 put modules in a special place,
  191          * and fall back on vmalloc() if that fails. Others
  192          * just put it in the vmalloc space.
  193          */
  194 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
  195         unsigned long addr = (unsigned long)x;
  196         if (addr >= MODULES_VADDR && addr < MODULES_END)
  197                 return 1;
  198 #endif
  199         return is_vmalloc_addr(x);
  200 }
  201 
  202 /*
  203  * Walk a vmap address to the struct page it maps.
  204  */
  205 struct page *vmalloc_to_page(const void *vmalloc_addr)
  206 {
  207         unsigned long addr = (unsigned long) vmalloc_addr;
  208         struct page *page = NULL;
  209         pgd_t *pgd = pgd_offset_k(addr);
  210 
  211         /*
  212          * XXX we might need to change this if we add VIRTUAL_BUG_ON for
  213          * architectures that do not vmalloc module space
  214          */
  215         VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
  216 
  217         if (!pgd_none(*pgd)) {
  218                 pud_t *pud = pud_offset(pgd, addr);
  219                 if (!pud_none(*pud)) {
  220                         pmd_t *pmd = pmd_offset(pud, addr);
  221                         if (!pmd_none(*pmd)) {
  222                                 pte_t *ptep, pte;
  223 
  224                                 ptep = pte_offset_map(pmd, addr);
  225                                 pte = *ptep;
  226                                 if (pte_present(pte))
  227                                         page = pte_page(pte);
  228                                 pte_unmap(ptep);
  229                         }
  230                 }
  231         }
  232         return page;
  233 }
  234 EXPORT_SYMBOL(vmalloc_to_page);
  235 
  236 /*
  237  * Map a vmalloc()-space virtual address to the physical page frame number.
  238  */
  239 unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
  240 {
  241         return page_to_pfn(vmalloc_to_page(vmalloc_addr));
  242 }
  243 EXPORT_SYMBOL(vmalloc_to_pfn);
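
/*
 * Illustrative sketch (not part of the upstream file): a caller that already
 * holds a vmalloc()ed buffer can resolve the frame behind each of its pages
 * with vmalloc_to_page()/vmalloc_to_pfn().  The function and variable names
 * below are hypothetical.
 */
static void example_walk_vmalloc_pages(const void *buf, unsigned long size)
{
        unsigned long offset;

        for (offset = 0; offset < size; offset += PAGE_SIZE) {
                struct page *page = vmalloc_to_page(buf + offset);
                unsigned long pfn = vmalloc_to_pfn(buf + offset);

                /* page and pfn describe the frame backing buf + offset */
                (void)page;
                (void)pfn;
        }
}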
  244 
  245 
  246 /*** Global kva allocator ***/
  247 
  248 #define VM_LAZY_FREE    0x01
  249 #define VM_LAZY_FREEING 0x02
  250 #define VM_VM_AREA      0x04
  251 
  252 struct vmap_area {
  253         unsigned long va_start;
  254         unsigned long va_end;
  255         unsigned long flags;
  256         struct rb_node rb_node;         /* address sorted rbtree */
  257         struct list_head list;          /* address sorted list */
  258         struct list_head purge_list;    /* "lazy purge" list */
  259         struct vm_struct *vm;
  260         struct rcu_head rcu_head;
  261 };
  262 
  263 static DEFINE_SPINLOCK(vmap_area_lock);
  264 static LIST_HEAD(vmap_area_list);
  265 static struct rb_root vmap_area_root = RB_ROOT;
  266 
  267 /* The vmap cache globals are protected by vmap_area_lock */
  268 static struct rb_node *free_vmap_cache;
  269 static unsigned long cached_hole_size;
  270 static unsigned long cached_vstart;
  271 static unsigned long cached_align;
  272 
  273 static unsigned long vmap_area_pcpu_hole;
  274 
  275 static struct vmap_area *__find_vmap_area(unsigned long addr)
  276 {
  277         struct rb_node *n = vmap_area_root.rb_node;
  278 
  279         while (n) {
  280                 struct vmap_area *va;
  281 
  282                 va = rb_entry(n, struct vmap_area, rb_node);
  283                 if (addr < va->va_start)
  284                         n = n->rb_left;
  285                 else if (addr > va->va_start)
  286                         n = n->rb_right;
  287                 else
  288                         return va;
  289         }
  290 
  291         return NULL;
  292 }
  293 
  294 static void __insert_vmap_area(struct vmap_area *va)
  295 {
  296         struct rb_node **p = &vmap_area_root.rb_node;
  297         struct rb_node *parent = NULL;
  298         struct rb_node *tmp;
  299 
  300         while (*p) {
  301                 struct vmap_area *tmp_va;
  302 
  303                 parent = *p;
  304                 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
  305                 if (va->va_start < tmp_va->va_end)
  306                         p = &(*p)->rb_left;
  307                 else if (va->va_end > tmp_va->va_start)
  308                         p = &(*p)->rb_right;
  309                 else
  310                         BUG();
  311         }
  312 
  313         rb_link_node(&va->rb_node, parent, p);
  314         rb_insert_color(&va->rb_node, &vmap_area_root);
  315 
  316         /* address-sort this list so it is usable like the vmlist */
  317         tmp = rb_prev(&va->rb_node);
  318         if (tmp) {
  319                 struct vmap_area *prev;
  320                 prev = rb_entry(tmp, struct vmap_area, rb_node);
  321                 list_add_rcu(&va->list, &prev->list);
  322         } else
  323                 list_add_rcu(&va->list, &vmap_area_list);
  324 }
  325 
  326 static void purge_vmap_area_lazy(void);
  327 
  328 /*
  329  * Allocate a region of KVA of the specified size and alignment, within the
  330  * vstart and vend.
  331  */
  332 static struct vmap_area *alloc_vmap_area(unsigned long size,
  333                                 unsigned long align,
  334                                 unsigned long vstart, unsigned long vend,
  335                                 int node, gfp_t gfp_mask)
  336 {
  337         struct vmap_area *va;
  338         struct rb_node *n;
  339         unsigned long addr;
  340         int purged = 0;
  341         struct vmap_area *first;
  342 
  343         BUG_ON(!size);
  344         BUG_ON(size & ~PAGE_MASK);
  345         BUG_ON(!is_power_of_2(align));
  346 
  347         va = kmalloc_node(sizeof(struct vmap_area),
  348                         gfp_mask & GFP_RECLAIM_MASK, node);
  349         if (unlikely(!va))
  350                 return ERR_PTR(-ENOMEM);
  351 
  352 retry:
  353         spin_lock(&vmap_area_lock);
  354         /*
  355          * Invalidate cache if we have more permissive parameters.
  356          * cached_hole_size notes the largest hole noticed _below_
  357          * the vmap_area cached in free_vmap_cache: if size fits
  358          * into that hole, we want to scan from vstart to reuse
  359          * the hole instead of allocating above free_vmap_cache.
  360          * Note that __free_vmap_area may update free_vmap_cache
  361          * without updating cached_hole_size or cached_align.
  362          */
  363         if (!free_vmap_cache ||
  364                         size < cached_hole_size ||
  365                         vstart < cached_vstart ||
  366                         align < cached_align) {
  367 nocache:
  368                 cached_hole_size = 0;
  369                 free_vmap_cache = NULL;
  370         }
  371         /* record if we encounter less permissive parameters */
  372         cached_vstart = vstart;
  373         cached_align = align;
  374 
  375         /* find starting point for our search */
  376         if (free_vmap_cache) {
  377                 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
  378                 addr = ALIGN(first->va_end, align);
  379                 if (addr < vstart)
  380                         goto nocache;
  381                 if (addr + size - 1 < addr)
  382                         goto overflow;
  383 
  384         } else {
  385                 addr = ALIGN(vstart, align);
  386                 if (addr + size - 1 < addr)
  387                         goto overflow;
  388 
  389                 n = vmap_area_root.rb_node;
  390                 first = NULL;
  391 
  392                 while (n) {
  393                         struct vmap_area *tmp;
  394                         tmp = rb_entry(n, struct vmap_area, rb_node);
  395                         if (tmp->va_end >= addr) {
  396                                 first = tmp;
  397                                 if (tmp->va_start <= addr)
  398                                         break;
  399                                 n = n->rb_left;
  400                         } else
  401                                 n = n->rb_right;
  402                 }
  403 
  404                 if (!first)
  405                         goto found;
  406         }
  407 
  408         /* from the starting point, walk areas until a suitable hole is found */
  409         while (addr + size > first->va_start && addr + size <= vend) {
  410                 if (addr + cached_hole_size < first->va_start)
  411                         cached_hole_size = first->va_start - addr;
  412                 addr = ALIGN(first->va_end, align);
  413                 if (addr + size - 1 < addr)
  414                         goto overflow;
  415 
  416                 if (list_is_last(&first->list, &vmap_area_list))
  417                         goto found;
  418 
  419                 first = list_entry(first->list.next,
  420                                 struct vmap_area, list);
  421         }
  422 
  423 found:
  424         if (addr + size > vend)
  425                 goto overflow;
  426 
  427         va->va_start = addr;
  428         va->va_end = addr + size;
  429         va->flags = 0;
  430         __insert_vmap_area(va);
  431         free_vmap_cache = &va->rb_node;
  432         spin_unlock(&vmap_area_lock);
  433 
  434         BUG_ON(va->va_start & (align-1));
  435         BUG_ON(va->va_start < vstart);
  436         BUG_ON(va->va_end > vend);
  437 
  438         return va;
  439 
  440 overflow:
  441         spin_unlock(&vmap_area_lock);
  442         if (!purged) {
  443                 purge_vmap_area_lazy();
  444                 purged = 1;
  445                 goto retry;
  446         }
  447         if (printk_ratelimit())
  448                 printk(KERN_WARNING
  449                         "vmap allocation for size %lu failed: "
  450                         "use vmalloc=<size> to increase size.\n", size);
  451         kfree(va);
  452         return ERR_PTR(-EBUSY);
  453 }
  454 
  455 static void __free_vmap_area(struct vmap_area *va)
  456 {
  457         BUG_ON(RB_EMPTY_NODE(&va->rb_node));
  458 
  459         if (free_vmap_cache) {
  460                 if (va->va_end < cached_vstart) {
  461                         free_vmap_cache = NULL;
  462                 } else {
  463                         struct vmap_area *cache;
  464                         cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
  465                         if (va->va_start <= cache->va_start) {
  466                                 free_vmap_cache = rb_prev(&va->rb_node);
  467                                 /*
  468                                  * We don't try to update cached_hole_size or
  469                                  * cached_align, but it won't go very wrong.
  470                                  */
  471                         }
  472                 }
  473         }
  474         rb_erase(&va->rb_node, &vmap_area_root);
  475         RB_CLEAR_NODE(&va->rb_node);
  476         list_del_rcu(&va->list);
  477 
  478         /*
  479          * Track the highest possible candidate for pcpu area
   480          * allocation.  Areas outside of the vmalloc area can be returned
   481          * here too, so consider only end addresses which fall inside the
   482          * vmalloc area proper.
  483          */
  484         if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
  485                 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
  486 
  487         kfree_rcu(va, rcu_head);
  488 }
  489 
  490 /*
  491  * Free a region of KVA allocated by alloc_vmap_area
  492  */
  493 static void free_vmap_area(struct vmap_area *va)
  494 {
  495         spin_lock(&vmap_area_lock);
  496         __free_vmap_area(va);
  497         spin_unlock(&vmap_area_lock);
  498 }
  499 
  500 /*
  501  * Clear the pagetable entries of a given vmap_area
  502  */
  503 static void unmap_vmap_area(struct vmap_area *va)
  504 {
  505         vunmap_page_range(va->va_start, va->va_end);
  506 }
  507 
  508 static void vmap_debug_free_range(unsigned long start, unsigned long end)
  509 {
  510         /*
  511          * Unmap page tables and force a TLB flush immediately if
  512          * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
  513          * bugs similarly to those in linear kernel virtual address
  514          * space after a page has been freed.
  515          *
  516          * All the lazy freeing logic is still retained, in order to
  517          * minimise intrusiveness of this debugging feature.
  518          *
  519          * This is going to be *slow* (linear kernel virtual address
  520          * debugging doesn't do a broadcast TLB flush so it is a lot
  521          * faster).
  522          */
  523 #ifdef CONFIG_DEBUG_PAGEALLOC
  524         vunmap_page_range(start, end);
  525         flush_tlb_kernel_range(start, end);
  526 #endif
  527 }
  528 
  529 /*
  530  * lazy_max_pages is the maximum amount of virtual address space we gather up
  531  * before attempting to purge with a TLB flush.
  532  *
  533  * There is a tradeoff here: a larger number will cover more kernel page tables
  534  * and take slightly longer to purge, but it will linearly reduce the number of
  535  * global TLB flushes that must be performed. It would seem natural to scale
  536  * this number up linearly with the number of CPUs (because vmapping activity
  537  * could also scale linearly with the number of CPUs), however it is likely
  538  * that in practice, workloads might be constrained in other ways that mean
  539  * vmap activity will not scale linearly with CPUs. Also, I want to be
  540  * conservative and not introduce a big latency on huge systems, so go with
  541  * a less aggressive log scale. It will still be an improvement over the old
  542  * code, and it will be simple to change the scale factor if we find that it
  543  * becomes a problem on bigger systems.
  544  */
  545 static unsigned long lazy_max_pages(void)
  546 {
  547         unsigned int log;
  548 
  549         log = fls(num_online_cpus());
  550 
  551         return log * (32UL * 1024 * 1024 / PAGE_SIZE);
  552 }
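
/*
 * Worked example (illustrative, assuming 4 KiB pages): with 4 online CPUs,
 * fls(4) == 3, so lazy_max_pages() == 3 * (32 MiB / 4 KiB) == 24576 pages,
 * i.e. roughly 96 MiB of lazily freed kva may accumulate before a purge and
 * global TLB flush is triggered.
 */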
  553 
  554 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
  555 
  556 /* for per-CPU blocks */
  557 static void purge_fragmented_blocks_allcpus(void);
  558 
  559 /*
   560  * Called before a call to iounmap() if the caller wants the vm_area_struct
   561  * freed immediately.
  562  */
  563 void set_iounmap_nonlazy(void)
  564 {
  565         atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
  566 }
  567 
  568 /*
  569  * Purges all lazily-freed vmap areas.
  570  *
  571  * If sync is 0 then don't purge if there is already a purge in progress.
  572  * If force_flush is 1, then flush kernel TLBs between *start and *end even
  573  * if we found no lazy vmap areas to unmap (callers can use this to optimise
  574  * their own TLB flushing).
  575  * Returns with *start = min(*start, lowest purged address)
  576  *              *end = max(*end, highest purged address)
  577  */
  578 static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
  579                                         int sync, int force_flush)
  580 {
  581         static DEFINE_SPINLOCK(purge_lock);
  582         LIST_HEAD(valist);
  583         struct vmap_area *va;
  584         struct vmap_area *n_va;
  585         int nr = 0;
  586 
  587         /*
  588          * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
  589          * should not expect such behaviour. This just simplifies locking for
  590          * the case that isn't actually used at the moment anyway.
  591          */
  592         if (!sync && !force_flush) {
  593                 if (!spin_trylock(&purge_lock))
  594                         return;
  595         } else
  596                 spin_lock(&purge_lock);
  597 
  598         if (sync)
  599                 purge_fragmented_blocks_allcpus();
  600 
  601         rcu_read_lock();
  602         list_for_each_entry_rcu(va, &vmap_area_list, list) {
  603                 if (va->flags & VM_LAZY_FREE) {
  604                         if (va->va_start < *start)
  605                                 *start = va->va_start;
  606                         if (va->va_end > *end)
  607                                 *end = va->va_end;
  608                         nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
  609                         list_add_tail(&va->purge_list, &valist);
  610                         va->flags |= VM_LAZY_FREEING;
  611                         va->flags &= ~VM_LAZY_FREE;
  612                 }
  613         }
  614         rcu_read_unlock();
  615 
  616         if (nr)
  617                 atomic_sub(nr, &vmap_lazy_nr);
  618 
  619         if (nr || force_flush)
  620                 flush_tlb_kernel_range(*start, *end);
  621 
  622         if (nr) {
  623                 spin_lock(&vmap_area_lock);
  624                 list_for_each_entry_safe(va, n_va, &valist, purge_list)
  625                         __free_vmap_area(va);
  626                 spin_unlock(&vmap_area_lock);
  627         }
  628         spin_unlock(&purge_lock);
  629 }
  630 
  631 /*
  632  * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
  633  * is already purging.
  634  */
  635 static void try_purge_vmap_area_lazy(void)
  636 {
  637         unsigned long start = ULONG_MAX, end = 0;
  638 
  639         __purge_vmap_area_lazy(&start, &end, 0, 0);
  640 }
  641 
  642 /*
  643  * Kick off a purge of the outstanding lazy areas.
  644  */
  645 static void purge_vmap_area_lazy(void)
  646 {
  647         unsigned long start = ULONG_MAX, end = 0;
  648 
  649         __purge_vmap_area_lazy(&start, &end, 1, 0);
  650 }
  651 
  652 /*
  653  * Free a vmap area, caller ensuring that the area has been unmapped
   654  * and that flush_cache_vunmap has been called for the correct range
  655  * previously.
  656  */
  657 static void free_vmap_area_noflush(struct vmap_area *va)
  658 {
  659         va->flags |= VM_LAZY_FREE;
  660         atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
  661         if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
  662                 try_purge_vmap_area_lazy();
  663 }
  664 
  665 /*
   666  * Free and unmap a vmap area, caller ensuring flush_cache_vunmap has been
  667  * called for the correct range previously.
  668  */
  669 static void free_unmap_vmap_area_noflush(struct vmap_area *va)
  670 {
  671         unmap_vmap_area(va);
  672         free_vmap_area_noflush(va);
  673 }
  674 
  675 /*
  676  * Free and unmap a vmap area
  677  */
  678 static void free_unmap_vmap_area(struct vmap_area *va)
  679 {
  680         flush_cache_vunmap(va->va_start, va->va_end);
  681         free_unmap_vmap_area_noflush(va);
  682 }
  683 
  684 static struct vmap_area *find_vmap_area(unsigned long addr)
  685 {
  686         struct vmap_area *va;
  687 
  688         spin_lock(&vmap_area_lock);
  689         va = __find_vmap_area(addr);
  690         spin_unlock(&vmap_area_lock);
  691 
  692         return va;
  693 }
  694 
  695 static void free_unmap_vmap_area_addr(unsigned long addr)
  696 {
  697         struct vmap_area *va;
  698 
  699         va = find_vmap_area(addr);
  700         BUG_ON(!va);
  701         free_unmap_vmap_area(va);
  702 }
  703 
  704 
  705 /*** Per cpu kva allocator ***/
  706 
  707 /*
  708  * vmap space is limited especially on 32 bit architectures. Ensure there is
  709  * room for at least 16 percpu vmap blocks per CPU.
  710  */
  711 /*
  712  * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
   713  * to #define VMALLOC_SPACE             (VMALLOC_END-VMALLOC_START). Since we
   714  * don't, guess instead (we just need a rough idea).
  715  */
  716 #if BITS_PER_LONG == 32
  717 #define VMALLOC_SPACE           (128UL*1024*1024)
  718 #else
  719 #define VMALLOC_SPACE           (128UL*1024*1024*1024)
  720 #endif
  721 
  722 #define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
  723 #define VMAP_MAX_ALLOC          BITS_PER_LONG   /* 256K with 4K pages */
  724 #define VMAP_BBMAP_BITS_MAX     1024    /* 4MB with 4K pages */
  725 #define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
  726 #define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
  727 #define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
  728 #define VMAP_BBMAP_BITS         \
  729                 VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
  730                 VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
  731                         VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
  732 
  733 #define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
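
/*
 * Worked example (illustrative, assuming 4 KiB pages): on a 64-bit kernel the
 * guessed VMALLOC_SPACE is 128 GiB, so VMALLOC_PAGES == 33554432.  With
 * NR_CPUS == 16 that yields 33554432 / 16 / 16 == 131072 bits, which is then
 * clamped to VMAP_BBMAP_BITS_MAX == 1024, giving VMAP_BLOCK_SIZE == 4 MiB.
 */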
  734 
  735 static bool vmap_initialized __read_mostly = false;
  736 
  737 struct vmap_block_queue {
  738         spinlock_t lock;
  739         struct list_head free;
  740 };
  741 
  742 struct vmap_block {
  743         spinlock_t lock;
  744         struct vmap_area *va;
  745         struct vmap_block_queue *vbq;
  746         unsigned long free, dirty;
  747         DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
  748         DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
  749         struct list_head free_list;
  750         struct rcu_head rcu_head;
  751         struct list_head purge;
  752 };
  753 
  754 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
  755 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
  756 
  757 /*
  758  * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
  759  * in the free path. Could get rid of this if we change the API to return a
  760  * "cookie" from alloc, to be passed to free. But no big deal yet.
  761  */
  762 static DEFINE_SPINLOCK(vmap_block_tree_lock);
  763 static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
  764 
  765 /*
  766  * We should probably have a fallback mechanism to allocate virtual memory
  767  * out of partially filled vmap blocks. However vmap block sizing should be
  768  * fairly reasonable according to the vmalloc size, so it shouldn't be a
  769  * big problem.
  770  */
  771 
  772 static unsigned long addr_to_vb_idx(unsigned long addr)
  773 {
  774         addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
  775         addr /= VMAP_BLOCK_SIZE;
  776         return addr;
  777 }
  778 
  779 static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
  780 {
  781         struct vmap_block_queue *vbq;
  782         struct vmap_block *vb;
  783         struct vmap_area *va;
  784         unsigned long vb_idx;
  785         int node, err;
  786 
  787         node = numa_node_id();
  788 
  789         vb = kmalloc_node(sizeof(struct vmap_block),
  790                         gfp_mask & GFP_RECLAIM_MASK, node);
  791         if (unlikely(!vb))
  792                 return ERR_PTR(-ENOMEM);
  793 
  794         va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
  795                                         VMALLOC_START, VMALLOC_END,
  796                                         node, gfp_mask);
  797         if (IS_ERR(va)) {
  798                 kfree(vb);
  799                 return ERR_CAST(va);
  800         }
  801 
  802         err = radix_tree_preload(gfp_mask);
  803         if (unlikely(err)) {
  804                 kfree(vb);
  805                 free_vmap_area(va);
  806                 return ERR_PTR(err);
  807         }
  808 
  809         spin_lock_init(&vb->lock);
  810         vb->va = va;
  811         vb->free = VMAP_BBMAP_BITS;
  812         vb->dirty = 0;
  813         bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
  814         bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
  815         INIT_LIST_HEAD(&vb->free_list);
  816 
  817         vb_idx = addr_to_vb_idx(va->va_start);
  818         spin_lock(&vmap_block_tree_lock);
  819         err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
  820         spin_unlock(&vmap_block_tree_lock);
  821         BUG_ON(err);
  822         radix_tree_preload_end();
  823 
  824         vbq = &get_cpu_var(vmap_block_queue);
  825         vb->vbq = vbq;
  826         spin_lock(&vbq->lock);
  827         list_add_rcu(&vb->free_list, &vbq->free);
  828         spin_unlock(&vbq->lock);
  829         put_cpu_var(vmap_block_queue);
  830 
  831         return vb;
  832 }
  833 
  834 static void free_vmap_block(struct vmap_block *vb)
  835 {
  836         struct vmap_block *tmp;
  837         unsigned long vb_idx;
  838 
  839         vb_idx = addr_to_vb_idx(vb->va->va_start);
  840         spin_lock(&vmap_block_tree_lock);
  841         tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
  842         spin_unlock(&vmap_block_tree_lock);
  843         BUG_ON(tmp != vb);
  844 
  845         free_vmap_area_noflush(vb->va);
  846         kfree_rcu(vb, rcu_head);
  847 }
  848 
  849 static void purge_fragmented_blocks(int cpu)
  850 {
  851         LIST_HEAD(purge);
  852         struct vmap_block *vb;
  853         struct vmap_block *n_vb;
  854         struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
  855 
  856         rcu_read_lock();
  857         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
  858 
  859                 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
  860                         continue;
  861 
  862                 spin_lock(&vb->lock);
  863                 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
  864                         vb->free = 0; /* prevent further allocs after releasing lock */
  865                         vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
  866                         bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
  867                         bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
  868                         spin_lock(&vbq->lock);
  869                         list_del_rcu(&vb->free_list);
  870                         spin_unlock(&vbq->lock);
  871                         spin_unlock(&vb->lock);
  872                         list_add_tail(&vb->purge, &purge);
  873                 } else
  874                         spin_unlock(&vb->lock);
  875         }
  876         rcu_read_unlock();
  877 
  878         list_for_each_entry_safe(vb, n_vb, &purge, purge) {
  879                 list_del(&vb->purge);
  880                 free_vmap_block(vb);
  881         }
  882 }
  883 
  884 static void purge_fragmented_blocks_thiscpu(void)
  885 {
  886         purge_fragmented_blocks(smp_processor_id());
  887 }
  888 
  889 static void purge_fragmented_blocks_allcpus(void)
  890 {
  891         int cpu;
  892 
  893         for_each_possible_cpu(cpu)
  894                 purge_fragmented_blocks(cpu);
  895 }
  896 
  897 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
  898 {
  899         struct vmap_block_queue *vbq;
  900         struct vmap_block *vb;
  901         unsigned long addr = 0;
  902         unsigned int order;
  903         int purge = 0;
  904 
  905         BUG_ON(size & ~PAGE_MASK);
  906         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
  907         if (WARN_ON(size == 0)) {
  908                 /*
   909                  * Allocating 0 bytes isn't what the caller wants, since
   910                  * get_order(0) returns a funny result. Just warn and terminate
  911                  * early.
  912                  */
  913                 return NULL;
  914         }
  915         order = get_order(size);
  916 
  917 again:
  918         rcu_read_lock();
  919         vbq = &get_cpu_var(vmap_block_queue);
  920         list_for_each_entry_rcu(vb, &vbq->free, free_list) {
  921                 int i;
  922 
  923                 spin_lock(&vb->lock);
  924                 if (vb->free < 1UL << order)
  925                         goto next;
  926 
  927                 i = bitmap_find_free_region(vb->alloc_map,
  928                                                 VMAP_BBMAP_BITS, order);
  929 
  930                 if (i < 0) {
  931                         if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
  932                                 /* fragmented and no outstanding allocations */
  933                                 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
  934                                 purge = 1;
  935                         }
  936                         goto next;
  937                 }
  938                 addr = vb->va->va_start + (i << PAGE_SHIFT);
  939                 BUG_ON(addr_to_vb_idx(addr) !=
  940                                 addr_to_vb_idx(vb->va->va_start));
  941                 vb->free -= 1UL << order;
  942                 if (vb->free == 0) {
  943                         spin_lock(&vbq->lock);
  944                         list_del_rcu(&vb->free_list);
  945                         spin_unlock(&vbq->lock);
  946                 }
  947                 spin_unlock(&vb->lock);
  948                 break;
  949 next:
  950                 spin_unlock(&vb->lock);
  951         }
  952 
  953         if (purge)
  954                 purge_fragmented_blocks_thiscpu();
  955 
  956         put_cpu_var(vmap_block_queue);
  957         rcu_read_unlock();
  958 
  959         if (!addr) {
  960                 vb = new_vmap_block(gfp_mask);
  961                 if (IS_ERR(vb))
  962                         return vb;
  963                 goto again;
  964         }
  965 
  966         return (void *)addr;
  967 }
  968 
  969 static void vb_free(const void *addr, unsigned long size)
  970 {
  971         unsigned long offset;
  972         unsigned long vb_idx;
  973         unsigned int order;
  974         struct vmap_block *vb;
  975 
  976         BUG_ON(size & ~PAGE_MASK);
  977         BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
  978 
  979         flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
  980 
  981         order = get_order(size);
  982 
  983         offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
  984 
  985         vb_idx = addr_to_vb_idx((unsigned long)addr);
  986         rcu_read_lock();
  987         vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
  988         rcu_read_unlock();
  989         BUG_ON(!vb);
  990 
  991         vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
  992 
  993         spin_lock(&vb->lock);
  994         BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
  995 
  996         vb->dirty += 1UL << order;
  997         if (vb->dirty == VMAP_BBMAP_BITS) {
  998                 BUG_ON(vb->free);
  999                 spin_unlock(&vb->lock);
 1000                 free_vmap_block(vb);
 1001         } else
 1002                 spin_unlock(&vb->lock);
 1003 }
 1004 
 1005 /**
 1006  * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 1007  *
 1008  * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 1009  * to amortize TLB flushing overheads. What this means is that any page you
 1010  * have now, may, in a former life, have been mapped into kernel virtual
  1011  * have now may, in a former life, have been mapped into a kernel virtual
  1012  * address by the vmap layer, and so there might be some CPUs with TLB entries
 1013  *
 1014  * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 1015  * be sure that none of the pages we have control over will have any aliases
 1016  * from the vmap layer.
 1017  */
 1018 void vm_unmap_aliases(void)
 1019 {
 1020         unsigned long start = ULONG_MAX, end = 0;
 1021         int cpu;
 1022         int flush = 0;
 1023 
 1024         if (unlikely(!vmap_initialized))
 1025                 return;
 1026 
 1027         for_each_possible_cpu(cpu) {
 1028                 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
 1029                 struct vmap_block *vb;
 1030 
 1031                 rcu_read_lock();
 1032                 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
 1033                         int i;
 1034 
 1035                         spin_lock(&vb->lock);
 1036                         i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
 1037                         while (i < VMAP_BBMAP_BITS) {
 1038                                 unsigned long s, e;
 1039                                 int j;
 1040                                 j = find_next_zero_bit(vb->dirty_map,
 1041                                         VMAP_BBMAP_BITS, i);
 1042 
 1043                                 s = vb->va->va_start + (i << PAGE_SHIFT);
 1044                                 e = vb->va->va_start + (j << PAGE_SHIFT);
 1045                                 flush = 1;
 1046 
 1047                                 if (s < start)
 1048                                         start = s;
 1049                                 if (e > end)
 1050                                         end = e;
 1051 
 1052                                 i = j;
 1053                                 i = find_next_bit(vb->dirty_map,
 1054                                                         VMAP_BBMAP_BITS, i);
 1055                         }
 1056                         spin_unlock(&vb->lock);
 1057                 }
 1058                 rcu_read_unlock();
 1059         }
 1060 
 1061         __purge_vmap_area_lazy(&start, &end, 1, flush);
 1062 }
 1063 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
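
/*
 * Illustrative sketch (not part of the upstream file): code that is about to
 * change the attributes of pages it owns, or hand them to another agent, can
 * call vm_unmap_aliases() first so that no lazily kept vmap aliases of those
 * pages survive in any CPU's TLB.  The function name is hypothetical.
 */
static void example_retire_pages(void)
{
        /* Flush all lazy vmap aliases before the pages change hands. */
        vm_unmap_aliases();

        /* ... from here on, only the regular 1:1 mapping references them ... */
}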
 1064 
 1065 /**
 1066  * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 1067  * @mem: the pointer returned by vm_map_ram
 1068  * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 1069  */
 1070 void vm_unmap_ram(const void *mem, unsigned int count)
 1071 {
 1072         unsigned long size = count << PAGE_SHIFT;
 1073         unsigned long addr = (unsigned long)mem;
 1074 
 1075         BUG_ON(!addr);
 1076         BUG_ON(addr < VMALLOC_START);
 1077         BUG_ON(addr > VMALLOC_END);
 1078         BUG_ON(addr & (PAGE_SIZE-1));
 1079 
 1080         debug_check_no_locks_freed(mem, size);
 1081         vmap_debug_free_range(addr, addr+size);
 1082 
 1083         if (likely(count <= VMAP_MAX_ALLOC))
 1084                 vb_free(mem, size);
 1085         else
 1086                 free_unmap_vmap_area_addr(addr);
 1087 }
 1088 EXPORT_SYMBOL(vm_unmap_ram);
 1089 
 1090 /**
 1091  * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 1092  * @pages: an array of pointers to the pages to be mapped
 1093  * @count: number of pages
 1094  * @node: prefer to allocate data structures on this node
 1095  * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 1096  *
 1097  * Returns: a pointer to the address that has been mapped, or %NULL on failure
 1098  */
 1099 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 1100 {
 1101         unsigned long size = count << PAGE_SHIFT;
 1102         unsigned long addr;
 1103         void *mem;
 1104 
 1105         if (likely(count <= VMAP_MAX_ALLOC)) {
 1106                 mem = vb_alloc(size, GFP_KERNEL);
 1107                 if (IS_ERR(mem))
 1108                         return NULL;
 1109                 addr = (unsigned long)mem;
 1110         } else {
 1111                 struct vmap_area *va;
 1112                 va = alloc_vmap_area(size, PAGE_SIZE,
 1113                                 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
 1114                 if (IS_ERR(va))
 1115                         return NULL;
 1116 
 1117                 addr = va->va_start;
 1118                 mem = (void *)addr;
 1119         }
 1120         if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
 1121                 vm_unmap_ram(mem, count);
 1122                 return NULL;
 1123         }
 1124         return mem;
 1125 }
 1126 EXPORT_SYMBOL(vm_map_ram);
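
/*
 * Illustrative sketch (not part of the upstream file): map a small page array
 * with vm_map_ram() and tear the mapping down again with vm_unmap_ram().
 * The function name is hypothetical and error handling is minimal.
 */
static int example_use_pages(struct page **pages, unsigned int count)
{
        void *addr;

        /* Counts up to VMAP_MAX_ALLOC take the fast per-cpu block path. */
        addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
        if (!addr)
                return -ENOMEM;

        memset(addr, 0, (size_t)count << PAGE_SHIFT);   /* use the mapping */

        /* The same count must be passed back; partial unmaps are not allowed. */
        vm_unmap_ram(addr, count);
        return 0;
}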
 1127 
 1128 /**
 1129  * vm_area_add_early - add vmap area early during boot
 1130  * @vm: vm_struct to add
 1131  *
  1132  * This function is used to add a fixed kernel vm area to vmlist before
 1133  * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 1134  * should contain proper values and the other fields should be zero.
 1135  *
 1136  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 1137  */
 1138 void __init vm_area_add_early(struct vm_struct *vm)
 1139 {
 1140         struct vm_struct *tmp, **p;
 1141 
 1142         BUG_ON(vmap_initialized);
 1143         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
 1144                 if (tmp->addr >= vm->addr) {
 1145                         BUG_ON(tmp->addr < vm->addr + vm->size);
 1146                         break;
 1147                 } else
 1148                         BUG_ON(tmp->addr + tmp->size > vm->addr);
 1149         }
 1150         vm->next = *p;
 1151         *p = vm;
 1152 }
 1153 
 1154 /**
 1155  * vm_area_register_early - register vmap area early during boot
 1156  * @vm: vm_struct to register
 1157  * @align: requested alignment
 1158  *
 1159  * This function is used to register kernel vm area before
 1160  * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 1161  * proper values on entry and other fields should be zero.  On return,
 1162  * vm->addr contains the allocated address.
 1163  *
 1164  * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 1165  */
 1166 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
 1167 {
 1168         static size_t vm_init_off __initdata;
 1169         unsigned long addr;
 1170 
 1171         addr = ALIGN(VMALLOC_START + vm_init_off, align);
 1172         vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
 1173 
 1174         vm->addr = (void *)addr;
 1175 
 1176         vm_area_add_early(vm);
 1177 }
 1178 
 1179 void __init vmalloc_init(void)
 1180 {
 1181         struct vmap_area *va;
 1182         struct vm_struct *tmp;
 1183         int i;
 1184 
 1185         for_each_possible_cpu(i) {
 1186                 struct vmap_block_queue *vbq;
 1187 
 1188                 vbq = &per_cpu(vmap_block_queue, i);
 1189                 spin_lock_init(&vbq->lock);
 1190                 INIT_LIST_HEAD(&vbq->free);
 1191         }
 1192 
 1193         /* Import existing vmlist entries. */
 1194         for (tmp = vmlist; tmp; tmp = tmp->next) {
 1195                 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
 1196                 va->flags = VM_VM_AREA;
 1197                 va->va_start = (unsigned long)tmp->addr;
 1198                 va->va_end = va->va_start + tmp->size;
 1199                 va->vm = tmp;
 1200                 __insert_vmap_area(va);
 1201         }
 1202 
 1203         vmap_area_pcpu_hole = VMALLOC_END;
 1204 
 1205         vmap_initialized = true;
 1206 }
 1207 
 1208 /**
 1209  * map_kernel_range_noflush - map kernel VM area with the specified pages
 1210  * @addr: start of the VM area to map
 1211  * @size: size of the VM area to map
 1212  * @prot: page protection flags to use
 1213  * @pages: pages to map
 1214  *
 1215  * Map PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 1216  * specify should have been allocated using get_vm_area() and its
 1217  * friends.
 1218  *
 1219  * NOTE:
 1220  * This function does NOT do any cache flushing.  The caller is
 1221  * responsible for calling flush_cache_vmap() on to-be-mapped areas
 1222  * before calling this function.
 1223  *
 1224  * RETURNS:
 1225  * The number of pages mapped on success, -errno on failure.
 1226  */
 1227 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 1228                              pgprot_t prot, struct page **pages)
 1229 {
 1230         return vmap_page_range_noflush(addr, addr + size, prot, pages);
 1231 }
 1232 
 1233 /**
 1234  * unmap_kernel_range_noflush - unmap kernel VM area
 1235  * @addr: start of the VM area to unmap
 1236  * @size: size of the VM area to unmap
 1237  *
 1238  * Unmap PFN_UP(@size) pages at @addr.  The VM area @addr and @size
 1239  * specify should have been allocated using get_vm_area() and its
 1240  * friends.
 1241  *
 1242  * NOTE:
 1243  * This function does NOT do any cache flushing.  The caller is
  1244  * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
 1245  * before calling this function and flush_tlb_kernel_range() after.
 1246  */
 1247 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 1248 {
 1249         vunmap_page_range(addr, addr + size);
 1250 }
 1251 EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 1252 
 1253 /**
 1254  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
 1255  * @addr: start of the VM area to unmap
 1256  * @size: size of the VM area to unmap
 1257  *
  1258  * Similar to unmap_kernel_range_noflush() but flushes the virtual cache
  1259  * before the unmapping and the TLB after.
 1260  */
 1261 void unmap_kernel_range(unsigned long addr, unsigned long size)
 1262 {
 1263         unsigned long end = addr + size;
 1264 
 1265         flush_cache_vunmap(addr, end);
 1266         vunmap_page_range(addr, end);
 1267         flush_tlb_kernel_range(addr, end);
 1268 }
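
/*
 * Illustrative sketch (not part of the upstream file): reserve kernel virtual
 * address space with get_vm_area(), back it with pages through
 * map_kernel_range_noflush(), and tear it down again.  The names below are
 * hypothetical; the flush ordering mirrors vmap_page_range() above.
 */
static int example_map_kernel_range(struct page **pages, unsigned int count)
{
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        struct vm_struct *area;
        unsigned long addr;
        int ret;

        area = get_vm_area(size, VM_MAP);
        if (!area)
                return -ENOMEM;
        addr = (unsigned long)area->addr;

        ret = map_kernel_range_noflush(addr, size, PAGE_KERNEL, pages);
        if (ret < 0) {
                free_vm_area(area);
                return ret;
        }
        /* The _noflush variant leaves cache flushing to the caller. */
        flush_cache_vmap(addr, addr + size);

        /* ... use the mapping at area->addr ... */

        /* unmap_kernel_range() flushes the cache and the TLB for us. */
        unmap_kernel_range(addr, size);
        /* Releasing the area unmaps again (harmlessly) and frees the range. */
        free_vm_area(area);
        return 0;
}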
 1269 
 1270 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 1271 {
 1272         unsigned long addr = (unsigned long)area->addr;
 1273         unsigned long end = addr + area->size - PAGE_SIZE;
 1274         int err;
 1275 
 1276         err = vmap_page_range(addr, end, prot, *pages);
 1277         if (err > 0) {
 1278                 *pages += err;
 1279                 err = 0;
 1280         }
 1281 
 1282         return err;
 1283 }
 1284 EXPORT_SYMBOL_GPL(map_vm_area);
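
/*
 * Illustrative sketch (not part of the upstream file): map_vm_area() takes a
 * pointer to the page-pointer array and advances it past the entries it
 * consumed, so callers pass a temporary cursor.  The names below are
 * hypothetical.
 */
static int example_map_vm_area(struct page **pages, unsigned int count)
{
        struct page **cursor = pages;
        struct vm_struct *area;

        area = get_vm_area((unsigned long)count << PAGE_SHIFT, VM_MAP);
        if (!area)
                return -ENOMEM;

        if (map_vm_area(area, PAGE_KERNEL, &cursor)) {
                free_vm_area(area);
                return -ENOMEM;
        }
        /* cursor now points count entries past pages */

        /* ... use area->addr; vunmap() later unmaps and frees the area ... */
        vunmap(area->addr);
        return 0;
}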
 1285 
 1286 /*** Old vmalloc interfaces ***/
 1287 DEFINE_RWLOCK(vmlist_lock);
 1288 struct vm_struct *vmlist;
 1289 
 1290 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 1291                               unsigned long flags, const void *caller)
 1292 {
 1293         vm->flags = flags;
 1294         vm->addr = (void *)va->va_start;
 1295         vm->size = va->va_end - va->va_start;
 1296         vm->caller = caller;
 1297         va->vm = vm;
 1298         va->flags |= VM_VM_AREA;
 1299 }
 1300 
 1301 static void insert_vmalloc_vmlist(struct vm_struct *vm)
 1302 {
 1303         struct vm_struct *tmp, **p;
 1304 
 1305         vm->flags &= ~VM_UNLIST;
 1306         write_lock(&vmlist_lock);
 1307         for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
 1308                 if (tmp->addr >= vm->addr)
 1309                         break;
 1310         }
 1311         vm->next = *p;
 1312         *p = vm;
 1313         write_unlock(&vmlist_lock);
 1314 }
 1315 
 1316 static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 1317                               unsigned long flags, const void *caller)
 1318 {
 1319         setup_vmalloc_vm(vm, va, flags, caller);
 1320         insert_vmalloc_vmlist(vm);
 1321 }
 1322 
 1323 static struct vm_struct *__get_vm_area_node(unsigned long size,
 1324                 unsigned long align, unsigned long flags, unsigned long start,
 1325                 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
 1326 {
 1327         struct vmap_area *va;
 1328         struct vm_struct *area;
 1329 
 1330         BUG_ON(in_interrupt());
 1331         if (flags & VM_IOREMAP) {
 1332                 int bit = fls(size);
 1333 
 1334                 if (bit > IOREMAP_MAX_ORDER)
 1335                         bit = IOREMAP_MAX_ORDER;
 1336                 else if (bit < PAGE_SHIFT)
 1337                         bit = PAGE_SHIFT;
 1338 
 1339                 align = 1ul << bit;
 1340         }
 1341 
 1342         size = PAGE_ALIGN(size);
 1343         if (unlikely(!size))
 1344                 return NULL;
 1345 
 1346         area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
 1347         if (unlikely(!area))
 1348                 return NULL;
 1349 
 1350         /*
 1351          * We always allocate a guard page.
 1352          */
 1353         size += PAGE_SIZE;
 1354 
 1355         va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
 1356         if (IS_ERR(va)) {
 1357                 kfree(area);
 1358                 return NULL;
 1359         }
 1360 
 1361         /*
 1362          * When this function is called from __vmalloc_node_range,
 1363          * we do not add vm_struct to vmlist here to avoid
 1364          * accessing uninitialized members of vm_struct such as
 1365          * pages and nr_pages fields. They will be set later.
 1366          * To distinguish it from others, we use a VM_UNLIST flag.
 1367          */
 1368         if (flags & VM_UNLIST)
 1369                 setup_vmalloc_vm(area, va, flags, caller);
 1370         else
 1371                 insert_vmalloc_vm(area, va, flags, caller);
 1372 
 1373         return area;
 1374 }
 1375 
 1376 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
 1377                                 unsigned long start, unsigned long end)
 1378 {
 1379         return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
 1380                                                 __builtin_return_address(0));
 1381 }
 1382 EXPORT_SYMBOL_GPL(__get_vm_area);
 1383 
 1384 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
 1385                                        unsigned long start, unsigned long end,
 1386                                        const void *caller)
 1387 {
 1388         return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
 1389                                   caller);
 1390 }
 1391 
 1392 /**
 1393  *      get_vm_area  -  reserve a contiguous kernel virtual area
 1394  *      @size:          size of the area
 1395  *      @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 1396  *
 1397  *      Search an area of @size in the kernel virtual mapping area,
  1398  *      and reserve it for our purposes.  Returns the area descriptor
 1399  *      on success or %NULL on failure.
 1400  */
 1401 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
 1402 {
 1403         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
 1404                                 -1, GFP_KERNEL, __builtin_return_address(0));
 1405 }
 1406 
 1407 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
 1408                                 const void *caller)
 1409 {
 1410         return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
 1411                                                 -1, GFP_KERNEL, caller);
 1412 }
 1413 
 1414 /**
 1415  *      find_vm_area  -  find a continuous kernel virtual area
 1416  *      @addr:          base address
 1417  *
 1418  *      Search for the kernel VM area starting at @addr, and return it.
 1419  *      It is up to the caller to do all required locking to keep the returned
 1420  *      pointer valid.
 1421  */
 1422 struct vm_struct *find_vm_area(const void *addr)
 1423 {
 1424         struct vmap_area *va;
 1425 
 1426         va = find_vmap_area((unsigned long)addr);
 1427         if (va && va->flags & VM_VM_AREA)
 1428                 return va->vm;
 1429 
 1430         return NULL;
 1431 }
 1432 
 1433 /**
 1434  *      remove_vm_area  -  find and remove a continuous kernel virtual area
 1435  *      @addr:          base address
 1436  *
 1437  *      Search for the kernel VM area starting at @addr, and remove it.
 1438  *      This function returns the found VM area, but using it is NOT safe
 1439  *      on SMP machines, except for its size or flags.
 1440  */
 1441 struct vm_struct *remove_vm_area(const void *addr)
 1442 {
 1443         struct vmap_area *va;
 1444 
 1445         va = find_vmap_area((unsigned long)addr);
 1446         if (va && va->flags & VM_VM_AREA) {
 1447                 struct vm_struct *vm = va->vm;
 1448 
 1449                 if (!(vm->flags & VM_UNLIST)) {
 1450                         struct vm_struct *tmp, **p;
 1451                         /*
 1452                          * remove from list and disallow access to
 1453                          * this vm_struct before unmap. (address range
  1454  * conflict avoidance is handled by vmap.)
 1455                          */
 1456                         write_lock(&vmlist_lock);
 1457                         for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
 1458                                 ;
 1459                         *p = tmp->next;
 1460                         write_unlock(&vmlist_lock);
 1461                 }
 1462 
 1463                 vmap_debug_free_range(va->va_start, va->va_end);
 1464                 free_unmap_vmap_area(va);
 1465                 vm->size -= PAGE_SIZE;
 1466 
 1467                 return vm;
 1468         }
 1469         return NULL;
 1470 }
 1471 
 1472 static void __vunmap(const void *addr, int deallocate_pages)
 1473 {
 1474         struct vm_struct *area;
 1475 
 1476         if (!addr)
 1477                 return;
 1478 
 1479         if ((PAGE_SIZE-1) & (unsigned long)addr) {
 1480                 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
 1481                 return;
 1482         }
 1483 
 1484         area = remove_vm_area(addr);
 1485         if (unlikely(!area)) {
 1486                 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
 1487                                 addr);
 1488                 return;
 1489         }
 1490 
 1491         debug_check_no_locks_freed(addr, area->size);
 1492         debug_check_no_obj_freed(addr, area->size);
 1493 
 1494         if (deallocate_pages) {
 1495                 int i;
 1496 
 1497                 for (i = 0; i < area->nr_pages; i++) {
 1498                         struct page *page = area->pages[i];
 1499 
 1500                         BUG_ON(!page);
 1501                         __free_page(page);
 1502                 }
 1503 
 1504                 if (area->flags & VM_VPAGES)
 1505                         vfree(area->pages);
 1506                 else
 1507                         kfree(area->pages);
 1508         }
 1509 
 1510         kfree(area);
 1511         return;
 1512 }
 1513 
 1514 /**
 1515  *      vfree  -  release memory allocated by vmalloc()
 1516  *      @addr:          memory base address
 1517  *
 1518  *      Free the virtually contiguous memory area starting at @addr, as
 1519  *      obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 1520  *      NULL, no operation is performed.
 1521  *
 1522  *      Must not be called in interrupt context.
 1523  */
 1524 void vfree(const void *addr)
 1525 {
 1526         BUG_ON(in_interrupt());
 1527 
 1528         kmemleak_free(addr);
 1529 
 1530         __vunmap(addr, 1);
 1531 }
 1532 EXPORT_SYMBOL(vfree);
 1533 
 1534 /**
 1535  *      vunmap  -  release virtual mapping obtained by vmap()
 1536  *      @addr:          memory base address
 1537  *
 1538  *      Free the virtually contiguous memory area starting at @addr,
 1539  *      which was created from the page array passed to vmap().
 1540  *
 1541  *      Must not be called in interrupt context.
 1542  */
 1543 void vunmap(const void *addr)
 1544 {
 1545         BUG_ON(in_interrupt());
 1546         might_sleep();
 1547         __vunmap(addr, 0);
 1548 }
 1549 EXPORT_SYMBOL(vunmap);
 1550 
 1551 /**
 1552  *      vmap  -  map an array of pages into virtually contiguous space
 1553  *      @pages:         array of page pointers
 1554  *      @count:         number of pages to map
 1555  *      @flags:         vm_area->flags
 1556  *      @prot:          page protection for the mapping
 1557  *
 1558  *      Maps @count pages from @pages into contiguous kernel virtual
 1559  *      space.
 1560  */
 1561 void *vmap(struct page **pages, unsigned int count,
 1562                 unsigned long flags, pgprot_t prot)
 1563 {
 1564         struct vm_struct *area;
 1565 
 1566         might_sleep();
 1567 
 1568         if (count > totalram_pages)
 1569                 return NULL;
 1570 
 1571         area = get_vm_area_caller((count << PAGE_SHIFT), flags,
 1572                                         __builtin_return_address(0));
 1573         if (!area)
 1574                 return NULL;
 1575 
 1576         if (map_vm_area(area, prot, &pages)) {
 1577                 vunmap(area->addr);
 1578                 return NULL;
 1579         }
 1580 
 1581         return area->addr;
 1582 }
 1583 EXPORT_SYMBOL(vmap);
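
/*
 * A minimal sketch of the vmap()/vunmap() pairing (hypothetical helper):
 * allocate @count independent pages, view them through one contiguous
 * kernel address, then tear everything down again.  The pages stay owned
 * by the caller; vunmap() only removes the mapping.
 */
static int example_vmap_roundtrip(unsigned int count)
{
        struct page **pages;
        void *va;
        unsigned int i;
        int ret = -ENOMEM;

        pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return ret;

        for (i = 0; i < count; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i])
                        goto out;
        }

        /* One contiguous kernel VA range covering the scattered pages. */
        va = vmap(pages, count, VM_MAP, PAGE_KERNEL);
        if (!va)
                goto out;

        memset(va, 0, count * PAGE_SIZE);       /* use the contiguous view */
        vunmap(va);                             /* drop the mapping only */
        ret = 0;
out:
        for (i = 0; i < count; i++)
                if (pages[i])
                        __free_page(pages[i]);  /* pages are still ours */
        kfree(pages);
        return ret;
}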
 1584 
 1585 static void *__vmalloc_node(unsigned long size, unsigned long align,
 1586                             gfp_t gfp_mask, pgprot_t prot,
 1587                             int node, const void *caller);
 1588 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 1589                                  pgprot_t prot, int node, const void *caller)
 1590 {
 1591         const int order = 0;
 1592         struct page **pages;
 1593         unsigned int nr_pages, array_size, i;
 1594         gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 1595 
 1596         nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
 1597         array_size = (nr_pages * sizeof(struct page *));
 1598 
 1599         area->nr_pages = nr_pages;
 1600         /* Please note that the recursion is strictly bounded. */
 1601         if (array_size > PAGE_SIZE) {
 1602                 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
 1603                                 PAGE_KERNEL, node, caller);
 1604                 area->flags |= VM_VPAGES;
 1605         } else {
 1606                 pages = kmalloc_node(array_size, nested_gfp, node);
 1607         }
 1608         area->pages = pages;
 1609         area->caller = caller;
 1610         if (!area->pages) {
 1611                 remove_vm_area(area->addr);
 1612                 kfree(area);
 1613                 return NULL;
 1614         }
 1615 
 1616         for (i = 0; i < area->nr_pages; i++) {
 1617                 struct page *page;
 1618                 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
 1619 
 1620                 if (node < 0)
 1621                         page = alloc_page(tmp_mask);
 1622                 else
 1623                         page = alloc_pages_node(node, tmp_mask, order);
 1624 
 1625                 if (unlikely(!page)) {
 1626                         /* Successfully allocated i pages, free them in __vunmap() */
 1627                         area->nr_pages = i;
 1628                         goto fail;
 1629                 }
 1630                 area->pages[i] = page;
 1631         }
 1632 
 1633         if (map_vm_area(area, prot, &pages))
 1634                 goto fail;
 1635         return area->addr;
 1636 
 1637 fail:
 1638         warn_alloc_failed(gfp_mask, order,
 1639                           "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
 1640                           (area->nr_pages*PAGE_SIZE), area->size);
 1641         vfree(area->addr);
 1642         return NULL;
 1643 }
 1644 
 1645 /**
 1646  *      __vmalloc_node_range  -  allocate virtually contiguous memory
 1647  *      @size:          allocation size
 1648  *      @align:         desired alignment
 1649  *      @start:         vm area range start
 1650  *      @end:           vm area range end
 1651  *      @gfp_mask:      flags for the page level allocator
 1652  *      @prot:          protection mask for the allocated pages
 1653  *      @node:          node to use for allocation or -1
 1654  *      @caller:        caller's return address
 1655  *
 1656  *      Allocate enough pages to cover @size from the page level
 1657  *      allocator with @gfp_mask flags.  Map them into contiguous
 1658  *      kernel virtual space, using a pagetable protection of @prot.
 1659  */
 1660 void *__vmalloc_node_range(unsigned long size, unsigned long align,
 1661                         unsigned long start, unsigned long end, gfp_t gfp_mask,
 1662                         pgprot_t prot, int node, const void *caller)
 1663 {
 1664         struct vm_struct *area;
 1665         void *addr;
 1666         unsigned long real_size = size;
 1667 
 1668         size = PAGE_ALIGN(size);
 1669         if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 1670                 goto fail;
 1671 
 1672         area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
 1673                                   start, end, node, gfp_mask, caller);
 1674         if (!area)
 1675                 goto fail;
 1676 
 1677         addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
 1678         if (!addr)
 1679                 return NULL;
 1680 
 1681         /*
 1682          * In this function, the newly allocated vm_struct was not added
 1683          * to the vmlist in __get_vm_area_node(), so it is added here.
 1684          */
 1685         insert_vmalloc_vmlist(area);
 1686 
 1687         /*
 1688          * A ref_count = 3 is needed because the vm_struct and vmap_area
 1689          * structures allocated in the __get_vm_area_node() function contain
 1690          * references to the virtual address of the vmalloc'ed block.
 1691          */
 1692         kmemleak_alloc(addr, real_size, 3, gfp_mask);
 1693 
 1694         return addr;
 1695 
 1696 fail:
 1697         warn_alloc_failed(gfp_mask, 0,
 1698                           "vmalloc: allocation failure: %lu bytes\n",
 1699                           real_size);
 1700         return NULL;
 1701 }
 1702 
 1703 /**
 1704  *      __vmalloc_node  -  allocate virtually contiguous memory
 1705  *      @size:          allocation size
 1706  *      @align:         desired alignment
 1707  *      @gfp_mask:      flags for the page level allocator
 1708  *      @prot:          protection mask for the allocated pages
 1709  *      @node:          node to use for allocation or -1
 1710  *      @caller:        caller's return address
 1711  *
 1712  *      Allocate enough pages to cover @size from the page level
 1713  *      allocator with @gfp_mask flags.  Map them into contiguous
 1714  *      kernel virtual space, using a pagetable protection of @prot.
 1715  */
 1716 static void *__vmalloc_node(unsigned long size, unsigned long align,
 1717                             gfp_t gfp_mask, pgprot_t prot,
 1718                             int node, const void *caller)
 1719 {
 1720         return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
 1721                                 gfp_mask, prot, node, caller);
 1722 }
 1723 
 1724 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 1725 {
 1726         return __vmalloc_node(size, 1, gfp_mask, prot, -1,
 1727                                 __builtin_return_address(0));
 1728 }
 1729 EXPORT_SYMBOL(__vmalloc);
 1730 
 1731 static inline void *__vmalloc_node_flags(unsigned long size,
 1732                                         int node, gfp_t flags)
 1733 {
 1734         return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
 1735                                         node, __builtin_return_address(0));
 1736 }
 1737 
 1738 /**
 1739  *      vmalloc  -  allocate virtually contiguous memory
 1740  *      @size:          allocation size
 1741  *      Allocate enough pages to cover @size from the page level
 1742  *      allocator and map them into contiguous kernel virtual space.
 1743  *
 1744  *      For tight control over page level allocator and protection flags
 1745  *      use __vmalloc() instead.
 1746  */
 1747 void *vmalloc(unsigned long size)
 1748 {
 1749         return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
 1750 }
 1751 EXPORT_SYMBOL(vmalloc);
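
/*
 * A minimal usage sketch (hypothetical caller): vmalloc() suits large
 * allocations that only need to be virtually contiguous, such as a big
 * table that the page allocator could not satisfy as one physically
 * contiguous block.  vzalloc() below would fold in the zeroing.
 */
static int *example_table;

static int example_alloc_table(unsigned long nentries)
{
        example_table = vmalloc(nentries * sizeof(*example_table));
        if (!example_table)
                return -ENOMEM;
        memset(example_table, 0, nentries * sizeof(*example_table));
        return 0;
}

static void example_free_table(void)
{
        vfree(example_table);           /* vfree(NULL) is a no-op */
        example_table = NULL;
}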
 1752 
 1753 /**
 1754  *      vzalloc - allocate virtually contiguous memory with zero fill
 1755  *      @size:  allocation size
 1756  *      Allocate enough pages to cover @size from the page level
 1757  *      allocator and map them into contiguous kernel virtual space.
 1758  *      The memory allocated is set to zero.
 1759  *
 1760  *      For tight control over page level allocator and protection flags
 1761  *      use __vmalloc() instead.
 1762  */
 1763 void *vzalloc(unsigned long size)
 1764 {
 1765         return __vmalloc_node_flags(size, -1,
 1766                                 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
 1767 }
 1768 EXPORT_SYMBOL(vzalloc);
 1769 
 1770 /**
 1771  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 1772  * @size: allocation size
 1773  *
 1774  * The resulting memory area is zeroed so it can be mapped to userspace
 1775  * without leaking data.
 1776  */
 1777 void *vmalloc_user(unsigned long size)
 1778 {
 1779         struct vm_struct *area;
 1780         void *ret;
 1781 
 1782         ret = __vmalloc_node(size, SHMLBA,
 1783                              GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
 1784                              PAGE_KERNEL, -1, __builtin_return_address(0));
 1785         if (ret) {
 1786                 area = find_vm_area(ret);
 1787                 area->flags |= VM_USERMAP;
 1788         }
 1789         return ret;
 1790 }
 1791 EXPORT_SYMBOL(vmalloc_user);
 1792 
 1793 /**
 1794  *      vmalloc_node  -  allocate memory on a specific node
 1795  *      @size:          allocation size
 1796  *      @node:          numa node
 1797  *
 1798  *      Allocate enough pages to cover @size from the page level
 1799  *      allocator and map them into contiguous kernel virtual space.
 1800  *
 1801  *      For tight control over page level allocator and protection flags
 1802  *      use __vmalloc() instead.
 1803  */
 1804 void *vmalloc_node(unsigned long size, int node)
 1805 {
 1806         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
 1807                                         node, __builtin_return_address(0));
 1808 }
 1809 EXPORT_SYMBOL(vmalloc_node);
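
/*
 * A minimal NUMA sketch (hypothetical names): give each online node a
 * scratch buffer backed by pages from that node, and unwind with vfree()
 * if any node's allocation fails.
 */
static void *example_scratch[MAX_NUMNODES];

static int example_alloc_scratch(unsigned long size)
{
        int node;

        for_each_online_node(node) {
                example_scratch[node] = vmalloc_node(size, node);
                if (!example_scratch[node])
                        goto undo;
        }
        return 0;
undo:
        for_each_online_node(node) {
                vfree(example_scratch[node]);   /* vfree(NULL) is fine */
                example_scratch[node] = NULL;
        }
        return -ENOMEM;
}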
 1810 
 1811 /**
 1812  * vzalloc_node - allocate memory on a specific node with zero fill
 1813  * @size:       allocation size
 1814  * @node:       numa node
 1815  *
 1816  * Allocate enough pages to cover @size from the page level
 1817  * allocator and map them into contiguous kernel virtual space.
 1818  * The memory allocated is set to zero.
 1819  *
 1820  * For tight control over page level allocator and protection flags
 1821  * use __vmalloc_node() instead.
 1822  */
 1823 void *vzalloc_node(unsigned long size, int node)
 1824 {
 1825         return __vmalloc_node_flags(size, node,
 1826                          GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
 1827 }
 1828 EXPORT_SYMBOL(vzalloc_node);
 1829 
 1830 #ifndef PAGE_KERNEL_EXEC
 1831 # define PAGE_KERNEL_EXEC PAGE_KERNEL
 1832 #endif
 1833 
 1834 /**
 1835  *      vmalloc_exec  -  allocate virtually contiguous, executable memory
 1836  *      @size:          allocation size
 1837  *
 1838  *      Kernel-internal function to allocate enough pages to cover @size
 1839  *      from the page level allocator and map them into contiguous and
 1840  *      executable kernel virtual space.
 1841  *
 1842  *      For tight control over page level allocator and protection flags
 1843  *      use __vmalloc() instead.
 1844  */
 1845 
 1846 void *vmalloc_exec(unsigned long size)
 1847 {
 1848         return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
 1849                               -1, __builtin_return_address(0));
 1850 }
 1851 
 1852 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 1853 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
 1854 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
 1855 #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
 1856 #else
 1857 #define GFP_VMALLOC32 GFP_KERNEL
 1858 #endif
 1859 
 1860 /**
 1861  *      vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
 1862  *      @size:          allocation size
 1863  *
 1864  *      Allocate enough 32bit PA addressable pages to cover @size from the
 1865  *      page level allocator and map them into contiguous kernel virtual space.
 1866  */
 1867 void *vmalloc_32(unsigned long size)
 1868 {
 1869         return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
 1870                               -1, __builtin_return_address(0));
 1871 }
 1872 EXPORT_SYMBOL(vmalloc_32);
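
/*
 * A minimal sketch (hypothetical caller): vmalloc_32() is for buffers
 * whose backing pages must be 32-bit physically addressable, for example
 * data handed page by page to firmware that cannot address memory above
 * 4GB.  Only virtual contiguity is guaranteed.
 */
static void *example_low_buf;

static int example_alloc_low(unsigned long size)
{
        example_low_buf = vmalloc_32(size);
        return example_low_buf ? 0 : -ENOMEM;
}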
 1873 
 1874 /**
 1875  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 1876  *      @size:          allocation size
 1877  *
 1878  * The resulting memory area is 32bit addressable and zeroed so it can be
 1879  * mapped to userspace without leaking data.
 1880  */
 1881 void *vmalloc_32_user(unsigned long size)
 1882 {
 1883         struct vm_struct *area;
 1884         void *ret;
 1885 
 1886         ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
 1887                              -1, __builtin_return_address(0));
 1888         if (ret) {
 1889                 area = find_vm_area(ret);
 1890                 area->flags |= VM_USERMAP;
 1891         }
 1892         return ret;
 1893 }
 1894 EXPORT_SYMBOL(vmalloc_32_user);
 1895 
 1896 /*
 1897  * Small helper routine: copy contents from addr into buf.
 1898  * If a page is not present, fill the gap with zeroes.
 1899  */
 1900 
 1901 static int aligned_vread(char *buf, char *addr, unsigned long count)
 1902 {
 1903         struct page *p;
 1904         int copied = 0;
 1905 
 1906         while (count) {
 1907                 unsigned long offset, length;
 1908 
 1909                 offset = (unsigned long)addr & ~PAGE_MASK;
 1910                 length = PAGE_SIZE - offset;
 1911                 if (length > count)
 1912                         length = count;
 1913                 p = vmalloc_to_page(addr);
 1914                 /*
 1915                  * Safe access to this _mapped_ area would require a
 1916                  * lock, but taking one here would add overhead to the
 1917                  * vmalloc()/vfree() calls just for this rarely used
 1918                  * _debug_ interface.  Instead we use kmap() and accept
 1919                  * a small overhead in this access function.
 1920                  */
 1921                 if (p) {
 1922                         /*
 1923                          * we can expect USER0 is not used (see vread/vwrite's
 1924                          * function description)
 1925                          */
 1926                         void *map = kmap_atomic(p);
 1927                         memcpy(buf, map + offset, length);
 1928                         kunmap_atomic(map);
 1929                 } else
 1930                         memset(buf, 0, length);
 1931 
 1932                 addr += length;
 1933                 buf += length;
 1934                 copied += length;
 1935                 count -= length;
 1936         }
 1937         return copied;
 1938 }
 1939 
 1940 static int aligned_vwrite(char *buf, char *addr, unsigned long count)
 1941 {
 1942         struct page *p;
 1943         int copied = 0;
 1944 
 1945         while (count) {
 1946                 unsigned long offset, length;
 1947 
 1948                 offset = (unsigned long)addr & ~PAGE_MASK;
 1949                 length = PAGE_SIZE - offset;
 1950                 if (length > count)
 1951                         length = count;
 1952                 p = vmalloc_to_page(addr);
 1953                 /*
 1954                  * Safe access to this _mapped_ area would require a
 1955                  * lock, but taking one here would add overhead to the
 1956                  * vmalloc()/vfree() calls just for this rarely used
 1957                  * _debug_ interface.  Instead we use kmap() and accept
 1958                  * a small overhead in this access function.
 1959                  */
 1960                 if (p) {
 1961                         /*
 1962                          * we can expect USER0 is not used (see vread/vwrite's
 1963                          * function description)
 1964                          */
 1965                         void *map = kmap_atomic(p);
 1966                         memcpy(map + offset, buf, length);
 1967                         kunmap_atomic(map);
 1968                 }
 1969                 addr += length;
 1970                 buf += length;
 1971                 copied += length;
 1972                 count -= length;
 1973         }
 1974         return copied;
 1975 }
 1976 
 1977 /**
 1978  *      vread() -  read vmalloc area in a safe way.
 1979  *      @buf:           buffer for reading data
 1980  *      @addr:          vm address.
 1981  *      @count:         number of bytes to be read.
 1982  *
 1983  *      Returns the number of bytes by which @addr and @buf should be
 1984  *      advanced (the same number as @count).  Returns 0 if
 1985  *      [addr...addr+count) does not intersect any live vmalloc area.
 1986  *
 1987  *      This function checks that @addr lies in a valid vmalloc'ed area and
 1988  *      copies data from that area into the given buffer.  If the range
 1989  *      [addr...addr+count) includes some valid addresses, data is copied
 1990  *      into the corresponding part of @buf.  Memory holes are zero-filled;
 1991  *      an IOREMAP area is treated as a memory hole and nothing is copied
 1992  *      from it.
 1993  *
 1994  *      If [addr...addr+count) does not intersect any live vm_struct area,
 1995  *      0 is returned.  @buf must be a kernel buffer.
 1996  *
 1997  *      Note: vread() is normally unnecessary because the caller should
 1998  *      know that the vmalloc() area is valid and can simply use memcpy().
 1999  *      It exists for routines such as /dev/kmem that lack such knowledge.
 2000  *
 2001  */
 2002 
 2003 long vread(char *buf, char *addr, unsigned long count)
 2004 {
 2005         struct vm_struct *tmp;
 2006         char *vaddr, *buf_start = buf;
 2007         unsigned long buflen = count;
 2008         unsigned long n;
 2009 
 2010         /* Don't allow overflow */
 2011         if ((unsigned long) addr + count < count)
 2012                 count = -(unsigned long) addr;
 2013 
 2014         read_lock(&vmlist_lock);
 2015         for (tmp = vmlist; count && tmp; tmp = tmp->next) {
 2016                 vaddr = (char *) tmp->addr;
 2017                 if (addr >= vaddr + tmp->size - PAGE_SIZE)
 2018                         continue;
 2019                 while (addr < vaddr) {
 2020                         if (count == 0)
 2021                                 goto finished;
 2022                         *buf = '\0';
 2023                         buf++;
 2024                         addr++;
 2025                         count--;
 2026                 }
 2027                 n = vaddr + tmp->size - PAGE_SIZE - addr;
 2028                 if (n > count)
 2029                         n = count;
 2030                 if (!(tmp->flags & VM_IOREMAP))
 2031                         aligned_vread(buf, addr, n);
 2032                 else /* IOREMAP area is treated as memory hole */
 2033                         memset(buf, 0, n);
 2034                 buf += n;
 2035                 addr += n;
 2036                 count -= n;
 2037         }
 2038 finished:
 2039         read_unlock(&vmlist_lock);
 2040 
 2041         if (buf == buf_start)
 2042                 return 0;
 2043         /* zero-fill memory holes */
 2044         if (buf != buf_start + buflen)
 2045                 memset(buf, 0, buflen - (buf - buf_start));
 2046 
 2047         return buflen;
 2048 }
 2049 
 2050 /**
 2051  *      vwrite() -  write vmalloc area in a safe way.
 2052  *      @buf:           buffer for source data
 2053  *      @addr:          vm address.
 2054  *      @count:         number of bytes to be written.
 2055  *
 2056  *      Returns the number of bytes by which @addr and @buf should be
 2057  *      advanced (the same number as @count).
 2058  *      Returns 0 if [addr...addr+count) does not intersect any valid
 2059  *      vmalloc area.
 2060  *
 2061  *      This function checks that @addr lies in a valid vmalloc'ed area and
 2062  *      copies data from the buffer into that area.  If the range
 2063  *      [addr...addr+count) includes some valid addresses, data is copied
 2064  *      from the corresponding part of @buf.  Nothing is copied into memory
 2065  *      holes, and an IOREMAP area is treated as a memory hole as well.
 2066  *
 2067  *      If [addr...addr+count) does not intersect any live vm_struct area,
 2068  *      0 is returned.  @buf must be a kernel buffer.
 2069  *
 2070  *      Note: vwrite() is normally unnecessary because the caller should
 2071  *      know that the vmalloc() area is valid and can simply use memcpy().
 2072  *      It exists for routines, such as /dev/kmem, that must write to the
 2073  *      vmalloc area without any such knowledge.
 2074  */
 2075 
 2076 long vwrite(char *buf, char *addr, unsigned long count)
 2077 {
 2078         struct vm_struct *tmp;
 2079         char *vaddr;
 2080         unsigned long n, buflen;
 2081         int copied = 0;
 2082 
 2083         /* Don't allow overflow */
 2084         if ((unsigned long) addr + count < count)
 2085                 count = -(unsigned long) addr;
 2086         buflen = count;
 2087 
 2088         read_lock(&vmlist_lock);
 2089         for (tmp = vmlist; count && tmp; tmp = tmp->next) {
 2090                 vaddr = (char *) tmp->addr;
 2091                 if (addr >= vaddr + tmp->size - PAGE_SIZE)
 2092                         continue;
 2093                 while (addr < vaddr) {
 2094                         if (count == 0)
 2095                                 goto finished;
 2096                         buf++;
 2097                         addr++;
 2098                         count--;
 2099                 }
 2100                 n = vaddr + tmp->size - PAGE_SIZE - addr;
 2101                 if (n > count)
 2102                         n = count;
 2103                 if (!(tmp->flags & VM_IOREMAP)) {
 2104                         aligned_vwrite(buf, addr, n);
 2105                         copied++;
 2106                 }
 2107                 buf += n;
 2108                 addr += n;
 2109                 count -= n;
 2110         }
 2111 finished:
 2112         read_unlock(&vmlist_lock);
 2113         if (!copied)
 2114                 return 0;
 2115         return buflen;
 2116 }
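
/*
 * A minimal sketch of a /dev/kmem-style reader built on vread()
 * (hypothetical helper; the bounce buffer and names are illustrative):
 * copy from a vmalloc address into a user buffer.  Holes read back as
 * zeroes, and a return of 0 means no live vmalloc area was touched.
 */
static ssize_t example_read_kvirt(char __user *ubuf, char *kaddr,
                                  unsigned long len)
{
        char *tmp;
        long got;
        ssize_t ret;

        tmp = kmalloc(len, GFP_KERNEL);
        if (!tmp)
                return -ENOMEM;

        got = vread(tmp, kaddr, len);
        if (got && copy_to_user(ubuf, tmp, got))
                ret = -EFAULT;
        else
                ret = got;

        kfree(tmp);
        return ret;
}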
 2117 
 2118 /**
 2119  *      remap_vmalloc_range  -  map vmalloc pages to userspace
 2120  *      @vma:           vma to cover (map full range of vma)
 2121  *      @addr:          vmalloc memory
 2122  *      @pgoff:         number of pages into addr before first page to map
 2123  *
 2124  *      Returns:        0 for success, -Exxx on failure
 2125  *
 2126  *      This function checks that @addr points to a valid vmalloc'ed area
 2127  *      and that the area is big enough to cover the vma.  Failure is
 2128  *      returned if either criterion is not met.
 2129  *
 2130  *      Similar to remap_pfn_range() (see mm/memory.c)
 2131  */
 2132 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 2133                                                 unsigned long pgoff)
 2134 {
 2135         struct vm_struct *area;
 2136         unsigned long uaddr = vma->vm_start;
 2137         unsigned long usize = vma->vm_end - vma->vm_start;
 2138 
 2139         if ((PAGE_SIZE-1) & (unsigned long)addr)
 2140                 return -EINVAL;
 2141 
 2142         area = find_vm_area(addr);
 2143         if (!area)
 2144                 return -EINVAL;
 2145 
 2146         if (!(area->flags & VM_USERMAP))
 2147                 return -EINVAL;
 2148 
 2149         if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
 2150                 return -EINVAL;
 2151 
 2152         addr += pgoff << PAGE_SHIFT;
 2153         do {
 2154                 struct page *page = vmalloc_to_page(addr);
 2155                 int ret;
 2156 
 2157                 ret = vm_insert_page(vma, uaddr, page);
 2158                 if (ret)
 2159                         return ret;
 2160 
 2161                 uaddr += PAGE_SIZE;
 2162                 addr += PAGE_SIZE;
 2163                 usize -= PAGE_SIZE;
 2164         } while (usize > 0);
 2165 
 2166         vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 2167 
 2168         return 0;
 2169 }
 2170 EXPORT_SYMBOL(remap_vmalloc_range);
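
/*
 * A minimal sketch of the vmalloc_user() + remap_vmalloc_range() pairing
 * (hypothetical driver; the file_operations wiring and <linux/fs.h> are
 * assumed): the buffer is allocated with vmalloc_user(), which zeroes it
 * and sets VM_USERMAP, and the mmap handler then maps it into the caller.
 */
static void *example_shared_buf;

static int example_setup(unsigned long size)
{
        example_shared_buf = vmalloc_user(size);
        return example_shared_buf ? 0 : -ENOMEM;
}

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* vma->vm_pgoff selects how many pages into the buffer to start. */
        return remap_vmalloc_range(vma, example_shared_buf, vma->vm_pgoff);
}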
 2171 
 2172 /*
 2173  * Implement a stub for vmalloc_sync_all() if the architecture chose not to
 2174  * have one.
 2175  */
 2176 void  __attribute__((weak)) vmalloc_sync_all(void)
 2177 {
 2178 }
 2179 
 2180 
 2181 static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
 2182 {
 2183         pte_t ***p = data;
 2184 
 2185         if (p) {
 2186                 *(*p) = pte;
 2187                 (*p)++;
 2188         }
 2189         return 0;
 2190 }
 2191 
 2192 /**
 2193  *      alloc_vm_area - allocate a range of kernel address space
 2194  *      @size:          size of the area
 2195  *      @ptes:          returns the PTEs for the address space
 2196  *
 2197  *      Returns:        NULL on failure, vm_struct on success
 2198  *
 2199  *      This function reserves a range of kernel address space, and
 2200  *      allocates pagetables to map that range.  No actual mappings
 2201  *      are created.
 2202  *
 2203  *      If @ptes is non-NULL, pointers to the PTEs (in init_mm)
 2204  *      allocated for the VM area are returned.
 2205  */
 2206 struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 2207 {
 2208         struct vm_struct *area;
 2209 
 2210         area = get_vm_area_caller(size, VM_IOREMAP,
 2211                                 __builtin_return_address(0));
 2212         if (area == NULL)
 2213                 return NULL;
 2214 
 2215         /*
 2216          * This ensures that page tables are constructed for this region
 2217          * of kernel virtual address space and mapped into init_mm.
 2218          */
 2219         if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
 2220                                 size, f, ptes ? &ptes : NULL)) {
 2221                 free_vm_area(area);
 2222                 return NULL;
 2223         }
 2224 
 2225         return area;
 2226 }
 2227 EXPORT_SYMBOL_GPL(alloc_vm_area);
 2228 
 2229 void free_vm_area(struct vm_struct *area)
 2230 {
 2231         struct vm_struct *ret;
 2232         ret = remove_vm_area(area->addr);
 2233         BUG_ON(ret != area);
 2234         kfree(area);
 2235 }
 2236 EXPORT_SYMBOL_GPL(free_vm_area);
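
/*
 * A minimal sketch of alloc_vm_area()/free_vm_area() (hypothetical names,
 * in the spirit of a paravirtualized backend): reserve one page of kernel
 * address space together with its page table, keep a pointer to the PTE so
 * a mapping can be installed into it later, then release everything.
 */
static struct vm_struct *example_slot;
static pte_t *example_slot_pte;

static int example_reserve_slot(void)
{
        example_slot = alloc_vm_area(PAGE_SIZE, &example_slot_pte);
        if (!example_slot)
                return -ENOMEM;
        /* example_slot->addr is the reserved VA; example_slot_pte points
         * at the (still empty) PTE in init_mm that backs it. */
        return 0;
}

static void example_release_slot(void)
{
        free_vm_area(example_slot);
        example_slot = NULL;
}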
 2237 
 2238 #ifdef CONFIG_SMP
 2239 static struct vmap_area *node_to_va(struct rb_node *n)
 2240 {
 2241         return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
 2242 }
 2243 
 2244 /**
 2245  * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
 2246  * @end: target address
 2247  * @pnext: out arg for the next vmap_area
 2248  * @pprev: out arg for the previous vmap_area
 2249  *
 2250  * Returns: %true if either or both of next and prev are found,
 2251  *          %false if no vmap_area exists
 2252  *
 2253  * Find the vmap_areas whose end addresses enclose @end, i.e. if not
 2254  * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
 2255  */
 2256 static bool pvm_find_next_prev(unsigned long end,
 2257                                struct vmap_area **pnext,
 2258                                struct vmap_area **pprev)
 2259 {
 2260         struct rb_node *n = vmap_area_root.rb_node;
 2261         struct vmap_area *va = NULL;
 2262 
 2263         while (n) {
 2264                 va = rb_entry(n, struct vmap_area, rb_node);
 2265                 if (end < va->va_end)
 2266                         n = n->rb_left;
 2267                 else if (end > va->va_end)
 2268                         n = n->rb_right;
 2269                 else
 2270                         break;
 2271         }
 2272 
 2273         if (!va)
 2274                 return false;
 2275 
 2276         if (va->va_end > end) {
 2277                 *pnext = va;
 2278                 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
 2279         } else {
 2280                 *pprev = va;
 2281                 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
 2282         }
 2283         return true;
 2284 }
 2285 
 2286 /**
 2287  * pvm_determine_end - find the highest aligned address between two vmap_areas
 2288  * @pnext: in/out arg for the next vmap_area
 2289  * @pprev: in/out arg for the previous vmap_area
 2290  * @align: alignment
 2291  *
 2292  * Returns: determined end address
 2293  *
 2294  * Find the highest aligned address between *@pnext and *@pprev below
 2295  * VMALLOC_END.  *@pnext and *@pprev are adjusted so that the aligned
 2296  * down address is between the end addresses of the two vmap_areas.
 2297  *
 2298  * Please note that the address returned by this function may fall
 2299  * inside *@pnext vmap_area.  The caller is responsible for checking
 2300  * that.
 2301  */
 2302 static unsigned long pvm_determine_end(struct vmap_area **pnext,
 2303                                        struct vmap_area **pprev,
 2304                                        unsigned long align)
 2305 {
 2306         const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
 2307         unsigned long addr;
 2308 
 2309         if (*pnext)
 2310                 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
 2311         else
 2312                 addr = vmalloc_end;
 2313 
 2314         while (*pprev && (*pprev)->va_end > addr) {
 2315                 *pnext = *pprev;
 2316                 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
 2317         }
 2318 
 2319         return addr;
 2320 }
 2321 
 2322 /**
 2323  * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 2324  * @offsets: array containing offset of each area
 2325  * @sizes: array containing size of each area
 2326  * @nr_vms: the number of areas to allocate
 2327  * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 2328  *
 2329  * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 2330  *          vm_structs on success, %NULL on failure
 2331  *
 2332  * Percpu allocator wants to use congruent vm areas so that it can
 2333  * maintain the offsets among percpu areas.  This function allocates
 2334  * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 2335  * be scattered widely; the distance between two areas can easily reach
 2336  * gigabytes.  To avoid interfering with regular vmallocs, these areas
 2337  * are allocated from the top of the vmalloc space.
 2338  *
 2339  * Despite its complicated look, this allocator is rather simple.  It
 2340  * does everything top-down and scans areas from the end looking for
 2341  * matching slot.  While scanning, if any of the areas overlaps an
 2342  * existing vmap_area, the base address is pulled down to fit the
 2343  * area.  Scanning is repeated until all the areas fit, then all
 2344  * necessary data structures are inserted and the result is returned.
 2345  */
 2346 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 2347                                      const size_t *sizes, int nr_vms,
 2348                                      size_t align)
 2349 {
 2350         const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
 2351         const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
 2352         struct vmap_area **vas, *prev, *next;
 2353         struct vm_struct **vms;
 2354         int area, area2, last_area, term_area;
 2355         unsigned long base, start, end, last_end;
 2356         bool purged = false;
 2357 
 2358         /* verify parameters and allocate data structures */
 2359         BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
 2360         for (last_area = 0, area = 0; area < nr_vms; area++) {
 2361                 start = offsets[area];
 2362                 end = start + sizes[area];
 2363 
 2364                 /* is everything aligned properly? */
 2365                 BUG_ON(!IS_ALIGNED(offsets[area], align));
 2366                 BUG_ON(!IS_ALIGNED(sizes[area], align));
 2367 
 2368                 /* detect the area with the highest address */
 2369                 if (start > offsets[last_area])
 2370                         last_area = area;
 2371 
 2372                 for (area2 = 0; area2 < nr_vms; area2++) {
 2373                         unsigned long start2 = offsets[area2];
 2374                         unsigned long end2 = start2 + sizes[area2];
 2375 
 2376                         if (area2 == area)
 2377                                 continue;
 2378 
 2379                         BUG_ON(start2 >= start && start2 < end);
 2380                         BUG_ON(end2 <= end && end2 > start);
 2381                 }
 2382         }
 2383         last_end = offsets[last_area] + sizes[last_area];
 2384 
 2385         if (vmalloc_end - vmalloc_start < last_end) {
 2386                 WARN_ON(true);
 2387                 return NULL;
 2388         }
 2389 
 2390         vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
 2391         vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
 2392         if (!vas || !vms)
 2393                 goto err_free2;
 2394 
 2395         for (area = 0; area < nr_vms; area++) {
 2396                 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
 2397                 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
 2398                 if (!vas[area] || !vms[area])
 2399                         goto err_free;
 2400         }
 2401 retry:
 2402         spin_lock(&vmap_area_lock);
 2403 
 2404         /* start scanning - we scan from the top, begin with the last area */
 2405         area = term_area = last_area;
 2406         start = offsets[area];
 2407         end = start + sizes[area];
 2408 
 2409         if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
 2410                 base = vmalloc_end - last_end;
 2411                 goto found;
 2412         }
 2413         base = pvm_determine_end(&next, &prev, align) - end;
 2414 
 2415         while (true) {
 2416                 BUG_ON(next && next->va_end <= base + end);
 2417                 BUG_ON(prev && prev->va_end > base + end);
 2418 
 2419                 /*
 2420                  * base might have underflowed, add last_end before
 2421                  * comparing.
 2422                  */
 2423                 if (base + last_end < vmalloc_start + last_end) {
 2424                         spin_unlock(&vmap_area_lock);
 2425                         if (!purged) {
 2426                                 purge_vmap_area_lazy();
 2427                                 purged = true;
 2428                                 goto retry;
 2429                         }
 2430                         goto err_free;
 2431                 }
 2432 
 2433                 /*
 2434                  * If next overlaps, move base downwards so that it's
 2435                  * right below next and then recheck.
 2436                  */
 2437                 if (next && next->va_start < base + end) {
 2438                         base = pvm_determine_end(&next, &prev, align) - end;
 2439                         term_area = area;
 2440                         continue;
 2441                 }
 2442 
 2443                 /*
 2444                  * If prev overlaps, shift down next and prev and move
 2445                  * base so that it's right below new next and then
 2446                  * recheck.
 2447                  */
 2448                 if (prev && prev->va_end > base + start)  {
 2449                         next = prev;
 2450                         prev = node_to_va(rb_prev(&next->rb_node));
 2451                         base = pvm_determine_end(&next, &prev, align) - end;
 2452                         term_area = area;
 2453                         continue;
 2454                 }
 2455 
 2456                 /*
 2457                  * This area fits, move on to the previous one.  If
 2458                  * the previous one is the terminal one, we're done.
 2459                  */
 2460                 area = (area + nr_vms - 1) % nr_vms;
 2461                 if (area == term_area)
 2462                         break;
 2463                 start = offsets[area];
 2464                 end = start + sizes[area];
 2465                 pvm_find_next_prev(base + end, &next, &prev);
 2466         }
 2467 found:
 2468         /* we've found a fitting base, insert all va's */
 2469         for (area = 0; area < nr_vms; area++) {
 2470                 struct vmap_area *va = vas[area];
 2471 
 2472                 va->va_start = base + offsets[area];
 2473                 va->va_end = va->va_start + sizes[area];
 2474                 __insert_vmap_area(va);
 2475         }
 2476 
 2477         vmap_area_pcpu_hole = base + offsets[last_area];
 2478 
 2479         spin_unlock(&vmap_area_lock);
 2480 
 2481         /* insert all vm's */
 2482         for (area = 0; area < nr_vms; area++)
 2483                 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
 2484                                   pcpu_get_vm_areas);
 2485 
 2486         kfree(vas);
 2487         return vms;
 2488 
 2489 err_free:
 2490         for (area = 0; area < nr_vms; area++) {
 2491                 kfree(vas[area]);
 2492                 kfree(vms[area]);
 2493         }
 2494 err_free2:
 2495         kfree(vas);
 2496         kfree(vms);
 2497         return NULL;
 2498 }
 2499 
 2500 /**
 2501  * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 2502  * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 2503  * @nr_vms: the number of allocated areas
 2504  *
 2505  * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 2506  */
 2507 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
 2508 {
 2509         int i;
 2510 
 2511         for (i = 0; i < nr_vms; i++)
 2512                 free_vm_area(vms[i]);
 2513         kfree(vms);
 2514 }
 2515 #endif  /* CONFIG_SMP */
 2516 
 2517 #ifdef CONFIG_PROC_FS
 2518 static void *s_start(struct seq_file *m, loff_t *pos)
 2519         __acquires(&vmlist_lock)
 2520 {
 2521         loff_t n = *pos;
 2522         struct vm_struct *v;
 2523 
 2524         read_lock(&vmlist_lock);
 2525         v = vmlist;
 2526         while (n > 0 && v) {
 2527                 n--;
 2528                 v = v->next;
 2529         }
 2530         if (!n)
 2531                 return v;
 2532 
 2533         return NULL;
 2534 
 2535 }
 2536 
 2537 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 2538 {
 2539         struct vm_struct *v = p;
 2540 
 2541         ++*pos;
 2542         return v->next;
 2543 }
 2544 
 2545 static void s_stop(struct seq_file *m, void *p)
 2546         __releases(&vmlist_lock)
 2547 {
 2548         read_unlock(&vmlist_lock);
 2549 }
 2550 
 2551 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 2552 {
 2553         if (IS_ENABLED(CONFIG_NUMA)) {
 2554                 unsigned int nr, *counters = m->private;
 2555 
 2556                 if (!counters)
 2557                         return;
 2558 
 2559                 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 2560 
 2561                 for (nr = 0; nr < v->nr_pages; nr++)
 2562                         counters[page_to_nid(v->pages[nr])]++;
 2563 
 2564                 for_each_node_state(nr, N_HIGH_MEMORY)
 2565                         if (counters[nr])
 2566                                 seq_printf(m, " N%u=%u", nr, counters[nr]);
 2567         }
 2568 }
 2569 
 2570 static int s_show(struct seq_file *m, void *p)
 2571 {
 2572         struct vm_struct *v = p;
 2573 
 2574         seq_printf(m, "0x%pK-0x%pK %7ld",
 2575                 v->addr, v->addr + v->size, v->size);
 2576 
 2577         if (v->caller)
 2578                 seq_printf(m, " %pS", v->caller);
 2579 
 2580         if (v->nr_pages)
 2581                 seq_printf(m, " pages=%d", v->nr_pages);
 2582 
 2583         if (v->phys_addr)
 2584                 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
 2585 
 2586         if (v->flags & VM_IOREMAP)
 2587                 seq_printf(m, " ioremap");
 2588 
 2589         if (v->flags & VM_ALLOC)
 2590                 seq_printf(m, " vmalloc");
 2591 
 2592         if (v->flags & VM_MAP)
 2593                 seq_printf(m, " vmap");
 2594 
 2595         if (v->flags & VM_USERMAP)
 2596                 seq_printf(m, " user");
 2597 
 2598         if (v->flags & VM_VPAGES)
 2599                 seq_printf(m, " vpages");
 2600 
 2601         show_numa_info(m, v);
 2602         seq_putc(m, '\n');
 2603         return 0;
 2604 }
 2605 
 2606 static const struct seq_operations vmalloc_op = {
 2607         .start = s_start,
 2608         .next = s_next,
 2609         .stop = s_stop,
 2610         .show = s_show,
 2611 };
 2612 
 2613 static int vmalloc_open(struct inode *inode, struct file *file)
 2614 {
 2615         unsigned int *ptr = NULL;
 2616         int ret;
 2617 
 2618         if (IS_ENABLED(CONFIG_NUMA)) {
 2619                 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
 2620                 if (ptr == NULL)
 2621                         return -ENOMEM;
 2622         }
 2623         ret = seq_open(file, &vmalloc_op);
 2624         if (!ret) {
 2625                 struct seq_file *m = file->private_data;
 2626                 m->private = ptr;
 2627         } else
 2628                 kfree(ptr);
 2629         return ret;
 2630 }
 2631 
 2632 static const struct file_operations proc_vmalloc_operations = {
 2633         .open           = vmalloc_open,
 2634         .read           = seq_read,
 2635         .llseek         = seq_lseek,
 2636         .release        = seq_release_private,
 2637 };
 2638 
 2639 static int __init proc_vmalloc_init(void)
 2640 {
 2641         proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
 2642         return 0;
 2643 }
 2644 module_init(proc_vmalloc_init);
 2645 #endif
 2646 
