FreeBSD/Linux Kernel Cross Reference
sys/mm/page_cgroup.c


#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
#include <linux/bit_spinlock.h>
#include <linux/page_cgroup.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
#include <linux/kmemleak.h>

static unsigned long total_usage;

#if !defined(CONFIG_SPARSEMEM)

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        pgdat->node_page_cgroup = NULL;
}

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        unsigned long offset;
        struct page_cgroup *base;

        base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (unlikely(!base))
                return NULL;
#endif
        offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
        return base + offset;
}
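
A minimal usage sketch (editor's illustration, not part of the upstream
file): map a struct page to its accounting metadata and test whether it
carries a memcg charge, assuming the PageCgroupUsed() helper that
<linux/page_cgroup.h> provides in kernels of this vintage.

static bool page_is_charged(struct page *page)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);

        /* NULL is possible before the arrays are allocated (see above). */
        if (!pc)
                return false;
        return PageCgroupUsed(pc);
}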

static int __init alloc_node_page_cgroup(int nid)
{
        struct page_cgroup *base;
        unsigned long table_size;
        unsigned long nr_pages;

        nr_pages = NODE_DATA(nid)->node_spanned_pages;
        if (!nr_pages)
                return 0;

        table_size = sizeof(struct page_cgroup) * nr_pages;

        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
                        table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!base)
                return -ENOMEM;
        NODE_DATA(nid)->node_page_cgroup = base;
        total_usage += table_size;
        return 0;
}
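
For scale, a hedged back-of-the-envelope (assuming the ~16-byte
struct page_cgroup of this era on 64-bit): a node spanning 1 GiB of
4 KiB pages has nr_pages = 262144, so table_size = 16 * 262144 = 4 MiB,
roughly 0.4% of the node's memory.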

void __init page_cgroup_init_flatmem(void)
{
        int nid, fail;

        if (mem_cgroup_disabled())
                return;

        for_each_online_node(nid) {
                fail = alloc_node_page_cgroup(nid);
                if (fail)
                        goto fail;
        }
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        return;
fail:
        printk(KERN_CRIT "allocation of page_cgroup failed.\n");
        printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

#else /* CONFIG_SPARSEMEM */

struct page_cgroup *lookup_page_cgroup(struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *section = __pfn_to_section(pfn);
#ifdef CONFIG_DEBUG_VM
        /*
         * The sanity checks the page allocator does upon freeing a
         * page can reach here before the page_cgroup arrays are
         * allocated when feeding a range of pages to the allocator
         * for the first time during bootup or memory hotplug.
         */
        if (!section->page_cgroup)
                return NULL;
#endif
        return section->page_cgroup + pfn;
}

static void *__meminit alloc_page_cgroup(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *addr = NULL;

        addr = alloc_pages_exact_nid(nid, size, flags);
        if (addr) {
                kmemleak_alloc(addr, size, 1, flags);
                return addr;
        }

        /* Fall back to vmalloc, node-local when the node has memory. */
        if (node_state(nid, N_HIGH_MEMORY))
                addr = vzalloc_node(size, nid);
        else
                addr = vzalloc(size);

        return addr;
}

static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
{
        struct mem_section *section;
        struct page_cgroup *base;
        unsigned long table_size;

        section = __pfn_to_section(pfn);

        if (section->page_cgroup)
                return 0;

        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
        base = alloc_page_cgroup(table_size, nid);

        /*
         * The value stored in section->page_cgroup is (base - pfn)
         * and it does not point to the memory block allocated above,
         * causing kmemleak false positives.
         */
        kmemleak_not_leak(base);

        if (!base) {
                printk(KERN_ERR "page cgroup allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The passed "pfn" may not be aligned to SECTION.  For the calculation
         * we need to apply a mask.
         */
        pfn &= PAGE_SECTION_MASK;
        section->page_cgroup = base - pfn;
        total_usage += table_size;
        return 0;
}
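
A worked illustration of the (base - pfn) encoding above, with
hypothetical numbers: if PAGES_PER_SECTION is 0x8000 and a section
covers pfns 0x148000..0x14ffff, the section stores
section->page_cgroup = base - 0x148000, so lookup_page_cgroup() for
pfn 0x14b123 computes section->page_cgroup + 0x14b123 = base + 0x3123,
indexing the table by the pfn's offset within its section.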

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_page_cgroup(void *addr)
{
        if (is_vmalloc_addr(addr)) {
                vfree(addr);
        } else {
                struct page *page = virt_to_page(addr);
                size_t table_size =
                        sizeof(struct page_cgroup) * PAGES_PER_SECTION;

                BUG_ON(PageReserved(page));
                free_pages_exact(addr, table_size);
        }
}

void __free_page_cgroup(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_cgroup *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_cgroup)
                return;
        base = ms->page_cgroup + pfn;
        free_page_cgroup(base);
        ms->page_cgroup = NULL;
}

int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
{
        unsigned long start, end, pfn;
        int fail = 0;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        if (nid == -1) {
                /*
                 * In this case, "nid" already exists and contains valid memory.
                 * "start_pfn" passed to us is a pfn which is an arg for
                 * online_pages(), and start_pfn should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_state(nid, N_ONLINE));
        }

        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
                if (!pfn_present(pfn))
                        continue;
                fail = init_section_page_cgroup(pfn, nid);
        }
        if (!fail)
                return 0;

        /* rollback */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);

        return -ENOMEM;
}
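
Note how SECTION_ALIGN_DOWN/SECTION_ALIGN_UP round the hot-added range
out to whole sections. With hypothetical numbers (PAGES_PER_SECTION =
0x8000), start_pfn = 0x14c000 and nr_pages = 0x2000 give start =
0x148000 and end = 0x150000, so the loop initializes exactly one
section even though the range touches only part of it.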

int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_cgroup(pfn);
        return 0;
}

static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;
        int ret = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_ONLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
        case MEM_GOING_OFFLINE:
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }

        return notifier_from_errno(ret);
}

#endif /* CONFIG_MEMORY_HOTPLUG */

void __init page_cgroup_init(void)
{
        unsigned long pfn;
        int nid;

        if (mem_cgroup_disabled())
                return;

        for_each_node_state(nid, N_MEMORY) {
                unsigned long start_pfn, end_pfn;

                start_pfn = node_start_pfn(nid);
                end_pfn = node_end_pfn(nid);
                /*
                 * start_pfn and end_pfn may not be aligned to SECTION and the
                 * page->flags of out of node pages are not initialized.  So we
                 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
                 */
                for (pfn = start_pfn;
                     pfn < end_pfn;
                     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {

                        if (!pfn_valid(pfn))
                                continue;
                        /*
                         * Nodes' pfn ranges can overlap.
                         * We know some architectures can have a node layout
                         * such as
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
                         */
                        if (pfn_to_nid(pfn) != nid)
                                continue;
                        if (init_section_page_cgroup(pfn, nid))
                                goto oom;
                }
        }
        hotplug_memory_notifier(page_cgroup_callback, 0);
        printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
        printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
                         "don't want memory cgroups\n");
        return;
oom:
        printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
        panic("Out of memory");
}

void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
{
        return;
}

#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_MEMCG_SWAP

static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
        struct page **map;
        unsigned long length;
        spinlock_t      lock;
};

static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];

struct swap_cgroup {
        unsigned short          id;
};
#define SC_PER_PAGE     (PAGE_SIZE/sizeof(struct swap_cgroup))
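
With 4 KiB pages and the two-byte struct swap_cgroup above, SC_PER_PAGE
works out to 4096 / 2 = 2048, i.e. one map page tracks the owning memcg
for 2048 swap slots.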

/*
 * SwapCgroup implements "lookup" and "exchange" operations.
 * In typical usage, swap_cgroup is accessed via memcg's charge/uncharge
 * operations against SwapCache. At swap_free(), it is accessed directly
 * from swap.
 *
 * This means:
 *  - there is no race on "exchange" when we are accessed via SwapCache,
 *    because the SwapCache (and its swp_entry) is under lock.
 *  - when called via swap_free(), there is no user of the entry and hence
 *    no race.
 * So no lock is needed around "exchange".
 *
 * TODO: we can push these buffers out to HIGHMEM.
 */

/*
 * allocate buffer for swap_cgroup.
 */
static int swap_cgroup_prepare(int type)
{
        struct page *page;
        struct swap_cgroup_ctrl *ctrl;
        unsigned long idx, max;

        ctrl = &swap_cgroup_ctrl[type];

        for (idx = 0; idx < ctrl->length; idx++) {
                page = alloc_page(GFP_KERNEL | __GFP_ZERO);
                if (!page)
                        goto not_enough_page;
                ctrl->map[idx] = page;
        }
        return 0;
not_enough_page:
        max = idx;
        for (idx = 0; idx < max; idx++)
                __free_page(ctrl->map[idx]);

        return -ENOMEM;
}

static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
                                        struct swap_cgroup_ctrl **ctrlp)
{
        pgoff_t offset = swp_offset(ent);
        struct swap_cgroup_ctrl *ctrl;
        struct page *mappage;
        struct swap_cgroup *sc;

        ctrl = &swap_cgroup_ctrl[swp_type(ent)];
        if (ctrlp)
                *ctrlp = ctrl;

        mappage = ctrl->map[offset / SC_PER_PAGE];
        sc = page_address(mappage);
        return sc + offset % SC_PER_PAGE;
}
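
The index arithmetic, with hypothetical numbers: for a swap entry at
offset 5000 and SC_PER_PAGE = 2048, the record lives on map page
5000 / 2048 = 2, at slot 5000 % 2048 = 904 within that page.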

/**
 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
 * @ent: swap entry to be cmpxchged
 * @old: old id
 * @new: new id
 *
 * Returns the old id on success, 0 on failure.
 * (No mem_cgroup uses 0 as its id.)
 */
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
                                        unsigned short old, unsigned short new)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned long flags;
        unsigned short retval;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        retval = sc->id;
        if (retval == old)
                sc->id = new;
        else
                retval = 0;
        spin_unlock_irqrestore(&ctrl->lock, flags);
        return retval;
}
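
A hedged usage sketch (editor's illustration; the real caller in this
era is memcg's charge-moving path in mm/memcontrol.c): atomically
retarget a swapped-out entry's owner while tolerating a concurrent
change.

static bool move_swap_charge(swp_entry_t ent, unsigned short from,
                             unsigned short to)
{
        /* Succeeds only if the entry still belonged to "from". */
        return swap_cgroup_cmpxchg(ent, from, to) == from;
}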

/**
 * swap_cgroup_record - record mem_cgroup for this swp_entry.
 * @ent: swap entry to be recorded into
 * @id: mem_cgroup to be recorded
 *
 * Returns the old value on success, 0 on failure.
 * (Of course, the old value can be 0.)
 */
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
{
        struct swap_cgroup_ctrl *ctrl;
        struct swap_cgroup *sc;
        unsigned short old;
        unsigned long flags;

        sc = lookup_swap_cgroup(ent, &ctrl);

        spin_lock_irqsave(&ctrl->lock, flags);
        old = sc->id;
        sc->id = id;
        spin_unlock_irqrestore(&ctrl->lock, flags);

        return old;
}
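
A sketch of the record/clear lifecycle (hypothetical helpers; the
actual callers live in mm/memcontrol.c): the owner id is recorded when
a page is swapped out, then read back and cleared when the slot is
freed.

static void note_swapout(swp_entry_t ent, unsigned short memcg_id)
{
        /* Returns the previous owner; 0 means the slot was unowned. */
        unsigned short old = swap_cgroup_record(ent, memcg_id);

        WARN_ON(old != 0);      /* a fresh slot should be unowned */
}

static unsigned short note_swap_free(swp_entry_t ent)
{
        /* Clear the slot and report who owned it. */
        return swap_cgroup_record(ent, 0);
}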

/**
 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to a swap entry
 * @ent: swap entry to be looked up.
 *
 * Returns the CSS ID of the mem_cgroup on success, 0 on failure
 * (0 is an invalid ID).
 */
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
        return lookup_swap_cgroup(ent, NULL)->id;
}

int swap_cgroup_swapon(int type, unsigned long max_pages)
{
        void *array;
        unsigned long array_size;
        unsigned long length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return 0;

        length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
        array_size = length * sizeof(void *);

        array = vzalloc(array_size);
        if (!array)
                goto nomem;

        ctrl = &swap_cgroup_ctrl[type];
        mutex_lock(&swap_cgroup_mutex);
        ctrl->length = length;
        ctrl->map = array;
        spin_lock_init(&ctrl->lock);
        if (swap_cgroup_prepare(type)) {
                /* memory shortage */
                ctrl->map = NULL;
                ctrl->length = 0;
                mutex_unlock(&swap_cgroup_mutex);
                vfree(array);
                goto nomem;
        }
        mutex_unlock(&swap_cgroup_mutex);

        return 0;
nomem:
        printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
        printk(KERN_INFO
                "swap_cgroup can be disabled by swapaccount=0 boot option\n");
        return -ENOMEM;
}
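
A sizing example with hypothetical numbers: a 4 GiB swap device has
max_pages = 1048576 slots; with SC_PER_PAGE = 2048 that gives
length = DIV_ROUND_UP(1048576, 2048) = 512 map pages (2 MiB of
swap_cgroup records) plus a 512-entry pointer array of 4 KiB.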

void swap_cgroup_swapoff(int type)
{
        struct page **map;
        unsigned long i, length;
        struct swap_cgroup_ctrl *ctrl;

        if (!do_swap_account)
                return;

        mutex_lock(&swap_cgroup_mutex);
        ctrl = &swap_cgroup_ctrl[type];
        map = ctrl->map;
        length = ctrl->length;
        ctrl->map = NULL;
        ctrl->length = 0;
        mutex_unlock(&swap_cgroup_mutex);

        if (map) {
                for (i = 0; i < length; i++) {
                        struct page *page = map[i];
                        if (page)
                                __free_page(page);
                }
                vfree(map);
        }
}

#endif /* CONFIG_MEMCG_SWAP */
