FreeBSD/Linux Kernel Cross Reference
sys/mm/page_alloc.c

    1 /*
    2  *  linux/mm/page_alloc.c
    3  *
    4  *  Manages the free list, the system allocates free pages here.
    5  *  Note that kmalloc() lives in slab.c
    6  *
    7  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
    8  *  Swap reorganised 29.12.95, Stephen Tweedie
    9  *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   10  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
   11  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
   12  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
   13  */
   14 
   15 #include <linux/config.h>
   16 #include <linux/mm.h>
   17 #include <linux/swap.h>
   18 #include <linux/swapctl.h>
   19 #include <linux/interrupt.h>
   20 #include <linux/pagemap.h>
   21 #include <linux/bootmem.h>
   22 #include <linux/slab.h>
   23 #include <linux/module.h>
   24 
   25 int nr_swap_pages;
   26 int nr_active_pages;
   27 int nr_inactive_pages;
   28 LIST_HEAD(inactive_list);
   29 LIST_HEAD(active_list);
   30 pg_data_t *pgdat_list;
   31 
   32 /*
   33  *
   34  * The zone_table array is used to look up the address of the
   35  * struct zone corresponding to a given zone number (ZONE_DMA,
   36  * ZONE_NORMAL, or ZONE_HIGHMEM).
   37  */
   38 zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
   39 EXPORT_SYMBOL(zone_table);
   40 
   41 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
   42 static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
   43 static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
   44 static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
   45 
   46 /*
   47  * Temporary debugging check.
   48  */
   49 #define BAD_RANGE(zone, page)                                           \
   50 (                                                                       \
   51         (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
   52         || (((page) - mem_map) < (zone)->zone_start_mapnr)              \
   53         || ((zone) != page_zone(page))                                  \
   54 )
   55 
   56 /*
   57  * Freeing function for a buddy system allocator.
   58  * Contrary to prior comments, this is *NOT* hairy, and there
   59  * is no reason for anyone not to understand it.
   60  *
   61  * The concept of a buddy system is to maintain direct-mapped tables
   62  * (containing bit values) for memory blocks of various "orders".
   63  * The bottom level table contains the map for the smallest allocatable
   64  * units of memory (here, pages), and each level above it describes
   65  * pairs of units from the levels below, hence, "buddies".
   66  * At a high level, all that happens here is marking the table entry
   67  * at the bottom level available, and propagating the changes upward
   68  * as necessary, plus some accounting needed to play nicely with other
   69  * parts of the VM system.
   70  * At each level, we keep one bit for each pair of blocks, which
   71  * is set to 1 iff only one of the pair is allocated.  So when we
   72  * are allocating or freeing one, we can derive the state of the
   73  * other.  That is, if we allocate a small block, and both were   
   74  * free, the remainder of the region must be split into blocks.   
   75  * If a block is freed, and its buddy is also free, then this
   76  * triggers coalescing into a block of larger size.            
   77  *
   78  * -- wli
   79  */
   80 
   81 static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));
   82 static void __free_pages_ok (struct page *page, unsigned int order)
   83 {
   84         unsigned long index, page_idx, mask, flags;
   85         free_area_t *area;
   86         struct page *base;
   87         zone_t *zone;
   88 
   89         /*
   90          * Yes, think what happens when other parts of the kernel take 
   91          * a reference to a page in order to pin it for io. -ben
   92          */
   93         if (PageLRU(page)) {
   94                 if (unlikely(in_interrupt()))
   95                         BUG();
   96                 lru_cache_del(page);
   97         }
   98 
   99         if (page->buffers)
  100                 BUG();
  101         if (page->mapping)
  102                 BUG();
  103         if (!VALID_PAGE(page))
  104                 BUG();
  105         if (PageLocked(page))
  106                 BUG();
  107         if (PageActive(page))
  108                 BUG();
  109         page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
  110 
  111         if (current->flags & PF_FREE_PAGES)
  112                 goto local_freelist;
  113  back_local_freelist:
  114 
  115         zone = page_zone(page);
  116 
  117         mask = (~0UL) << order;
  118         base = zone->zone_mem_map;
  119         page_idx = page - base;
  120         if (page_idx & ~mask)
  121                 BUG();
  122         index = page_idx >> (1 + order);
  123 
  124         area = zone->free_area + order;
  125 
  126         spin_lock_irqsave(&zone->lock, flags);
  127 
  128         zone->free_pages -= mask;
  129 
  130         while (mask + (1 << (MAX_ORDER-1))) {
  131                 struct page *buddy1, *buddy2;
  132 
  133                 if (area >= zone->free_area + MAX_ORDER)
  134                         BUG();
  135                 if (!__test_and_change_bit(index, area->map))
  136                         /*
  137                          * the buddy page is still allocated.
  138                          */
  139                         break;
  140                 /*
  141                  * Move the buddy up one level.
  142                  * This code is taking advantage of the identity:
  143                  *      -mask = 1+~mask
  144                  */
  145                 buddy1 = base + (page_idx ^ -mask);
  146                 buddy2 = base + page_idx;
  147                 if (BAD_RANGE(zone,buddy1))
  148                         BUG();
  149                 if (BAD_RANGE(zone,buddy2))
  150                         BUG();
  151 
  152                 list_del(&buddy1->list);
  153                 mask <<= 1;
  154                 area++;
  155                 index >>= 1;
  156                 page_idx &= mask;
  157         }
  158         list_add(&(base + page_idx)->list, &area->free_list);
  159 
  160         spin_unlock_irqrestore(&zone->lock, flags);
  161         return;
  162 
  163  local_freelist:
  164         if (current->nr_local_pages)
  165                 goto back_local_freelist;
  166         if (in_interrupt())
  167                 goto back_local_freelist;               
  168 
  169         list_add(&page->list, &current->local_pages);
  170         page->index = order;
  171         current->nr_local_pages++;
  172 }
  173 
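
The comment before __free_pages_ok() describes the buddy bookkeeping abstractly; the key arithmetic in the merge loop is that, with mask = ~0UL << order, the identity -mask == 1 + ~mask == 1UL << order makes page_idx ^ -mask flip exactly the bit that distinguishes a block from its buddy, while masking keeps the index aligned to the merged block. The following stand-alone sketch (user-space C, not kernel code; the starting index is made up) prints the buddy and merged-parent indices for a few orders, pretending every merge succeeds.

/* Stand-alone sketch of the index arithmetic in __free_pages_ok()'s merge
 * loop: locating the buddy with an XOR and the merged parent with a mask.
 * The page index is arbitrary and every merge is assumed to succeed. */
#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 20;            /* hypothetical index into the zone */
        unsigned int order;

        for (order = 0; order < 4; order++) {
                unsigned long mask = ~0UL << order;
                unsigned long buddy_idx = page_idx ^ -mask;     /* -mask == 1UL << order */
                unsigned long parent_idx = page_idx & (mask << 1);

                printf("order %u: block %lu, buddy %lu, merged parent %lu\n",
                       order, page_idx, buddy_idx, parent_idx);
                page_idx = parent_idx;          /* continue as if the buddy was free */
        }
        return 0;
}
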
  174 #define MARK_USED(index, order, area) \
  175         __change_bit((index) >> (1+(order)), (area)->map)
  176 
  177 static inline struct page * expand (zone_t *zone, struct page *page,
  178          unsigned long index, int low, int high, free_area_t * area)
  179 {
  180         unsigned long size = 1 << high;
  181 
  182         while (high > low) {
  183                 if (BAD_RANGE(zone,page))
  184                         BUG();
  185                 area--;
  186                 high--;
  187                 size >>= 1;
  188                 list_add(&(page)->list, &(area)->free_list);
  189                 MARK_USED(index, high, area);
  190                 index += size;
  191                 page += size;
  192         }
  193         if (BAD_RANGE(zone,page))
  194                 BUG();
  195         return page;
  196 }
  197 
  198 static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
  199 static struct page * rmqueue(zone_t *zone, unsigned int order)
  200 {
  201         free_area_t * area = zone->free_area + order;
  202         unsigned int curr_order = order;
  203         struct list_head *head, *curr;
  204         unsigned long flags;
  205         struct page *page;
  206 
  207         spin_lock_irqsave(&zone->lock, flags);
  208         do {
  209                 head = &area->free_list;
  210                 curr = head->next;
  211 
  212                 if (curr != head) {
  213                         unsigned int index;
  214 
  215                         page = list_entry(curr, struct page, list);
  216                         if (BAD_RANGE(zone,page))
  217                                 BUG();
  218                         list_del(curr);
  219                         index = page - zone->zone_mem_map;
  220                         if (curr_order != MAX_ORDER-1)
  221                                 MARK_USED(index, curr_order, area);
  222                         zone->free_pages -= 1UL << order;
  223 
  224                         page = expand(zone, page, index, order, curr_order, area);
  225                         spin_unlock_irqrestore(&zone->lock, flags);
  226 
  227                         set_page_count(page, 1);
  228                         if (BAD_RANGE(zone,page))
  229                                 BUG();
  230                         if (PageLRU(page))
  231                                 BUG();
  232                         if (PageActive(page))
  233                                 BUG();
  234                         return page;    
  235                 }
  236                 curr_order++;
  237                 area++;
  238         } while (curr_order < MAX_ORDER);
  239         spin_unlock_irqrestore(&zone->lock, flags);
  240 
  241         return NULL;
  242 }
  243 
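
rmqueue() takes the first block it finds at the lowest sufficient order, and expand() splits it: at each step down it queues the leading half on the next-lower free list (marking the bitmap via MARK_USED) and keeps the trailing half, so the page finally returned is the highest-addressed piece of the original block. A hypothetical stand-alone model of that split, ignoring locking and the bitmaps, for an order-0 request satisfied from an order-3 block:

/* Simplified model of expand(): split a free block of order 'high' down to
 * the requested order 'low', queueing the leading half at each level and
 * keeping the trailing pages for the caller.  Illustration only. */
#include <stdio.h>

int main(void)
{
        unsigned int low = 0;           /* order requested */
        unsigned int high = 3;          /* order of the free block found */
        unsigned long index = 0;        /* index of that block in the zone */
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("queue remainder: order %u, pages [%lu..%lu]\n",
                       high, index, index + size - 1);
                index += size;          /* advance past the half just queued */
        }
        printf("caller receives: order %u, pages [%lu..%lu]\n",
               low, index, index + size - 1);
        return 0;
}
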
  244 #ifndef CONFIG_DISCONTIGMEM
  245 struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
  246 {
  247         return __alloc_pages(gfp_mask, order,
  248                 contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
  249 }
  250 #endif
  251 
  252 static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
  253 static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
  254 {
  255         struct page * page = NULL;
  256         int __freed = 0;
  257 
  258         if (!(gfp_mask & __GFP_WAIT))
  259                 goto out;
  260         if (in_interrupt())
  261                 BUG();
  262 
  263         current->allocation_order = order;
  264         current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
  265 
  266         __freed = try_to_free_pages_zone(classzone, gfp_mask);
  267 
  268         current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
  269 
  270         if (current->nr_local_pages) {
  271                 struct list_head * entry, * local_pages;
  272                 struct page * tmp;
  273                 int nr_pages;
  274 
  275                 local_pages = &current->local_pages;
  276 
  277                 if (likely(__freed)) {
  278                         /* pick from the last inserted so we're lifo */
  279                         entry = local_pages->next;
  280                         do {
  281                                 tmp = list_entry(entry, struct page, list);
  282                                 if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
  283                                         list_del(entry);
  284                                         current->nr_local_pages--;
  285                                         set_page_count(tmp, 1);
  286                                         page = tmp;
  287 
  288                                         if (page->buffers)
  289                                                 BUG();
  290                                         if (page->mapping)
  291                                                 BUG();
  292                                         if (!VALID_PAGE(page))
  293                                                 BUG();
  294                                         if (PageLocked(page))
  295                                                 BUG();
  296                                         if (PageLRU(page))
  297                                                 BUG();
  298                                         if (PageActive(page))
  299                                                 BUG();
  300                                         if (PageDirty(page))
  301                                                 BUG();
  302 
  303                                         break;
  304                                 }
  305                         } while ((entry = entry->next) != local_pages);
  306                 }
  307 
  308                 nr_pages = current->nr_local_pages;
  309                 /* free in reverse order so that the global order will be lifo */
  310                 while ((entry = local_pages->prev) != local_pages) {
  311                         list_del(entry);
  312                         tmp = list_entry(entry, struct page, list);
  313                         __free_pages_ok(tmp, tmp->index);
  314                         if (!nr_pages--)
  315                                 BUG();
  316                 }
  317                 current->nr_local_pages = 0;
  318         }
  319  out:
  320         *freed = __freed;
  321         return page;
  322 }
  323 
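
balance_classzone() works together with the PF_FREE_PAGES branch of __free_pages_ok(): pages the task frees during direct reclaim are diverted onto current->local_pages instead of the buddy lists. Afterwards the newest entry matching the requested order and class zone is kept for the caller, and the remaining entries are released oldest-first so the most recently freed pages end up at the head of the global free lists, which is the LIFO behaviour the inline comments mention. A hypothetical stand-alone model of that pick/release policy, using a plain array in place of the list:

/* Hypothetical model of balance_classzone()'s per-task page list: scan
 * newest-first for an entry of the wanted order and zone, keep it, and
 * release the others oldest-first.  Illustration only. */
#include <stdio.h>

struct local_page { int order; int zone; };

int main(void)
{
        /* pages in the order they were freed during reclaim (oldest first);
         * the orders and zone numbers are made up */
        struct local_page local[] = { { 0, 1 }, { 2, 1 }, { 0, 0 }, { 0, 1 } };
        int n = sizeof(local) / sizeof(local[0]);
        int wanted_order = 0, wanted_zone = 1;
        int i, kept = -1;

        /* "pick from the last inserted so we're lifo" */
        for (i = n - 1; i >= 0; i--) {
                if (local[i].order == wanted_order && local[i].zone == wanted_zone) {
                        kept = i;
                        printf("keep entry %d for the caller\n", i);
                        break;
                }
        }

        /* "free in reverse order so that the global order will be lifo":
         * oldest entries go back first, the newest ends up at the list head */
        for (i = 0; i < n; i++)
                if (i != kept)
                        printf("return entry %d (order %d) to the buddy lists\n",
                               i, local[i].order);
        return 0;
}
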
  324 /*
  325  * This is the 'heart' of the zoned buddy allocator:
  326  */
  327 struct page * __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
  328 {
  329         unsigned long min;
  330         zone_t **zone, * classzone;
  331         struct page * page;
  332         int freed;
  333 
  334         zone = zonelist->zones;
  335         classzone = *zone;
  336         if (classzone == NULL)
  337                 return NULL;
  338         min = 1UL << order;
  339         for (;;) {
  340                 zone_t *z = *(zone++);
  341                 if (!z)
  342                         break;
  343 
  344                 min += z->pages_low;
  345                 if (z->free_pages > min) {
  346                         page = rmqueue(z, order);
  347                         if (page)
  348                                 return page;
  349                 }
  350         }
  351 
  352         classzone->need_balance = 1;
  353         mb();
  354         if (waitqueue_active(&kswapd_wait))
  355                 wake_up_interruptible(&kswapd_wait);
  356 
  357         zone = zonelist->zones;
  358         min = 1UL << order;
  359         for (;;) {
  360                 unsigned long local_min;
  361                 zone_t *z = *(zone++);
  362                 if (!z)
  363                         break;
  364 
  365                 local_min = z->pages_min;
  366                 if (!(gfp_mask & __GFP_WAIT))
  367                         local_min >>= 2;
  368                 min += local_min;
  369                 if (z->free_pages > min) {
  370                         page = rmqueue(z, order);
  371                         if (page)
  372                                 return page;
  373                 }
  374         }
  375 
  376         /* here we're in the low on memory slow path */
  377 
  378 rebalance:
  379         if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
  380                 zone = zonelist->zones;
  381                 for (;;) {
  382                         zone_t *z = *(zone++);
  383                         if (!z)
  384                                 break;
  385 
  386                         page = rmqueue(z, order);
  387                         if (page)
  388                                 return page;
  389                 }
  390                 return NULL;
  391         }
  392 
  393         /* Atomic allocations - we can't balance anything */
  394         if (!(gfp_mask & __GFP_WAIT))
  395                 return NULL;
  396 
  397         page = balance_classzone(classzone, gfp_mask, order, &freed);
  398         if (page)
  399                 return page;
  400 
  401         zone = zonelist->zones;
  402         min = 1UL << order;
  403         for (;;) {
  404                 zone_t *z = *(zone++);
  405                 if (!z)
  406                         break;
  407 
  408                 min += z->pages_min;
  409                 if (z->free_pages > min) {
  410                         page = rmqueue(z, order);
  411                         if (page)
  412                                 return page;
  413                 }
  414         }
  415 
  416         /* Don't let big-order allocations loop */
  417         if (order > 3)
  418                 return NULL;
  419 
  420         /* Yield for kswapd, and try again */
  421         yield();
  422         goto rebalance;
  423 }
  424 
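
The three zonelist scans in __alloc_pages() apply progressively lower thresholds: the first requires free_pages to exceed an accumulated pages_low, the second uses pages_min (quartered for callers that cannot sleep), and the post-reclaim scan uses pages_min again. Because min accumulates across the zonelist, a fallback zone must hold noticeably more free memory before it is tapped. A hedged sketch of the first scan, with invented numbers:

/* Sketch of the first zonelist scan in __alloc_pages(): 'min' starts at
 * 1 << order and accumulates each zone's pages_low, so later (fallback)
 * zones need correspondingly more free pages.  All numbers are invented. */
#include <stdio.h>

struct zone_model { const char *name; unsigned long free_pages, pages_low; };

int main(void)
{
        struct zone_model zonelist[] = {
                { "HighMem",  100,  500 },
                { "Normal",  2000, 1000 },      /* checked against 1 + 500 + 1000 */
                { "DMA",      300,  128 },
        };
        unsigned int order = 0;
        unsigned long min = 1UL << order;
        int i;

        for (i = 0; i < 3; i++) {
                min += zonelist[i].pages_low;
                if (zonelist[i].free_pages > min) {
                        printf("allocate from %s (free %lu > min %lu)\n",
                               zonelist[i].name, zonelist[i].free_pages, min);
                        return 0;
                }
                printf("skip %s (free %lu <= min %lu)\n",
                       zonelist[i].name, zonelist[i].free_pages, min);
        }
        printf("first scan failed: wake kswapd and retry with lower watermarks\n");
        return 0;
}
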
  425 /*
  426  * Common helper functions.
  427  */
  428 unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
  429 {
  430         struct page * page;
  431 
  432         page = alloc_pages(gfp_mask, order);
  433         if (!page)
  434                 return 0;
  435         return (unsigned long) page_address(page);
  436 }
  437 
  438 unsigned long get_zeroed_page(unsigned int gfp_mask)
  439 {
  440         struct page * page;
  441 
  442         page = alloc_pages(gfp_mask, 0);
  443         if (page) {
  444                 void *address = page_address(page);
  445                 clear_page(address);
  446                 return (unsigned long) address;
  447         }
  448         return 0;
  449 }
  450 
  451 void __free_pages(struct page *page, unsigned int order)
  452 {
  453         if (!PageReserved(page) && put_page_testzero(page))
  454                 __free_pages_ok(page, order);
  455 }
  456 
  457 void free_pages(unsigned long addr, unsigned int order)
  458 {
  459         if (addr != 0)
  460                 __free_pages(virt_to_page(addr), order);
  461 }
  462 
  463 /*
  464  * Total amount of free (allocatable) RAM:
  465  */
  466 unsigned int nr_free_pages (void)
  467 {
  468         unsigned int sum = 0;
  469         zone_t *zone;
  470 
  471         for_each_zone(zone)
  472                 sum += zone->free_pages;
  473 
  474         return sum;
  475 }
  476 
  477 /*
  478  * Amount of free RAM allocatable as buffer memory:
  479  */
  480 unsigned int nr_free_buffer_pages (void)
  481 {
  482         pg_data_t *pgdat;
  483         unsigned int sum = 0;
  484 
  485         for_each_pgdat(pgdat) {
  486                 zonelist_t *zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
  487                 zone_t **zonep = zonelist->zones;
  488                 zone_t *zone;
  489 
  490                 for (zone = *zonep++; zone; zone = *zonep++) {
  491                         unsigned long size = zone->size;
  492                         unsigned long high = zone->pages_high;
  493                         if (size > high)
  494                                 sum += size - high;
  495                 }
  496         }
  497 
  498         return sum;
  499 }
  500 
  501 #if CONFIG_HIGHMEM
  502 unsigned int nr_free_highpages (void)
  503 {
  504         pg_data_t *pgdat;
  505         unsigned int pages = 0;
  506 
  507         for_each_pgdat(pgdat)
  508                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
  509 
  510         return pages;
  511 }
  512 #endif
  513 
  514 #define K(x) ((x) << (PAGE_SHIFT-10))
  515 
  516 /*
  517  * Show free area list (used inside shift_scroll-lock stuff)
  518  * We also calculate the percentage fragmentation. We do this by counting the
  519  * memory on each free list with the exception of the first item on the list.
  520  */
  521 void show_free_areas_core(pg_data_t *pgdat)
  522 {
  523         unsigned int order;
  524         unsigned type;
  525         pg_data_t *tmpdat = pgdat;
  526 
  527         printk("Free pages:      %6dkB (%6dkB HighMem)\n",
  528                 K(nr_free_pages()),
  529                 K(nr_free_highpages()));
  530 
  531         while (tmpdat) {
  532                 zone_t *zone;
  533                 for (zone = tmpdat->node_zones;
  534                                 zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
  535                         printk("Zone:%s freepages:%6lukB min:%6lukB low:%6lukB " 
  536                                        "high:%6lukB\n", 
  537                                         zone->name,
  538                                         K(zone->free_pages),
  539                                         K(zone->pages_min),
  540                                         K(zone->pages_low),
  541                                         K(zone->pages_high));
  542                         
  543                 tmpdat = tmpdat->node_next;
  544         }
  545 
  546         printk("( Active: %d, inactive: %d, free: %d )\n",
  547                nr_active_pages,
  548                nr_inactive_pages,
  549                nr_free_pages());
  550 
  551         for (type = 0; type < MAX_NR_ZONES; type++) {
  552                 struct list_head *head, *curr;
  553                 zone_t *zone = pgdat->node_zones + type;
  554                 unsigned long nr, total, flags;
  555 
  556                 total = 0;
  557                 if (zone->size) {
  558                         spin_lock_irqsave(&zone->lock, flags);
  559                         for (order = 0; order < MAX_ORDER; order++) {
  560                                 head = &(zone->free_area + order)->free_list;
  561                                 curr = head;
  562                                 nr = 0;
  563                                 for (;;) {
  564                                         if ((curr = curr->next) == head)
  565                                                 break;
  566                                         nr++;
  567                                 }
  568                                 total += nr * (1 << order);
  569                                 printk("%lu*%lukB ", nr, K(1UL) << order);
  570                         }
  571                         spin_unlock_irqrestore(&zone->lock, flags);
  572                 }
  573                 printk("= %lukB)\n", K(total));
  574         }
  575 
  576 #ifdef SWAP_CACHE_INFO
  577         show_swap_cache_info();
  578 #endif  
  579 }
  580 
  581 void show_free_areas(void)
  582 {
  583         show_free_areas_core(pgdat_list);
  584 }
  585 
  586 /*
  587  * Builds allocation fallback zone lists.
  588  */
  589 static inline void build_zonelists(pg_data_t *pgdat)
  590 {
  591         int i, j, k;
  592 
  593         for (i = 0; i <= GFP_ZONEMASK; i++) {
  594                 zonelist_t *zonelist;
  595                 zone_t *zone;
  596 
  597                 zonelist = pgdat->node_zonelists + i;
  598                 memset(zonelist, 0, sizeof(*zonelist));
  599 
  600                 j = 0;
  601                 k = ZONE_NORMAL;
  602                 if (i & __GFP_HIGHMEM)
  603                         k = ZONE_HIGHMEM;
  604                 if (i & __GFP_DMA)
  605                         k = ZONE_DMA;
  606 
  607                 switch (k) {
  608                         default:
  609                                 BUG();
  610                         /*
  611                          * fallthrough:
  612                          */
  613                         case ZONE_HIGHMEM:
  614                                 zone = pgdat->node_zones + ZONE_HIGHMEM;
  615                                 if (zone->size) {
  616 #ifndef CONFIG_HIGHMEM
  617                                         BUG();
  618 #endif
  619                                         zonelist->zones[j++] = zone;
  620                                 }
  621                         case ZONE_NORMAL:
  622                                 zone = pgdat->node_zones + ZONE_NORMAL;
  623                                 if (zone->size)
  624                                         zonelist->zones[j++] = zone;
  625                         case ZONE_DMA:
  626                                 zone = pgdat->node_zones + ZONE_DMA;
  627                                 if (zone->size)
  628                                         zonelist->zones[j++] = zone;
  629                 }
  630                 zonelist->zones[j++] = NULL;
  631         } 
  632 }
  633 
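
The switch in build_zonelists() deliberately falls through, so the zonelist for each GFP zone modifier starts at the most capable zone the caller accepts and falls back toward ZONE_DMA. Assuming all three zones are populated, that yields { HighMem, Normal, DMA } for __GFP_HIGHMEM requests, { Normal, DMA } for the default case, and { DMA } alone for __GFP_DMA. A stand-alone sketch of the same fallthrough (zone presence is assumed; sizes are not checked):

/* Stand-alone sketch mirroring build_zonelists()'s switch fallthrough:
 * start at the most preferred zone for the given modifier and fall
 * through to the more constrained zones below it. */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

static void print_zonelist(int k)
{
        static const char *names[] = { "DMA", "Normal", "HighMem" };

        printf("start at %s:", names[k]);
        switch (k) {
        case ZONE_HIGHMEM:
                printf(" HighMem");
                /* fall through */
        case ZONE_NORMAL:
                printf(" Normal");
                /* fall through */
        case ZONE_DMA:
                printf(" DMA");
        }
        printf("\n");
}

int main(void)
{
        print_zonelist(ZONE_HIGHMEM);   /* __GFP_HIGHMEM allocations   */
        print_zonelist(ZONE_NORMAL);    /* default (e.g. GFP_KERNEL)   */
        print_zonelist(ZONE_DMA);       /* __GFP_DMA allocations       */
        return 0;
}
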
  634 /*
  635  * Helper functions to size the waitqueue hash table.
  636  * Essentially these want to choose hash table sizes sufficiently
  637  * large so that collisions trying to wait on pages are rare.
  638  * But in fact, the number of active page waitqueues on typical
  639  * systems is ridiculously low, less than 200. So this is even
  640  * conservative, even though it seems large.
  641  *
  642  * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
  643  * waitqueues, i.e. the size of the waitq table given the number of pages.
  644  */
  645 #define PAGES_PER_WAITQUEUE     256
  646 
  647 static inline unsigned long wait_table_size(unsigned long pages)
  648 {
  649         unsigned long size = 1;
  650 
  651         pages /= PAGES_PER_WAITQUEUE;
  652 
  653         while (size < pages)
  654                 size <<= 1;
  655 
  656         /*
  657          * Once we have dozens or even hundreds of threads sleeping
  658          * on IO we've got bigger problems than wait queue collision.
  659          * Limit the size of the wait table to a reasonable size.
  660          */
  661         size = min(size, 4096UL);
  662 
  663         return size;
  664 }
  665 
  666 /*
  667  * This is an integer logarithm so that shifts can be used later
  668  * to extract the more random high bits from the multiplicative
  669  * hash function before the remainder is taken.
  670  */
  671 static inline unsigned long wait_table_bits(unsigned long size)
  672 {
  673         return ffz(~size);
  674 }
  675 
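
wait_table_size() rounds pages / PAGES_PER_WAITQUEUE up to a power of two and caps it at 4096 entries, and wait_table_bits() recovers log2 of that size (ffz(~size) is the position of the lowest set bit of size) so the zone can later shift a multiplicative hash down to that many bits. A hedged stand-alone example, assuming a zone of 131072 pages (512 MB with 4 KB pages); the second helper below is a user-space stand-in for the kernel's ffz():

/* Hedged re-implementation of the sizing helpers for illustration only. */
#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_size(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;
        while (size < pages)
                size <<= 1;
        return size < 4096UL ? size : 4096UL;   /* same cap as min(size, 4096UL) */
}

static unsigned long log2_of_pow2(unsigned long size)
{
        /* stands in for ffz(~size): the lowest set bit of size, which is
         * log2(size) because size is always a power of two here */
        unsigned long bits = 0;

        while (!(size & 1UL)) {
                size >>= 1;
                bits++;
        }
        return bits;
}

int main(void)
{
        unsigned long pages = 131072;   /* hypothetical 512 MB zone, 4 KB pages */
        unsigned long size = wait_table_size(pages);

        printf("%lu pages -> %lu wait queues, %lu hash bits\n",
               pages, size, log2_of_pow2(size));
        return 0;
}
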
  676 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
  677 
  678 /*
  679  * Set up the zone data structures:
  680  *   - mark all pages reserved
  681  *   - mark all memory queues empty
  682  *   - clear the memory bitmaps
  683  */
  684 void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
  685         unsigned long *zones_size, unsigned long zone_start_paddr, 
  686         unsigned long *zholes_size, struct page *lmem_map)
  687 {
  688         unsigned long i, j;
  689         unsigned long map_size;
  690         unsigned long totalpages, offset, realtotalpages;
  691         const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
  692 
  693         if (zone_start_paddr & ~PAGE_MASK)
  694                 BUG();
  695 
  696         totalpages = 0;
  697         for (i = 0; i < MAX_NR_ZONES; i++) {
  698                 unsigned long size = zones_size[i];
  699                 totalpages += size;
  700         }
  701         realtotalpages = totalpages;
  702         if (zholes_size)
  703                 for (i = 0; i < MAX_NR_ZONES; i++)
  704                         realtotalpages -= zholes_size[i];
  705                         
  706         printk("On node %d totalpages: %lu\n", nid, realtotalpages);
  707 
  708         /*
   709  * Some architectures (with lots of mem and discontiguous memory
  710          * maps) have to search for a good mem_map area:
  711          * For discontigmem, the conceptual mem map array starts from 
  712          * PAGE_OFFSET, we need to align the actual array onto a mem map 
  713          * boundary, so that MAP_NR works.
  714          */
  715         map_size = (totalpages + 1)*sizeof(struct page);
  716         if (lmem_map == (struct page *)0) {
  717                 lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
  718                 lmem_map = (struct page *)(PAGE_OFFSET + 
  719                         MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
  720         }
  721         *gmap = pgdat->node_mem_map = lmem_map;
  722         pgdat->node_size = totalpages;
  723         pgdat->node_start_paddr = zone_start_paddr;
  724         pgdat->node_start_mapnr = (lmem_map - mem_map);
  725         pgdat->nr_zones = 0;
  726 
  727         offset = lmem_map - mem_map;    
  728         for (j = 0; j < MAX_NR_ZONES; j++) {
  729                 zone_t *zone = pgdat->node_zones + j;
  730                 unsigned long mask;
  731                 unsigned long size, realsize;
  732 
  733                 zone_table[nid * MAX_NR_ZONES + j] = zone;
  734                 realsize = size = zones_size[j];
  735                 if (zholes_size)
  736                         realsize -= zholes_size[j];
  737 
  738                 printk("zone(%lu): %lu pages.\n", j, size);
  739                 zone->size = size;
  740                 zone->name = zone_names[j];
  741                 zone->lock = SPIN_LOCK_UNLOCKED;
  742                 zone->zone_pgdat = pgdat;
  743                 zone->free_pages = 0;
  744                 zone->need_balance = 0;
  745                 if (!size)
  746                         continue;
  747 
  748                 /*
  749                  * The per-page waitqueue mechanism uses hashed waitqueues
  750                  * per zone.
  751                  */
  752                 zone->wait_table_size = wait_table_size(size);
  753                 zone->wait_table_shift =
  754                         BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
  755                 zone->wait_table = (wait_queue_head_t *)
  756                         alloc_bootmem_node(pgdat, zone->wait_table_size
  757                                                 * sizeof(wait_queue_head_t));
  758 
  759                 for(i = 0; i < zone->wait_table_size; ++i)
  760                         init_waitqueue_head(zone->wait_table + i);
  761 
  762                 pgdat->nr_zones = j+1;
  763 
  764                 mask = (realsize / zone_balance_ratio[j]);
  765                 if (mask < zone_balance_min[j])
  766                         mask = zone_balance_min[j];
  767                 else if (mask > zone_balance_max[j])
  768                         mask = zone_balance_max[j];
  769                 zone->pages_min = mask;
  770                 zone->pages_low = mask*2;
  771                 zone->pages_high = mask*3;
  772 
  773                 zone->zone_mem_map = mem_map + offset;
  774                 zone->zone_start_mapnr = offset;
  775                 zone->zone_start_paddr = zone_start_paddr;
  776 
  777                 if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
  778                         printk("BUG: wrong zone alignment, it will crash\n");
  779 
  780                 /*
  781                  * Initially all pages are reserved - free ones are freed
  782                  * up by free_all_bootmem() once the early boot process is
  783                  * done. Non-atomic initialization, single-pass.
  784                  */
  785                 for (i = 0; i < size; i++) {
  786                         struct page *page = mem_map + offset + i;
  787                         set_page_zone(page, nid * MAX_NR_ZONES + j);
  788                         set_page_count(page, 0);
  789                         SetPageReserved(page);
  790                         INIT_LIST_HEAD(&page->list);
  791                         if (j != ZONE_HIGHMEM)
  792                                 set_page_address(page, __va(zone_start_paddr));
  793                         zone_start_paddr += PAGE_SIZE;
  794                 }
  795 
  796                 offset += size;
  797                 for (i = 0; ; i++) {
  798                         unsigned long bitmap_size;
  799 
  800                         INIT_LIST_HEAD(&zone->free_area[i].free_list);
  801                         if (i == MAX_ORDER-1) {
  802                                 zone->free_area[i].map = NULL;
  803                                 break;
  804                         }
  805 
  806                         /*
  807                          * Page buddy system uses "index >> (i+1)",
  808                          * where "index" is at most "size-1".
  809                          *
  810                          * The extra "+3" is to round down to byte
  811                          * size (8 bits per byte assumption). Thus
  812                          * we get "(size-1) >> (i+4)" as the last byte
  813                          * we can access.
  814                          *
  815                          * The "+1" is because we want to round the
  816                          * byte allocation up rather than down. So
  817                          * we should have had a "+7" before we shifted
  818                          * down by three. Also, we have to add one as
  819                          * we actually _use_ the last bit (it's [0,n]
  820                          * inclusive, not [0,n[).
  821                          *
  822                          * So we actually had +7+1 before we shift
  823                          * down by 3. But (n+8) >> 3 == (n >> 3) + 1
  824                          * (modulo overflows, which we do not have).
  825                          *
  826                          * Finally, we LONG_ALIGN because all bitmap
  827                          * operations are on longs.
  828                          */
  829                         bitmap_size = (size-1) >> (i+4);
  830                         bitmap_size = LONG_ALIGN(bitmap_size+1);
  831                         zone->free_area[i].map = 
  832                           (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
  833                 }
  834         }
  835         build_zonelists(pgdat);
  836 }
  837 
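
Two of the calculations in free_area_init_core() are easier to follow with concrete numbers. With the default zone_balance_ratio of 128 and a hypothetical zone of 65536 usable pages, mask = 65536 / 128 = 512 is clamped to zone_balance_max = 255, giving pages_min = 255, pages_low = 510 and pages_high = 765. For the buddy bitmaps, order i keeps one bit per pair of order-i blocks, so order 0 needs (65536 - 1) >> 4 = 4095 bytes, plus one and LONG_ALIGNed, i.e. 4096 bytes. A stand-alone sketch of both computations (MAX_ORDER assumed to be 10, as in this kernel):

/* Hedged worked example of the watermark and buddy-bitmap sizing done in
 * free_area_init_core(), for a hypothetical zone of 65536 usable pages.
 * The constants mirror the defaults above (ratio 128, clamp [20, 255]). */
#include <stdio.h>

#define LONG_ALIGN(x) (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
#define MAX_ORDER 10

int main(void)
{
        unsigned long realsize = 65536, size = 65536;
        unsigned long mask = realsize / 128;
        unsigned long i;

        if (mask < 20)
                mask = 20;
        else if (mask > 255)
                mask = 255;
        printf("pages_min=%lu pages_low=%lu pages_high=%lu\n",
               mask, mask * 2, mask * 3);

        for (i = 0; i < MAX_ORDER - 1; i++) {
                unsigned long bitmap_size = (size - 1) >> (i + 4);
                bitmap_size = LONG_ALIGN(bitmap_size + 1);
                printf("order %lu bitmap: %lu bytes\n", i, bitmap_size);
        }
        return 0;
}
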
  838 void __init free_area_init(unsigned long *zones_size)
  839 {
  840         free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
  841 }
  842 
  843 static int __init setup_mem_frac(char *str)
  844 {
  845         int j = 0;
  846 
  847         while (get_option(&str, &zone_balance_ratio[j++]) == 2);
  848         printk("setup_mem_frac: ");
  849         for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
  850         printk("\n");
  851         return 1;
  852 }
  853 
  854 __setup("memfrac=", setup_mem_frac);

This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.