Newer
Older
/*
 * Set up the zonelist performance cache for every zonelist of @pgdat
 * (see mmzone.h): point zlcache_ptr at the embedded cache, clear the
 * fullzones bitmap and record, for each slot of the zonelist, the node
 * id of the zone stored in that slot.
 */
static void build_zonelist_cache(pg_data_t *pgdat)
{
	int type;

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct zonelist *zl = &pgdat->node_zonelists[type];
		struct zonelist_cache *cache = &zl->zlcache;
		int slot;

		zl->zlcache_ptr = cache;
		bitmap_zero(cache->fullzones, MAX_ZONES_PER_ZONELIST);

		/* zones[] is walked up to its NULL terminator */
		for (slot = 0; zl->zones[slot]; slot++)
			cache->z_to_n[slot] = zone_to_nid(zl->zones[slot]);
	}
}
/*
 * Select the zonelist ordering. This variant has nothing to choose from:
 * it unconditionally sets current_zonelist_order to ZONELIST_ORDER_ZONE.
 */
static void set_zonelist_order(void)
{
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
/*
 * Build every zonelist of @pgdat. For each zone type the list starts with
 * the node's own zones and is then extended with the zones of every other
 * online node; the list is terminated with a NULL entry.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int node, local_node;
	enum zone_type i, j;

	/* The node the lists belong to; the two walks below are relative to it. */
	local_node = pgdat->node_id;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		struct zonelist *zonelist;

		zonelist = pgdat->node_zonelists + i;

		j = build_zonelists_node(pgdat, zonelist, 0, i);
		/*
		 * Now we build the zonelist so that it contains the zones
		 * of all the other nodes.
		 * We don't want to pressure a particular node, so when
		 * building the zones for node N, we make sure that the
		 * zones coming right after the local ones are those from
		 * node N+1 (modulo N)
		 */
		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
		}
		for (node = 0; node < local_node; node++) {
			if (!node_online(node))
				continue;
			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
		}

		zonelist->zones[j] = NULL;
	}
}
/*
 * Non-NUMA variant of the zonelist performance cache: there is no cache,
 * so every zonelist of @pgdat simply gets a NULL zlcache_ptr.
 */
static void build_zonelist_cache(pg_data_t *pgdat)
{
	struct zonelist *zl = pgdat->node_zonelists;
	struct zonelist *end = zl + MAX_NR_ZONES;

	for (; zl < end; zl++)
		zl->zlcache_ptr = NULL;
}
Yasunori Goto
committed
/* return values int ....just for stop_machine_run() */
static int __build_all_zonelists(void *dummy)
Yasunori Goto
committed
int nid;
for_each_online_node(nid) {
Yasunori Goto
committed
build_zonelists(NODE_DATA(nid));
build_zonelist_cache(NODE_DATA(nid));
}
Yasunori Goto
committed
return 0;
}
void build_all_zonelists(void)
Yasunori Goto
committed
{
set_zonelist_order();
Yasunori Goto
committed
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
Yasunori Goto
committed
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guaranntee there is no user
of zonelist */
stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
printk("Built %i zonelists in %s order. Total pages: %ld\n",
num_online_nodes(),
zonelist_order_name[current_zonelist_order],
vm_total_pages);
#ifdef CONFIG_NUMA
printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
* large so that collisions trying to wait on pages are rare.
* But in fact, the number of active page waitqueues on typical
* systems is ridiculously low, less than 200. So this is even
* conservative, even though it seems large.
*
* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
* waitqueues, i.e. the size of the waitq table given the number of pages.
*/
#define PAGES_PER_WAITQUEUE 256
Yasunori Goto
committed
#ifndef CONFIG_MEMORY_HOTPLUG
Yasunori Goto
committed
/*
 * Size the wait table for a zone of @pages pages: the smallest power of
 * two that is not below pages / PAGES_PER_WAITQUEUE, clamped to the
 * range [4, 4096].
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	unsigned long wanted = pages / PAGES_PER_WAITQUEUE;
	unsigned long entries;

	/* Round up to a power of two. */
	for (entries = 1; entries < wanted; entries <<= 1)
		;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	entries = min(entries, 4096UL);

	return max(entries, 4UL);
}
Yasunori Goto
committed
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
#else
/*
 * Memory hot-add can change a zone's size after boot, so a wait_table
 * sized from the current page count could turn out too small. This
 * variant therefore always asks for the maximum table size.
 *
 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
 *
 * i386 (preemption config)    : 4096 x 16 = 64Kbyte.
 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
 * ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
 *
 * The size-based variant reaches this maximum once a zone holds
 * (512K + 256) pages or more, which equals:
 *
 * i386, x86-64, powerpc(4K page size) : = ( 2G +  1M)byte.
 * ia64(16K page size)                 : = ( 8G +  4M)byte.
 * powerpc (64K page size)             : = (32G + 16M)byte.
 */
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
	/* @pages is deliberately ignored, see above. */
	const unsigned long max_entries = 4096UL;

	return max_entries;
}
#endif
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 *
 * @size: number of wait table entries, as produced by
 *        wait_table_hash_nr_entries().
 *
 * NOTE(review): presumably ffz() returns the index of the first zero bit,
 * which makes ffz(~size) the index of the lowest set bit of @size, i.e.
 * log2(size) when @size is a power of two - confirm against ffz().
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
return ffz(~size);
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 *
 * @size:      number of page frames to initialise
 * @nid:       node the pages are linked to
 * @zone:      zone index the pages are linked to
 * @start_pfn: first page frame number of the range
 * @context:   MEMMAP_EARLY enables the hole / node checks below
 *
 * NOTE(review): this body was recovered from a copy that had lost its
 * braces, the declaration of 'page' and the closing #endif. Other
 * statements may have been lost the same way - compare against upstream.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
		unsigned long start_pfn, enum memmap_context context)
{
	struct page *page;
	unsigned long end_pfn = start_pfn + size;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		/*
		 * There can be holes in boot-time mem_map[]s
		 * handed to this function. They do not
		 * exist on hotplugged memory.
		 */
		if (context == MEMMAP_EARLY) {
			if (!early_pfn_valid(pfn))
				continue;
			if (!early_pfn_in_nid(pfn, nid))
				continue;
		}
		page = pfn_to_page(pfn);
		set_page_links(page, zone, nid, pfn);
		reset_page_mapcount(page);
		SetPageReserved(page);
		INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
		if (!is_highmem_idx(zone))
			set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
	}
}
/*
 * Reset the free lists of @zone: for every order below MAX_ORDER the
 * free list is initialised empty and its free count set to zero.
 * @pgdat and @size are not used here.
 */
static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
				struct zone *zone, unsigned long size)
{
	int order = 0;

	while (order < MAX_ORDER) {
		INIT_LIST_HEAD(&zone->free_area[order].free_list);
		zone->free_area[order].nr_free = 0;
		order++;
	}
}
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
/*
 * Compute the per-cpu pageset batch size for @zone from its
 * present_pages. The result is always of the form 2^n - 1.
 */
static int __devinit zone_batchsize(struct zone *zone)
{
	int batch;

	/*
	 * The per-cpu-pages pools are set to around 1000th of the
	 * size of the zone. But no more than 1/2 of a meg.
	 *
	 * OK, so we don't know how big the cache is. So guess.
	 */
	batch = zone->present_pages / 1024;
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= 4;		/* We effectively *= 4 below */
	if (batch < 1)
		batch = 1;

	/*
	 * Clamp the batch to a 2^n - 1 value. Having a power
	 * of 2 value was found to be more likely to have
	 * suboptimal cache aliasing properties in some cases.
	 * For example if 2 tasks are alternately allocating
	 * batches of pages, one task can end up with a lot
	 * of pages of one half of the possible page colors
	 * and the other with pages of the other colors.
	 */
	batch = (1 << (fls(batch + batch/2)-1)) - 1;

	return batch;
}
/*
 * Initialise the per-cpu pageset @p from @batch. The whole structure is
 * zeroed first; the hot list (pcp[0]) then gets a high mark of 6 * batch
 * and a batch of at least 1, the cold list (pcp[1]) a high mark of
 * 2 * batch and a batch of batch/2, also at least 1.
 */
inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
	struct per_cpu_pages *hot = &p->pcp[0];
	struct per_cpu_pages *cold = &p->pcp[1];

	memset(p, 0, sizeof(*p));

	hot->count = 0;
	hot->high = 6 * batch;
	hot->batch = max(1UL, batch);
	INIT_LIST_HEAD(&hot->list);

	cold->count = 0;
	cold->high = 2 * batch;
	cold->batch = max(1UL, batch / 2);
	INIT_LIST_HEAD(&cold->list);
}
/*
 * setup_pagelist_highmark() sets the high water mark of the hot per-cpu
 * list of pageset @p to @high. The batch becomes high/4, kept within
 * [1, PAGE_SHIFT * 8].
 */
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
				unsigned long high)
{
	struct per_cpu_pages *hot = &p->pcp[0];
	unsigned long quarter = high / 4;

	hot->high = high;
	if (quarter > (PAGE_SHIFT * 8))
		hot->batch = PAGE_SHIFT * 8;
	else
		hot->batch = max(1UL, quarter);
}
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* Some NUMA counter updates may also be caught by the boot pagesets.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
static struct per_cpu_pageset boot_pageset[NR_CPUS];
/*
 * Dynamically allocate memory for the
 * per cpu pageset array in struct zone.
 *
 * For every populated zone a per_cpu_pageset is allocated for @cpu and
 * initialised from the zone's batch size (and from
 * percpu_pagelist_fraction when that is set).
 *
 * Returns 0 on success. On allocation failure every pageset allocated by
 * this call is freed again and -ENOMEM is returned.
 */
static int __cpuinit process_zones(int cpu)
{
	struct zone *zone, *dzone;

	for_each_zone(zone) {

		if (!populated_zone(zone))
			continue;

		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
					 GFP_KERNEL, cpu_to_node(cpu));
		if (!zone_pcp(zone, cpu))
			goto bad;

		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));

		if (percpu_pagelist_fraction)
			setup_pagelist_highmark(zone_pcp(zone, cpu),
				(zone->present_pages / percpu_pagelist_fraction));
	}

	return 0;
bad:
	/* Undo the allocations made before the zone that failed. */
	for_each_zone(dzone) {
		/*
		 * Unpopulated zones were skipped above, so their pageset
		 * pointer was not allocated here and must not be freed.
		 */
		if (!populated_zone(dzone))
			continue;
		if (dzone == zone)
			break;
		kfree(zone_pcp(dzone, cpu));
		zone_pcp(dzone, cpu) = NULL;
	}
	return -ENOMEM;
}
/*
 * Drop the pageset of @cpu in every zone. A pageset that is not the
 * cpu's boot_pageset entry is kfree()d; the zone's pointer is cleared
 * in either case.
 */
static inline void free_zone_pagesets(int cpu)
{
	struct zone *zone;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
		int slab_allocated = (pset != &boot_pageset[cpu]);

		/* boot_pageset entries are static and must never be freed */
		if (slab_allocated)
			kfree(pset);
		zone_pcp(zone, cpu) = NULL;
	}
}
/*
 * CPU notifier callback for the per-cpu pagesets.
 *
 * When a cpu is being prepared (CPU_UP_PREPARE[_FROZEN]) its pagesets are
 * allocated; NOTIFY_BAD is returned if that fails. When bring-up was
 * cancelled or the cpu is dead, the pagesets are freed again. All other
 * actions are ignored.
 *
 * @hcpu carries the cpu number. Returns NOTIFY_OK unless allocation failed.
 */
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
		unsigned long action,
		void *hcpu)
{
	int cpu = (long)hcpu;
	int ret = NOTIFY_OK;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		if (process_zones(cpu))
			ret = NOTIFY_BAD;
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		free_zone_pagesets(cpu);
		break;
	default:
		break;
	}
	return ret;
}
/*
 * Notifier block that hooks pageset_cpuup_callback() into the cpu
 * notifier chain; it is registered by setup_per_cpu_pageset().
 * NOTE(review): the initializer is positional - presumably callback,
 * next pointer and priority, in that order; confirm against
 * struct notifier_block.
 */
static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
/*
 * Allocate the per-cpu pagesets for the cpu this runs on and register
 * the notifier that does the same for every cpu that comes online
 * later. An allocation failure here is fatal (BUG_ON).
 */
void __init setup_per_cpu_pageset(void)
{
	int err = process_zones(smp_processor_id());

	BUG_ON(err);
	register_cpu_notifier(&pageset_notifier);
}
#endif
Yasunori Goto
committed
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
struct pglist_data *pgdat = zone->zone_pgdat;
Yasunori Goto
committed
size_t alloc_size;
/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
Yasunori Goto
committed
zone->wait_table_hash_nr_entries =
wait_table_hash_nr_entries(zone_size_pages);
zone->wait_table_bits =
wait_table_bits(zone->wait_table_hash_nr_entries);
Yasunori Goto
committed
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
alloc_size = zone->wait_table_hash_nr_entries
* sizeof(wait_queue_head_t);
if (system_state == SYSTEM_BOOTING) {
zone->wait_table = (wait_queue_head_t *)
alloc_bootmem_node(pgdat, alloc_size);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
* via memory hot-add.
* But it may be the case that a new node was hot-added. In
* this case vmalloc() will not be able to use this new node's
* memory - this wait_table must be initialized to use this new
* node itself as well.
* To use this new node's memory, further consideration will be
* necessary.
*/
zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
}
if (!zone->wait_table)
return -ENOMEM;
Yasunori Goto
committed
for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
init_waitqueue_head(zone->wait_table + i);
Yasunori Goto
committed
return 0;
/*
 * Initialise the per-cpu pagesets used by @zone for every possible cpu
 * slot. With CONFIG_NUMA the shared boot_pageset entries are set up with
 * a batch of 0 (the slab allocator is not usable yet); otherwise the
 * zone's own pagesets are set up with the zone's batch size.
 */
static __meminit void zone_pcp_init(struct zone *zone)
{
	int cpu;
	unsigned long batch = zone_batchsize(zone);

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
#ifdef CONFIG_NUMA
		/* Early boot. Slab allocator not functional yet */
		setup_pageset(&boot_pageset[cpu],0);
#else
		setup_pageset(zone_pcp(zone,cpu), batch);
#endif
	}
	if (zone->present_pages)
		printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
			zone->name, zone->present_pages, batch);
}
Yasunori Goto
committed
/*
 * Bring a so far empty @zone into use: set up its wait table, extend the
 * node's nr_zones to cover it, record its start pfn, initialise the
 * memmap of the range and reset the zone's free lists.
 *
 * Returns 0 on success or the error from zone_wait_table_init().
 *
 * NOTE(review): @context is not forwarded - memmap_init() as defined in
 * this file always passes MEMMAP_EARLY. Confirm that this is intended
 * for non-boot callers.
 */
__meminit int init_currently_empty_zone(struct zone *zone,
					unsigned long zone_start_pfn,
					unsigned long size,
					enum memmap_context context)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int ret;

	ret = zone_wait_table_init(zone, size);
	if (ret)
		return ret;
	pgdat->nr_zones = zone_idx(zone) + 1;

	zone->zone_start_pfn = zone_start_pfn;

	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);

	zone_init_free_lists(pgdat, zone, zone->spanned_pages);

	return 0;
}
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
/*
 * Basic iterator support. Return the index of the first early_node_map[]
 * entry that belongs to node @nid, or -1 if there is none.
 * Note: nid == MAX_NUMNODES matches the first entry regardless of node.
 */
static int __meminit first_active_region_index_in_nid(int nid)
{
	int idx = 0;

	while (idx < nr_nodemap_entries) {
		if (nid == MAX_NUMNODES || early_node_map[idx].nid == nid)
			return idx;
		idx++;
	}

	return -1;
}
/*
 * Basic iterator support. Return the index of the next early_node_map[]
 * entry after @index that belongs to node @nid, or -1 if there is none.
 * Note: nid == MAX_NUMNODES matches the next entry regardless of node.
 */
static int __meminit next_active_region_index_in_nid(int index, int nid)
{
	const int any_node = (nid == MAX_NUMNODES);

	while (++index < nr_nodemap_entries) {
		if (any_node || early_node_map[index].nid == nid)
			return index;
	}

	return -1;
}
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 * Architectures may implement their own version but if add_active_range()
 * was used and there are no special requirements, this is a convenient
 * alternative.
 *
 * Returns the nid of the early_node_map[] range that contains @pfn, or 0
 * when no registered range contains it.
 */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
	int i;

	for (i = 0; i < nr_nodemap_entries; i++) {
		unsigned long start_pfn = early_node_map[i].start_pfn;
		unsigned long end_pfn = early_node_map[i].end_pfn;

		if (start_pfn <= pfn && pfn < end_pfn)
			return early_node_map[i].nid;
	}

	return 0;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
/*
 * Basic iterator support to walk early_node_map[].
 * @i is set in turn to the index of every entry that belongs to @nid and
 * the loop ends once the lookup helpers return -1. Passing MAX_NUMNODES
 * as @nid visits every entry.
 */
#define for_each_active_range_index_in_nid(i, nid) \
for (i = first_active_region_index_in_nid(nid); i != -1; \
i = next_active_region_index_in_nid(i, nid))
/**
 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_range() contain no holes and may be freed, this
 * function may be used instead of calling free_bootmem() manually.
 * Ranges that start at or above @max_low_pfn are skipped and ranges that
 * cross it are trimmed to end there.
 */
void __init free_bootmem_with_active_regions(int nid,
						unsigned long max_low_pfn)
{
	int i;

	for_each_active_range_index_in_nid(i, nid) {
		unsigned long start_pfn = early_node_map[i].start_pfn;
		unsigned long end_pfn = early_node_map[i].end_pfn;

		/* Nothing of this range lies below the limit */
		if (start_pfn >= max_low_pfn)
			continue;

		/* Trim the part that reaches above the limit */
		if (end_pfn > max_low_pfn)
			end_pfn = max_low_pfn;

		free_bootmem_node(NODE_DATA(early_node_map[i].nid),
				PFN_PHYS(start_pfn),
				(end_pfn - start_pfn) << PAGE_SHIFT);
	}
}
/**
 * sparse_memory_present_with_active_regions - Call memory_present for each active range
 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
 *
 * If an architecture guarantees that all ranges registered with
 * add_active_range() contain no holes and may be freed, this
 * function may be used instead of calling memory_present() manually.
 */
void __init sparse_memory_present_with_active_regions(int nid)
{
	int i;

	for_each_active_range_index_in_nid(i, nid) {
		memory_present(early_node_map[i].nid,
				early_node_map[i].start_pfn,
				early_node_map[i].end_pfn);
	}
}
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
/**
* push_node_boundaries - Push node boundaries to at least the requested boundary
* @nid: The nid of the node to push the boundary for
* @start_pfn: The start pfn of the node
* @end_pfn: The end pfn of the node
*
* In reserve-based hot-add, mem_map is allocated that is unused until hotadd
* time. Specifically, on x86_64, SRAT will report ranges that can potentially
* be hotplugged even though no physical memory exists. This function allows
* an arch to push out the node boundaries so mem_map is allocated that can
* be used later.
*/
#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
/*
 * Widen the recorded boundary of node @nid so that it covers at least
 * [@start_pfn, @end_pfn). A node whose recorded end is still 0 is treated
 * as not yet initialised.
 */
void __init push_node_boundaries(unsigned int nid,
		unsigned long start_pfn, unsigned long end_pfn)
{
	printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
			nid, start_pfn, end_pfn);

	/* First request for this node: let any start pfn lower the start */
	if (!node_boundary_end_pfn[nid])
		node_boundary_start_pfn[nid] = -1UL;

	/* Grow the boundary in either direction, never shrink it */
	if (start_pfn < node_boundary_start_pfn[nid])
		node_boundary_start_pfn[nid] = start_pfn;
	if (end_pfn > node_boundary_end_pfn[nid])
		node_boundary_end_pfn[nid] = end_pfn;
}
/*
 * If a boundary was recorded for node @nid, widen the caller's
 * [*start_pfn, *end_pfn) range so that it covers that boundary (used for
 * reserve hotadd). Without a recorded boundary the range is left alone.
 */
static void __meminit account_node_boundary(unsigned int nid,
		unsigned long *start_pfn, unsigned long *end_pfn)
{
	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
			nid, *start_pfn, *end_pfn);

	/* An end of 0 means no boundary information was provided */
	if (!node_boundary_end_pfn[nid])
		return;

	if (*start_pfn > node_boundary_start_pfn[nid])
		*start_pfn = node_boundary_start_pfn[nid];
	if (*end_pfn < node_boundary_end_pfn[nid])
		*end_pfn = node_boundary_end_pfn[nid];
}
#else
/*
 * Without CONFIG_MEMORY_HOTPLUG_RESERVE no node boundaries are tracked:
 * both helpers are empty and leave their arguments untouched.
 */
void __init push_node_boundaries(unsigned int nid,
unsigned long start_pfn, unsigned long end_pfn) {}
static void __meminit account_node_boundary(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn) {}
#endif
/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
* provided by an arch calling add_active_range(). If called for a node
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
void __meminit get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
int i;
*start_pfn = -1UL;
*end_pfn = 0;
for_each_active_range_index_in_nid(i, nid) {
*start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
}
if (*start_pfn == -1UL) {
printk(KERN_WARNING "Node %u active with no memory\n", nid);
*start_pfn = 0;
}
/* Push the node boundaries out if requested */
account_node_boundary(nid, start_pfn, end_pfn);
}
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
/*
 * Pick the zone that ZONE_MOVABLE pages are taken from and store its
 * index in movable_zone. Zones within a node are assumed to be ordered by
 * increasing memory address, so the search runs from the highest zone
 * index downwards and stops at the first zone other than ZONE_MOVABLE
 * whose highest possible pfn exceeds its lowest possible pfn.
 */
void __init find_usable_zone_for_movable(void)
{
	int zone_index = MAX_NR_ZONES;

	while (--zone_index >= 0) {
		if (zone_index == ZONE_MOVABLE)
			continue;

		if (arch_zone_highest_possible_pfn[zone_index] >
				arch_zone_lowest_possible_pfn[zone_index])
			break;
	}

	/* Falling off the end means no zone has any pfn range at all */
	VM_BUG_ON(zone_index == -1);
	movable_zone = zone_index;
}
/*
 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
 * because it is sized independently of the architecture. Unlike the other
 * zones, the starting point of ZONE_MOVABLE is not fixed; it may differ
 * from node to node depending on the size of each node and on how evenly
 * kernelcore is distributed. This helper adjusts the architecture's zone
 * range for node @nid in place, using the end of the highest usable zone
 * for ZONE_MOVABLE. That keeps the zones within a node ordered by
 * monotonically increasing memory address.
 */
void __meminit adjust_zone_range_for_zone_movable(int nid,
					unsigned long zone_type,
					unsigned long node_start_pfn,
					unsigned long node_end_pfn,
					unsigned long *zone_start_pfn,
					unsigned long *zone_end_pfn)
{
	/* Nothing to adjust unless ZONE_MOVABLE is on this node */
	if (!zone_movable_pfn[nid])
		return;

	/* Size ZONE_MOVABLE itself */
	if (zone_type == ZONE_MOVABLE) {
		*zone_start_pfn = zone_movable_pfn[nid];
		*zone_end_pfn = min(node_end_pfn,
			arch_zone_highest_possible_pfn[movable_zone]);
		return;
	}

	/* ZONE_MOVABLE starts inside this zone: cut the zone off there */
	if (*zone_start_pfn < zone_movable_pfn[nid] &&
			*zone_end_pfn > zone_movable_pfn[nid]) {
		*zone_end_pfn = zone_movable_pfn[nid];
		return;
	}

	/* The whole zone lies within ZONE_MOVABLE: make it empty */
	if (*zone_start_pfn >= zone_movable_pfn[nid])
		*zone_start_pfn = *zone_end_pfn;
}
/*
 * Return the number of pages zone @zone_type spans in node @nid,
 * including holes. The third argument is not used by this variant.
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
static unsigned long __meminit zone_spanned_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long *ignored)
{
	unsigned long node_lo, node_hi;
	unsigned long zone_lo, zone_hi;

	/* Node range first, then the zone range as the arch defines it */
	get_pfn_range_for_nid(nid, &node_lo, &node_hi);
	zone_lo = arch_zone_lowest_possible_pfn[zone_type];
	zone_hi = arch_zone_highest_possible_pfn[zone_type];
	adjust_zone_range_for_zone_movable(nid, zone_type,
				node_lo, node_hi,
				&zone_lo, &zone_hi);

	/* The node has no pages within the zone's range */
	if (zone_hi < node_lo || zone_lo > node_hi)
		return 0;

	/* Clip the zone to the node */
	if (zone_hi > node_hi)
		zone_hi = node_hi;
	if (zone_lo < node_lo)
		zone_lo = node_lo;

	return zone_hi - zone_lo;
}
/*
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
unsigned long __meminit __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
int i = 0;
unsigned long prev_end_pfn = 0, hole_pages = 0;
unsigned long start_pfn;
/* Find the end_pfn of the first active range of pfns in the node */
i = first_active_region_index_in_nid(nid);
if (i == -1)
return 0;
prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
/* Account for ranges before physical memory on this node */
if (early_node_map[i].start_pfn > range_start_pfn)
hole_pages = prev_end_pfn - range_start_pfn;
/* Find all holes for the zone within the node */
for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
/* No need to continue if prev_end_pfn is outside the zone */
if (prev_end_pfn >= range_end_pfn)
break;
/* Make sure the end of the zone is not within the hole */
start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
prev_end_pfn = max(prev_end_pfn, range_start_pfn);
/* Update the hole size cound and move on */
if (start_pfn > range_start_pfn) {
BUG_ON(prev_end_pfn > start_pfn);
hole_pages += start_pfn - prev_end_pfn;
}
prev_end_pfn = early_node_map[i].end_pfn;
}
/* Account for ranges past physical memory on this node */
if (range_end_pfn > prev_end_pfn)
hole_pages += range_end_pfn -
max(range_start_pfn, prev_end_pfn);
return hole_pages;
}
/**
 * absent_pages_in_range - Return number of page frames in holes within a range
 * @start_pfn: The start PFN to start searching for holes
 * @end_pfn: The end PFN to stop searching for holes
 *
 * It returns the number of page frames in memory holes within a range.
 * The search is not limited to one node: MAX_NUMNODES is passed as the nid.
 */
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
{
return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
}
/*
 * Return the number of page frames in holes in zone @zone_type on node
 * @nid. The zone range defined by the architecture is clipped to the
 * node, adjusted for ZONE_MOVABLE and then scanned for holes.
 * The third argument is not used by this variant.
 */
static unsigned long __meminit zone_absent_pages_in_node(int nid,
					unsigned long zone_type,
					unsigned long *ignored)
{
	unsigned long node_start_pfn, node_end_pfn;
	unsigned long zone_start_pfn, zone_end_pfn;

	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
	zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
							node_start_pfn);
	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
							node_end_pfn);

	adjust_zone_range_for_zone_movable(nid, zone_type,
			node_start_pfn, node_end_pfn,
			&zone_start_pfn, &zone_end_pfn);
	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
#else
/*
 * Variant used without CONFIG_ARCH_POPULATES_NODE_MAP: the spanned size
 * of zone @zone_type is read straight from the caller-supplied
 * @zones_size array. @nid is not used.
 */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long *zones_size)
{
return zones_size[zone_type];
}
/*
 * Variant used without CONFIG_ARCH_POPULATES_NODE_MAP: the hole size of
 * zone @zone_type is read from the caller-supplied @zholes_size array.
 * A NULL array means there are no holes. @nid is not used.
 */
static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
						unsigned long zone_type,
						unsigned long *zholes_size)
{
	return zholes_size ? zholes_size[zone_type] : 0;
}
#endif
/*
 * Fill in the page totals of @pgdat: node_spanned_pages is the sum of the
 * spanned pages of all zones, node_present_pages is that sum minus the
 * absent pages (holes) of all zones. The present total is logged.
 */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0, absent = 0, present;
	enum zone_type zt;

	for (zt = 0; zt < MAX_NR_ZONES; zt++)
		spanned += zone_spanned_pages_in_node(pgdat->node_id, zt,
								zones_size);
	pgdat->node_spanned_pages = spanned;

	for (zt = 0; zt < MAX_NR_ZONES; zt++)
		absent += zone_absent_pages_in_node(pgdat->node_id, zt,
								zholes_size);
	present = spanned - absent;
	pgdat->node_present_pages = present;

	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							present);
}
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*/
Yasunori Goto
committed
static void __meminit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
Christoph Lameter
committed
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
Yasunori Goto
committed
int ret;
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust realsize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds realsize %lu\n",
zone_names[j], memmap_pages, realsize);
Christoph Lameter
committed
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
Christoph Lameter
committed
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
nr_all_pages += realsize;
zone->spanned_pages = size;
zone->present_pages = realsize;
Christoph Lameter
committed
#ifdef CONFIG_NUMA
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
Christoph Lameter
committed
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
Christoph Lameter
committed
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->prev_priority = DEF_PRIORITY;
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
INIT_LIST_HEAD(&zone->inactive_list);
zone->nr_scan_active = 0;
zone->nr_scan_inactive = 0;
zap_zone_vm_stats(zone);
atomic_set(&zone->reclaim_in_progress, 0);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
Yasunori Goto
committed
BUG_ON(ret);
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size, start, end;