Newer
Older
for_each_populated_zone(zone) {
show_node(zone);
printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
pageset->pcp.batch, pageset->pcp.count);
printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
" inactive_file:%lu"
//TODO: check/adjust line lengths
#ifdef CONFIG_UNEVICTABLE_LRU
" unevictable:%lu"
#endif
" dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_INACTIVE_FILE),
#ifdef CONFIG_UNEVICTABLE_LRU
global_page_state(NR_UNEVICTABLE),
#endif
global_page_state(NR_FILE_DIRTY),
Christoph Lameter
committed
global_page_state(NR_WRITEBACK),
Christoph Lameter
committed
global_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE));
for_each_populated_zone(zone) {
int i;
show_node(zone);
printk("%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
#ifdef CONFIG_UNEVICTABLE_LRU
" unevictable:%lukB"
#endif
" present:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(zone->pages_min),
K(zone->pages_low),
K(zone->pages_high),
K(zone_page_state(zone, NR_ACTIVE_ANON)),
K(zone_page_state(zone, NR_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)),
#ifdef CONFIG_UNEVICTABLE_LRU
K(zone_page_state(zone, NR_UNEVICTABLE)),
#endif
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %lu", zone->lowmem_reserve[i]);
printk("\n");
}
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
show_node(zone);
printk("%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
nr[order] = zone->free_area[order].nr_free;
total += nr[order] << order;
for (order = 0; order < MAX_ORDER; order++)
printk("%lu*%lukB ", nr[order], K(1UL) << order);
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
Christoph Lameter
committed
*
* Add all populated zones of a node to the zonelist.
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
int nr_zones, enum zone_type zone_type)
Christoph Lameter
committed
struct zone *zone;
BUG_ON(zone_type >= MAX_NR_ZONES);
zone = pgdat->node_zones + zone_type;
Christoph Lameter
committed
if (populated_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
/*
* zonelist_order:
* 0 = automatic detection of better ordering.
* 1 = order by ([node] distance, -zonetype)
* 2 = order by (-zonetype, [node] distance)
*
* If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
* the same zonelist. So only NUMA can configure this param.
*/
#define ZONELIST_ORDER_DEFAULT 0
#define ZONELIST_ORDER_NODE 1
#define ZONELIST_ORDER_ZONE 2
/* zonelist order in the kernel.
* set_zonelist_order() will set this to NODE or ZONE.
*/
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";
/*
* interface for configure zonelist ordering.
* command line option "numa_zonelist_order"
* = "[dD]efault - default, automatic configuration.
* = "[nN]ode - order by node locality, then by zone within node
* = "[zZ]one - order by zone, then by locality within zone
*/
static int __parse_numa_zonelist_order(char *s)
{
if (*s == 'd' || *s == 'D') {
user_zonelist_order = ZONELIST_ORDER_DEFAULT;
} else if (*s == 'n' || *s == 'N') {
user_zonelist_order = ZONELIST_ORDER_NODE;
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
printk(KERN_WARNING
"Ignoring invalid numa_zonelist_order value: "
"%s\n", s);
return -EINVAL;
}
return 0;
}
static __init int setup_numa_zonelist_order(char *s)
{
if (s)
return __parse_numa_zonelist_order(s);
return 0;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
int ret;
if (write)
strncpy(saved_string, (char*)table->data,
NUMA_ZONELIST_ORDER_LEN);
ret = proc_dostring(table, write, file, buffer, length, ppos);
if (ret)
return ret;
if (write) {
int oldval = user_zonelist_order;
if (__parse_numa_zonelist_order((char*)table->data)) {
/*
* bogus value. restore saved string
*/
strncpy((char*)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order)
build_all_zonelists();
}
return 0;
}
static int node_load[MAX_NUMNODES];
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
* It returns -1 if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
for_each_node_state(n, N_HIGH_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
tmp = cpumask_of_node(n);
if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
if (populated_zone(z)) {
zoneref_set_zone(z,
&zonelist->_zonerefs[pos++]);
check_highest_zone(zone_type);
}
}
}
zonelist->_zonerefs[pos].zone = NULL;
zonelist->_zonerefs[pos].zone_idx = 0;
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
}
static int default_zonelist_order(void)
{
int nid, zone_type;
unsigned long low_kmem_size,total_size;
struct zone *z;
int average_size;
/*
* ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
* This function detect ZONE_DMA/DMA32 size and confgigures zone order.
*/
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
low_kmem_size = 0;
total_size = 0;
for_each_online_node(nid) {
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
z = &NODE_DATA(nid)->node_zones[zone_type];
if (populated_zone(z)) {
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
}
}
}
if (!low_kmem_size || /* there are no DMA area. */
low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
return ZONELIST_ORDER_NODE;
/*
* look into each node's config.
* If there is a node whose DMA/DMA32 memory is very big area on
* local memory, NODE_ORDER may be suitable.
*/
average_size = total_size /
(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
z = &NODE_DATA(nid)->node_zones[zone_type];
if (populated_zone(z)) {
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
}
}
if (low_kmem_size &&
total_size > average_size && /* ignore small node */
low_kmem_size > total_size * 70/100)
return ZONELIST_ORDER_NODE;
}
return ZONELIST_ORDER_ZONE;
}
static void set_zonelist_order(void)
{
if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
current_zonelist_order = default_zonelist_order();
else
current_zonelist_order = user_zonelist_order;
}
static void build_zonelists(pg_data_t *pgdat)
{
int j, node, load;
enum zone_type i;
int local_node, prev_node;
struct zonelist *zonelist;
int order = current_zonelist_order;
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist->_zonerefs[0].zone = NULL;
zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = num_online_nodes();
prev_node = local_node;
nodes_clear(used_mask);
memset(node_load, 0, sizeof(node_load));
memset(node_order, 0, sizeof(node_order));
j = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
int distance = node_distance(local_node, node);
/*
* If another node is sufficiently far away then it is better
* to reclaim pages in a zone before going off node.
*/
if (distance > RECLAIM_DISTANCE)
zone_reclaim_mode = 1;
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (distance != node_distance(local_node, prev_node))
node_load[node] = load;
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
node_order[j++] = node; /* remember order */
}
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, j);
build_thisnode_zonelists(pgdat);
/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
struct zonelist *zonelist;
struct zonelist_cache *zlc;
struct zoneref *z;
zonelist = &pgdat->node_zonelists[0];
zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
for (z = zonelist->_zonerefs; z->zone; z++)
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
static void set_zonelist_order(void)
{
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
static void build_zonelists(pg_data_t *pgdat)
int node, local_node;
enum zone_type j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
MAX_NR_ZONES - 1);
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j,
MAX_NR_ZONES - 1);
}
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void build_zonelist_cache(pg_data_t *pgdat)
pgdat->node_zonelists[0].zlcache_ptr = NULL;
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *dummy)
Yasunori Goto
committed
int nid;
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
Yasunori Goto
committed
return 0;
}
void build_all_zonelists(void)
Yasunori Goto
committed
{
set_zonelist_order();
Yasunori Goto
committed
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
mminit_verify_zonelist();
Yasunori Goto
committed
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guarantee there is no user
Yasunori Goto
committed
of zonelist */
stop_machine(__build_all_zonelists, NULL, NULL);
Yasunori Goto
committed
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
* more accurate, but expensive to check per-zone. This check is
* made on memory-hotadd so a system can start with mobility
* disabled and enable it later
*/
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
printk("Built %i zonelists in %s order, mobility grouping %s. "
"Total pages: %ld\n",
num_online_nodes(),
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
* large so that collisions trying to wait on pages are rare.
* But in fact, the number of active page waitqueues on typical
* systems is ridiculously low, less than 200. So this is even
* conservative, even though it seems large.
*
* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
* waitqueues, i.e. the size of the waitq table given the number of pages.
*/
#define PAGES_PER_WAITQUEUE 256
Yasunori Goto
committed
#ifndef CONFIG_MEMORY_HOTPLUG
Yasunori Goto
committed
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
unsigned long size = 1;
pages /= PAGES_PER_WAITQUEUE;
while (size < pages)
size <<= 1;
/*
* Once we have dozens or even hundreds of threads sleeping
* on IO we've got bigger problems than wait queue collision.
* Limit the size of the wait table to a reasonable size.
*/
size = min(size, 4096UL);
return max(size, 4UL);
}
Yasunori Goto
committed
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
#else
/*
* A zone's size might be changed by hot-add, so it is not possible to determine
* a suitable size for its wait_table. So we use the maximum size now.
*
* The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
*
* i386 (preemption config) : 4096 x 16 = 64Kbyte.
* ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
* ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
*
* The maximum entries are prepared when a zone's memory is (512K + 256) pages
* or more by the traditional way. (See above). It equals:
*
* i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
* ia64(16K page size) : = ( 8G + 4M)byte.
* powerpc (64K page size) : = (32G +16M)byte.
*/
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
return 4096UL;
}
#endif
/*
* This is an integer logarithm so that shifts can be used later
* to extract the more random high bits from the multiplicative
* hash function before the remainder is taken.
*/
static inline unsigned long wait_table_bits(unsigned long size)
{
return ffz(~size);
}
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
Mel Gorman
committed
/*
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
Mel Gorman
committed
* of blocks reserved is based on zone->pages_min. The memory within the
* reserve will tend to store contiguous free pages. Setting min_free_kbytes
* higher will lead to a bigger reserve which will get freed as contiguous
* blocks as reclaim kicks in
*/
static void setup_zone_migrate_reserve(struct zone *zone)
{
unsigned long start_pfn, pfn, end_pfn;
struct page *page;
unsigned long reserve, block_migratetype;
/* Get the start pfn, end pfn and the number of blocks to reserve */
start_pfn = zone->zone_start_pfn;
end_pfn = start_pfn + zone->spanned_pages;
reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
pageblock_order;
Mel Gorman
committed
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
Mel Gorman
committed
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
/* Watch out for overlapping nodes */
if (page_to_nid(page) != zone_to_nid(zone))
continue;
Mel Gorman
committed
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
/* Blocks with reserved pages will never free, skip them. */
if (PageReserved(page))
continue;
block_migratetype = get_pageblock_migratetype(page);
/* If this block is reserved, account for it */
if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
reserve--;
continue;
}
/* Suitable for reserving if this block is movable */
if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
set_pageblock_migratetype(page, MIGRATE_RESERVE);
move_freepages_block(zone, page, MIGRATE_RESERVE);
reserve--;
continue;
}
/*
* If the reserve is met and this is a previous reserved block,
* take it back
*/
if (block_migratetype == MIGRATE_RESERVE) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
move_freepages_block(zone, page, MIGRATE_MOVABLE);
}
}
}
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
mminit_verify_page_links(page, zone, nid, pfn);
reset_page_mapcount(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
Mel Gorman
committed
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
if ((z->zone_start_pfn <= pfn)
&& (pfn < z->zone_start_pfn + z->spanned_pages)
&& !(pfn & (pageblock_nr_pages - 1)))
Mel Gorman
committed
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
static void __meminit zone_init_free_lists(struct zone *zone)
int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
static int zone_batchsize(struct zone *zone)
#ifdef CONFIG_MMU
int batch;
/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone. But no more than 1/2 of a meg.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->present_pages / 1024;
if (batch * PAGE_SIZE > 512 * 1024)
batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
batch = rounddown_pow_of_two(batch + batch/2) - 1;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
memset(p, 0, sizeof(*p));
pcp->count = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
INIT_LIST_HEAD(&pcp->list);
}
/*
* setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/
static void setup_pagelist_highmark(struct per_cpu_pageset *p,
unsigned long high)
{
struct per_cpu_pages *pcp;
pcp->high = high;
pcp->batch = max(1UL, high/4);
if ((high/4) > (PAGE_SHIFT * 8))
pcp->batch = PAGE_SHIFT * 8;
}
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* Some NUMA counter updates may also be caught by the boot pagesets.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
static struct per_cpu_pageset boot_pageset[NR_CPUS];
/*
* Dynamically allocate memory for the
* per cpu pageset array in struct zone.
*/
static int __cpuinit process_zones(int cpu)
int node = cpu_to_node(cpu);
node_set_state(node, N_CPU); /* this node has a cpu */
for_each_populated_zone(zone) {
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
if (percpu_pagelist_fraction)
setup_pagelist_highmark(zone_pcp(zone, cpu),
(zone->present_pages / percpu_pagelist_fraction));
}
return 0;
bad:
for_each_zone(dzone) {
if (!populated_zone(dzone))
continue;
kfree(zone_pcp(dzone, cpu));
zone_pcp(dzone, cpu) = NULL;
}
return -ENOMEM;
}
static inline void free_zone_pagesets(int cpu)
{
struct zone *zone;
for_each_zone(zone) {
struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
/* Free per_cpu_pageset if it is slab allocated */
if (pset != &boot_pageset[cpu])
kfree(pset);
zone_pcp(zone, cpu) = NULL;
}
}
static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
int cpu = (long)hcpu;
int ret = NOTIFY_OK;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
if (process_zones(cpu))
ret = NOTIFY_BAD;
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_zone_pagesets(cpu);
break;
default:
break;
Chandra Seetharaman
committed
static struct notifier_block __cpuinitdata pageset_notifier =
{ &pageset_cpuup_callback, NULL, 0 };
void __init setup_per_cpu_pageset(void)
{
int err;
/* Initialize per_cpu_pageset for cpu 0.
* A cpuup callback will do this for every cpu
* as it comes online
*/
err = process_zones(smp_processor_id());
BUG_ON(err);
register_cpu_notifier(&pageset_notifier);
}
#endif