Newer
Older
val->totalram = managed_pages;
val->freeram = node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
NR_FREE_PAGES);
#else
val->totalhigh = 0;
val->freehigh = 0;
#endif
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
bool skip_free_areas_node(unsigned int flags, int nid)
{
bool ret = false;
unsigned int cpuset_mems_cookie;
if (!(flags & SHOW_MEM_FILTER_NODES))
goto out;
do {
cpuset_mems_cookie = get_mems_allowed();
ret = !node_isset(nid, cpuset_current_mems_allowed);
} while (!put_mems_allowed(cpuset_mems_cookie));
out:
return ret;
}
static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
[MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_MOVABLE] = 'M',
[MIGRATE_RESERVE] = 'R',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
};
char tmp[MIGRATE_TYPES + 1];
char *p = tmp;
int i;
for (i = 0; i < MIGRATE_TYPES; i++) {
if (type & (1 << i))
*p++ = types[i];
}
*p = '\0';
printk("(%s) ", tmp);
}
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
* Suppresses nodes that are not allowed by current's cpuset if
* SHOW_MEM_FILTER_NODES is passed.
void show_free_areas(unsigned int filter)
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
pageset = per_cpu_ptr(zone->pageset, cpu);
printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
cpu, pageset->pcp.high,
pageset->pcp.batch, pageset->pcp.count);
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
Andrew Morton
committed
" dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_FILE),
global_page_state(NR_FILE_DIRTY),
Christoph Lameter
committed
global_page_state(NR_WRITEBACK),
Christoph Lameter
committed
global_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" isolated(anon):%lukB"
" isolated(file):%lukB"
" mlocked:%lukB"
" dirty:%lukB"
" writeback:%lukB"
" mapped:%lukB"
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
KOSAKI Motohiro
committed
" kernel_stack:%lukB"
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone_page_state(zone, NR_ACTIVE_ANON)),
K(zone_page_state(zone, NR_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)),
K(zone_page_state(zone, NR_UNEVICTABLE)),
K(zone_page_state(zone, NR_ISOLATED_ANON)),
K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_FILE_DIRTY)),
K(zone_page_state(zone, NR_WRITEBACK)),
K(zone_page_state(zone, NR_FILE_MAPPED)),
K(zone_page_state(zone, NR_SHMEM)),
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
KOSAKI Motohiro
committed
zone_page_state(zone, NR_KERNEL_STACK) *
THREAD_SIZE / 1024,
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
(!zone_reclaimable(zone) ? "yes" : "no")
);
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %lu", zone->lowmem_reserve[i]);
printk("\n");
}
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
show_node(zone);
printk("%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
int type;
nr[order] = area->nr_free;
total += nr[order] << order;
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
if (!list_empty(&area->free_list[type]))
types[order] |= 1 << type;
}
for (order = 0; order < MAX_ORDER; order++) {
printk("%lu*%lukB ", nr[order], K(1UL) << order);
if (nr[order])
show_migration_types(types[order]);
}
hugetlb_show_meminfo();
printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
Christoph Lameter
committed
*
* Add all populated zones of a node to the zonelist.
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
int nr_zones)
Christoph Lameter
committed
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
zone = pgdat->node_zones + zone_type;
Christoph Lameter
committed
if (populated_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
/*
* zonelist_order:
* 0 = automatic detection of better ordering.
* 1 = order by ([node] distance, -zonetype)
* 2 = order by (-zonetype, [node] distance)
*
* If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
* the same zonelist. So only NUMA can configure this param.
*/
#define ZONELIST_ORDER_DEFAULT 0
#define ZONELIST_ORDER_NODE 1
#define ZONELIST_ORDER_ZONE 2
/* zonelist order in the kernel.
* set_zonelist_order() will set this to NODE or ZONE.
*/
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";
/*
* interface for configure zonelist ordering.
* command line option "numa_zonelist_order"
* = "[dD]efault - default, automatic configuration.
* = "[nN]ode - order by node locality, then by zone within node
* = "[zZ]one - order by zone, then by locality within zone
*/
static int __parse_numa_zonelist_order(char *s)
{
if (*s == 'd' || *s == 'D') {
user_zonelist_order = ZONELIST_ORDER_DEFAULT;
} else if (*s == 'n' || *s == 'N') {
user_zonelist_order = ZONELIST_ORDER_NODE;
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
printk(KERN_WARNING
"Ignoring invalid numa_zonelist_order value: "
"%s\n", s);
return -EINVAL;
}
return 0;
}
static __init int setup_numa_zonelist_order(char *s)
{
Volodymyr G. Lukiianyk
committed
int ret;
if (!s)
return 0;
ret = __parse_numa_zonelist_order(s);
if (ret == 0)
strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
int ret;
static DEFINE_MUTEX(zl_order_mutex);
mutex_lock(&zl_order_mutex);
if (write) {
if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
ret = -EINVAL;
goto out;
}
strcpy(saved_string, (char *)table->data);
}
ret = proc_dostring(table, write, buffer, length, ppos);
if (write) {
int oldval = user_zonelist_order;
ret = __parse_numa_zonelist_order((char *)table->data);
if (ret) {
/*
* bogus value. restore saved string
*/
strncpy((char *)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
Haicheng Li
committed
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
Haicheng Li
committed
mutex_unlock(&zonelists_mutex);
}
out:
mutex_unlock(&zl_order_mutex);
return ret;
Christoph Lameter
committed
#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
* It returns -1 if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
Lai Jiangshan
committed
for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
tmp = cpumask_of_node(n);
if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
if (populated_zone(z)) {
zoneref_set_zone(z,
&zonelist->_zonerefs[pos++]);
check_highest_zone(zone_type);
}
}
}
zonelist->_zonerefs[pos].zone = NULL;
zonelist->_zonerefs[pos].zone_idx = 0;
}
static int default_zonelist_order(void)
{
int nid, zone_type;
unsigned long low_kmem_size, total_size;
struct zone *z;
int average_size;
/*
* ZONE_DMA and ZONE_DMA32 can be very small area in the system.
* If they are really small and used heavily, the system can fall
* into OOM very easily.
* This function detect ZONE_DMA/DMA32 size and configures zone order.
*/
/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
low_kmem_size = 0;
total_size = 0;
for_each_online_node(nid) {
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
z = &NODE_DATA(nid)->node_zones[zone_type];
if (populated_zone(z)) {
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->managed_pages;
total_size += z->managed_pages;
} else if (zone_type == ZONE_NORMAL) {
/*
* If any node has only lowmem, then node order
* is preferred to allow kernel allocations
* locally; otherwise, they can easily infringe
* on other nodes when there is an abundance of
* lowmem available to allocate from.
*/
return ZONELIST_ORDER_NODE;
}
}
}
if (!low_kmem_size || /* there are no DMA area. */
low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
return ZONELIST_ORDER_NODE;
/*
* look into each node's config.
* If there is a node whose DMA/DMA32 memory is very big area on
* local memory, NODE_ORDER may be suitable.
*/
average_size = total_size /
Lai Jiangshan
committed
(nodes_weight(node_states[N_MEMORY]) + 1);
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
z = &NODE_DATA(nid)->node_zones[zone_type];
if (populated_zone(z)) {
if (zone_type < ZONE_NORMAL)
low_kmem_size += z->present_pages;
total_size += z->present_pages;
}
}
if (low_kmem_size &&
total_size > average_size && /* ignore small node */
low_kmem_size > total_size * 70/100)
return ZONELIST_ORDER_NODE;
}
return ZONELIST_ORDER_ZONE;
}
static void set_zonelist_order(void)
{
if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
current_zonelist_order = default_zonelist_order();
else
current_zonelist_order = user_zonelist_order;
}
static void build_zonelists(pg_data_t *pgdat)
{
int j, node, load;
enum zone_type i;
int local_node, prev_node;
struct zonelist *zonelist;
int order = current_zonelist_order;
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist->_zonerefs[0].zone = NULL;
zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
Christoph Lameter
committed
load = nr_online_nodes;
memset(node_order, 0, sizeof(node_order));
j = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
node_order[j++] = node; /* remember order */
}
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, j);
build_thisnode_zonelists(pgdat);
/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
struct zonelist *zonelist;
struct zonelist_cache *zlc;
struct zoneref *z;
zonelist = &pgdat->node_zonelists[0];
zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
for (z = zonelist->_zonerefs; z->zone; z++)
zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
* I.e., first node id of first zone in arg node's generic zonelist.
* Used for initializing percpu 'numa_mem', which is used primarily
* for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
*/
int local_memory_node(int node)
{
struct zone *zone;
(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL,
&zone);
return zone->node;
}
#endif
static void set_zonelist_order(void)
{
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
static void build_zonelists(pg_data_t *pgdat)
int node, local_node;
enum zone_type j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
static void build_zonelist_cache(pg_data_t *pgdat)
pgdat->node_zonelists[0].zlcache_ptr = NULL;
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static void setup_zone_pageset(struct zone *zone);
Haicheng Li
committed
/*
* Global mutex to protect against size modification of zonelists
* as well as to serialize pageset setup for the new populated zone.
*/
DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *data)
Yasunori Goto
committed
int nid;
pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
if (self && !node_online(self->node_id)) {
build_zonelists(self);
build_zonelist_cache(self);
}
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
build_zonelist_cache(pgdat);
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
* each zone will be allocated later when the per cpu
* allocator is available.
*
* boot_pagesets are used also for bootstrapping offline
* cpus if the system is already booted because the pagesets
* are needed to initialize allocators on a specific cpu too.
* F.e. the percpu allocator needs the page allocator which
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu) {
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* We now know the "local memory node" for each node--
* i.e., the node of the first zone in the generic zonelist.
* Set up numa_mem percpu variable for on-line cpus. During
* boot, only the boot cpu should be on-line; we'll init the
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
if (cpu_online(cpu))
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
Yasunori Goto
committed
return 0;
}
Haicheng Li
committed
/*
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
Yasunori Goto
committed
{
set_zonelist_order();
Yasunori Goto
committed
if (system_state == SYSTEM_BOOTING) {
__build_all_zonelists(NULL);
mminit_verify_zonelist();
Yasunori Goto
committed
cpuset_init_current_mems_allowed();
} else {
KAMEZAWA Hiroyuki
committed
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
setup_zone_pageset(zone);
KAMEZAWA Hiroyuki
committed
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
Yasunori Goto
committed
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
* more accurate, but expensive to check per-zone. This check is
* made on memory-hotadd so a system can start with mobility
* disabled and enable it later
*/
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
printk("Built %i zonelists in %s order, mobility grouping %s. "
"Total pages: %ld\n",
Christoph Lameter
committed
nr_online_nodes,
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
* large so that collisions trying to wait on pages are rare.
* But in fact, the number of active page waitqueues on typical
* systems is ridiculously low, less than 200. So this is even
* conservative, even though it seems large.
*
* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
* waitqueues, i.e. the size of the waitq table given the number of pages.
*/
#define PAGES_PER_WAITQUEUE 256
Yasunori Goto
committed
#ifndef CONFIG_MEMORY_HOTPLUG
Yasunori Goto
committed
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
unsigned long size = 1;
pages /= PAGES_PER_WAITQUEUE;
while (size < pages)
size <<= 1;
/*
* Once we have dozens or even hundreds of threads sleeping
* on IO we've got bigger problems than wait queue collision.
* Limit the size of the wait table to a reasonable size.
*/
size = min(size, 4096UL);
return max(size, 4UL);
}
Yasunori Goto
committed
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
#else
/*
* A zone's size might be changed by hot-add, so it is not possible to determine
* a suitable size for its wait_table. So we use the maximum size now.
*
* The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
*
* i386 (preemption config) : 4096 x 16 = 64Kbyte.
* ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
* ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
*
* The maximum entries are prepared when a zone's memory is (512K + 256) pages
* or more by the traditional way. (See above). It equals:
*
* i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
* ia64(16K page size) : = ( 8G + 4M)byte.
* powerpc (64K page size) : = (32G +16M)byte.
*/
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
return 4096UL;
}
#endif
/*
* This is an integer logarithm so that shifts can be used later
* to extract the more random high bits from the multiplicative
* hash function before the remainder is taken.
*/
static inline unsigned long wait_table_bits(unsigned long size)
{
return ffz(~size);
}
Arve Hjønnevåg
committed
/*
* Check if a pageblock contains reserved pages
*/
static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
return 1;
}
return 0;
}
Mel Gorman
committed
/*
* Mark a number of pageblocks as MIGRATE_RESERVE. The number
* of blocks reserved is based on min_wmark_pages(zone). The memory within
* the reserve will tend to store contiguous free pages. Setting min_free_kbytes
Mel Gorman
committed
* higher will lead to a bigger reserve which will get freed as contiguous
* blocks as reclaim kicks in
*/
static void setup_zone_migrate_reserve(struct zone *zone)
{
Arve Hjønnevåg
committed
unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
Mel Gorman
committed
struct page *page;
unsigned long block_migratetype;
int reserve;
Yasuaki Ishimatsu
committed
int old_reserve;
Mel Gorman
committed
Michal Hocko
committed
/*
* Get the start pfn, end pfn and the number of blocks to reserve
* We have to be careful to be aligned to pageblock_nr_pages to
* make sure that we always check pfn_valid for the first page in
* the block.
*/
Mel Gorman
committed
start_pfn = zone->zone_start_pfn;
end_pfn = zone_end_pfn(zone);
Michal Hocko
committed
start_pfn = roundup(start_pfn, pageblock_nr_pages);
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
Mel Gorman
committed
/*
* Reserve blocks are generally in place to help high-order atomic
* allocations that are short-lived. A min_free_kbytes value that
* would result in more than 2 reserve blocks for atomic allocations
* is assumed to be in place to help anti-fragmentation for the
* future allocation of hugepages at runtime.
*/
reserve = min(2, reserve);
Yasuaki Ishimatsu
committed
old_reserve = zone->nr_migrate_reserve_block;
/* When memory hot-add, we almost always need to do nothing */
if (reserve == old_reserve)
return;
zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
Mel Gorman
committed
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
/* Watch out for overlapping nodes */
if (page_to_nid(page) != zone_to_nid(zone))
continue;
Mel Gorman
committed
block_migratetype = get_pageblock_migratetype(page);
/* Only test what is necessary when the reserves are not met */
if (reserve > 0) {
/*
* Blocks with reserved pages will never free, skip
* them.
*/
block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
if (pageblock_is_reserved(pfn, block_end_pfn))
continue;
Mel Gorman
committed
/* If this block is reserved, account for it */
if (block_migratetype == MIGRATE_RESERVE) {
reserve--;
continue;
}
/* Suitable for reserving if this block is movable */
if (block_migratetype == MIGRATE_MOVABLE) {
set_pageblock_migratetype(page,
MIGRATE_RESERVE);
move_freepages_block(zone, page,
MIGRATE_RESERVE);
reserve--;
continue;
}
Yasuaki Ishimatsu
committed
} else if (!old_reserve) {
/*
* At boot time we don't need to scan the whole zone
* for turning off MIGRATE_RESERVE.
*/
break;
Mel Gorman
committed
}
/*
* If the reserve is met and this is a previous reserved block,