Newer
Older
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
Christoph Lameter
committed
*
* Add all populated zones of a node to the zonelist.
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
int nr_zones)
Christoph Lameter
committed
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
zone = pgdat->node_zones + zone_type;
Christoph Lameter
committed
if (populated_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
/*
* zonelist_order:
* 0 = automatic detection of better ordering.
* 1 = order by ([node] distance, -zonetype)
* 2 = order by (-zonetype, [node] distance)
*
* If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
* the same zonelist. So only NUMA can configure this param.
*/
#define ZONELIST_ORDER_DEFAULT 0
#define ZONELIST_ORDER_NODE 1
#define ZONELIST_ORDER_ZONE 2
/* zonelist order in the kernel.
* set_zonelist_order() will set this to NODE or ZONE.
*/
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";
/*
* interface for configure zonelist ordering.
* command line option "numa_zonelist_order"
* = "[dD]efault - default, automatic configuration.
* = "[nN]ode - order by node locality, then by zone within node
* = "[zZ]one - order by zone, then by locality within zone
*/
static int __parse_numa_zonelist_order(char *s)
{
if (*s == 'd' || *s == 'D') {
user_zonelist_order = ZONELIST_ORDER_DEFAULT;
} else if (*s == 'n' || *s == 'N') {
user_zonelist_order = ZONELIST_ORDER_NODE;
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
printk(KERN_WARNING
"Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
}
static __init int setup_numa_zonelist_order(char *s)
{
Volodymyr G. Lukiianyk
committed
int ret;
if (!s)
return 0;
ret = __parse_numa_zonelist_order(s);
if (ret == 0)
strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
int ret;
static DEFINE_MUTEX(zl_order_mutex);
mutex_lock(&zl_order_mutex);
if (write) {
if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
ret = -EINVAL;
goto out;
}
strcpy(saved_string, (char *)table->data);
}
ret = proc_dostring(table, write, buffer, length, ppos);
if (write) {
int oldval = user_zonelist_order;
ret = __parse_numa_zonelist_order((char *)table->data);
if (ret) {
/*
* bogus value. restore saved string
*/
strncpy((char *)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
Haicheng Li
committed
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
Haicheng Li
committed
mutex_unlock(&zonelists_mutex);
}
out:
mutex_unlock(&zl_order_mutex);
return ret;
Christoph Lameter
committed
#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
* It returns -1 if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
Lai Jiangshan
committed
for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
tmp = cpumask_of_node(n);
if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
if (populated_zone(z)) {
zoneref_set_zone(z,
&zonelist->_zonerefs[pos++]);
check_highest_zone(zone_type);
}
}
}
zonelist->_zonerefs[pos].zone = NULL;
zonelist->_zonerefs[pos].zone_idx = 0;
#if defined(CONFIG_64BIT)
/*
* Devices that require DMA32/DMA are relatively rare and do not justify a
* penalty to every machine in case the specialised case applies. Default
* to Node-ordering on 64-bit NUMA machines
*/
static int default_zonelist_order(void)
{
return ZONELIST_ORDER_NODE;
}
#else
/*
* On 32-bit, the Normal zone needs to be preserved for allocations accessible
* by the kernel. If processes running on node 0 deplete the low memory zone
* then reclaim will occur more frequency increasing stalls and potentially
* be easier to OOM if a large percentage of the zone is under writeback or
* dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
* Hence, default to zone ordering on 32-bit.
*/
static int default_zonelist_order(void)
{
return ZONELIST_ORDER_ZONE;
}
#endif /* CONFIG_64BIT */
static void set_zonelist_order(void)
{
if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
current_zonelist_order = default_zonelist_order();
else
current_zonelist_order = user_zonelist_order;
}
static void build_zonelists(pg_data_t *pgdat)
{
int local_node, prev_node;
struct zonelist *zonelist;
unsigned int order = current_zonelist_order;
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist->_zonerefs[0].zone = NULL;
zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
Christoph Lameter
committed
load = nr_online_nodes;
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
node_order[i++] = node; /* remember order */
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, i);
build_thisnode_zonelists(pgdat);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
* I.e., first node id of first zone in arg node's generic zonelist.
* Used for initializing percpu 'numa_mem', which is used primarily
* for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
*/
int local_memory_node(int node)
{
struct zone *zone;
(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL,
&zone);
return zone->node;
}
#endif
static void set_zonelist_order(void)
{
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
static void build_zonelists(pg_data_t *pgdat)
int node, local_node;
enum zone_type j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[0];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static void setup_zone_pageset(struct zone *zone);
Haicheng Li
committed
/*
* Global mutex to protect against size modification of zonelists
* as well as to serialize pageset setup for the new populated zone.
*/
DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *data)
Yasunori Goto
committed
int nid;
pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
if (self && !node_online(self->node_id)) {
build_zonelists(self);
}
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
/*
* Initialize the boot_pagesets that are going to be used
* for bootstrapping processors. The real pagesets for
* each zone will be allocated later when the per cpu
* allocator is available.
*
* boot_pagesets are used also for bootstrapping offline
* cpus if the system is already booted because the pagesets
* are needed to initialize allocators on a specific cpu too.
* F.e. the percpu allocator needs the page allocator which
* needs the percpu allocator in order to allocate its pagesets
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu) {
setup_pageset(&per_cpu(boot_pageset, cpu), 0);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* We now know the "local memory node" for each node--
* i.e., the node of the first zone in the generic zonelist.
* Set up numa_mem percpu variable for on-line cpus. During
* boot, only the boot cpu should be on-line; we'll init the
* secondary cpus' numa_mem as they come on-line. During
* node/memory hotplug, we'll fixup all on-line cpus.
*/
if (cpu_online(cpu))
set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
}
Yasunori Goto
committed
return 0;
}
static noinline void __init
build_all_zonelists_init(void)
{
__build_all_zonelists(NULL);
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
}
Haicheng Li
committed
/*
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*
* __ref due to (1) call of __meminit annotated setup_zone_pageset
* [we're only called with non-NULL zone through __meminit paths] and
* (2) call of __init annotated helper build_all_zonelists_init
* [protected by SYSTEM_BOOTING].
Haicheng Li
committed
*/
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
Yasunori Goto
committed
{
set_zonelist_order();
Yasunori Goto
committed
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
Yasunori Goto
committed
} else {
KAMEZAWA Hiroyuki
committed
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
setup_zone_pageset(zone);
KAMEZAWA Hiroyuki
committed
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
stop_machine(__build_all_zonelists, pgdat, NULL);
Yasunori Goto
committed
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
* more accurate, but expensive to check per-zone. This check is
* made on memory-hotadd so a system can start with mobility
* disabled and enable it later
*/
if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
page_group_by_mobility_disabled = 1;
else
page_group_by_mobility_disabled = 0;
pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
zonelist_order_name[current_zonelist_order],
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
}
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
* large so that collisions trying to wait on pages are rare.
* But in fact, the number of active page waitqueues on typical
* systems is ridiculously low, less than 200. So this is even
* conservative, even though it seems large.
*
* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
* waitqueues, i.e. the size of the waitq table given the number of pages.
*/
#define PAGES_PER_WAITQUEUE 256
Yasunori Goto
committed
#ifndef CONFIG_MEMORY_HOTPLUG
Yasunori Goto
committed
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
unsigned long size = 1;
pages /= PAGES_PER_WAITQUEUE;
while (size < pages)
size <<= 1;
/*
* Once we have dozens or even hundreds of threads sleeping
* on IO we've got bigger problems than wait queue collision.
* Limit the size of the wait table to a reasonable size.
*/
size = min(size, 4096UL);
return max(size, 4UL);
}
Yasunori Goto
committed
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
#else
/*
* A zone's size might be changed by hot-add, so it is not possible to determine
* a suitable size for its wait_table. So we use the maximum size now.
*
* The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
*
* i386 (preemption config) : 4096 x 16 = 64Kbyte.
* ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
* ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
*
* The maximum entries are prepared when a zone's memory is (512K + 256) pages
* or more by the traditional way. (See above). It equals:
*
* i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
* ia64(16K page size) : = ( 8G + 4M)byte.
* powerpc (64K page size) : = (32G +16M)byte.
*/
static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
{
return 4096UL;
}
#endif
/*
* This is an integer logarithm so that shifts can be used later
* to extract the more random high bits from the multiplicative
* hash function before the remainder is taken.
*/
static inline unsigned long wait_table_bits(unsigned long size)
{
return ffz(~size);
}
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
unsigned long end_pfn = start_pfn + size;
pg_data_t *pgdat = NODE_DATA(nid);
Mel Gorman
committed
unsigned long nr_initialised = 0;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
struct memblock_region *r = NULL, *tmp;
#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
* memory
*/
if (altmap && start_pfn == altmap->base_pfn)
start_pfn += altmap->reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
if (context != MEMMAP_EARLY)
goto not_early;
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
break;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* If not mirrored_kernelcore and ZONE_MOVABLE exists, range
* from zone_movable_pfn[nid] to end of each node should be
* ZONE_MOVABLE not ZONE_NORMAL. skip it.
*/
if (!mirrored_kernelcore && zone_movable_pfn[nid])
if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
/*
* Check given memblock attribute by firmware which can affect
* kernel memory layout. If zone==ZONE_MOVABLE but memory is
* mirrored, it's an overlapped memmap init. skip it.
*/
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
for_each_memblock(memory, tmp)
if (pfn < memblock_region_memory_end_pfn(tmp))
break;
r = tmp;
}
if (pfn >= memblock_region_memory_base_pfn(r) &&
memblock_is_mirror(r)) {
/* already initialized as NORMAL */
pfn = memblock_region_memory_end_pfn(r);
continue;
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
struct page *page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
} else {
__init_single_pfn(pfn, zone, nid);
}
static void __meminit zone_init_free_lists(struct zone *zone)
unsigned int order, t;
for_each_migratetype_order(order, t) {
INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
zone->free_area[order].nr_free = 0;
}
}
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
static int zone_batchsize(struct zone *zone)
#ifdef CONFIG_MMU
int batch;
/*
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone. But no more than 1/2 of a meg.
*
* OK, so we don't know how big the cache is. So guess.
*/
batch = zone->managed_pages / 1024;
if (batch * PAGE_SIZE > 512 * 1024)
batch = (512 * 1024) / PAGE_SIZE;
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
/*
* Clamp the batch to a 2^n - 1 value. Having a power
* of 2 value was found to be more likely to have
* suboptimal cache aliasing properties in some cases.
* For example if 2 tasks are alternately allocating
* batches of pages, one task can end up with a lot
* of pages of one half of the possible page colors
* and the other with pages of the other colors.
batch = rounddown_pow_of_two(batch + batch/2) - 1;
#else
/* The deferral and batching of frees should be suppressed under NOMMU
* conditions.
*
* The problem is that NOMMU needs to be able to allocate large chunks
* of contiguous memory as there's no hardware page translation to
* assemble apparent contiguous memory from discontiguous pages.
*
* Queueing large contiguous runs of pages for batching, however,
* causes the pages to actually be freed in smaller chunks. As there
* can be a significant delay between the individual batches being
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
return 0;
#endif
Cody P Schafer
committed
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
/*
* pcp->high and pcp->batch values are related and dependent on one another:
* ->batch must never be higher then ->high.
* The following function updates them in a safe manner without read side
* locking.
*
* Any new users of pcp->batch and pcp->high should ensure they can cope with
* those fields changing asynchronously (acording the the above rule).
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
* exist).
*/
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
unsigned long batch)
{
/* start with a fail safe value for batch */
pcp->batch = 1;
smp_wmb();
/* Update high, then batch, in order */
pcp->high = high;
smp_wmb();
pcp->batch = batch;
}
Cody P Schafer
committed
/* a companion to pageset_set_high() */
static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
{
Cody P Schafer
committed
pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
Cody P Schafer
committed
static void pageset_init(struct per_cpu_pageset *p)
{
struct per_cpu_pages *pcp;
int migratetype;
memset(p, 0, sizeof(*p));
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
Cody P Schafer
committed
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
pageset_init(p);
pageset_set_batch(p, batch);
}
Cody P Schafer
committed
* pageset_set_high() sets the high water mark for hot per_cpu_pagelist
* to the value high for the pageset p.
*/
Cody P Schafer
committed
static void pageset_set_high(struct per_cpu_pageset *p,
unsigned long high)
{
Cody P Schafer
committed
unsigned long batch = max(1UL, high / 4);
if ((high / 4) > (PAGE_SHIFT * 8))
batch = PAGE_SHIFT * 8;
Cody P Schafer
committed
pageset_update(&p->pcp, high, batch);
}
static void pageset_set_high_and_batch(struct zone *zone,
struct per_cpu_pageset *pcp)
{
if (percpu_pagelist_fraction)
Cody P Schafer
committed
pageset_set_high(pcp,
(zone->managed_pages /
percpu_pagelist_fraction));
else
pageset_set_batch(pcp, zone_batchsize(zone));
}
static void __meminit zone_pageset_init(struct zone *zone, int cpu)
{
struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
pageset_init(pcp);
pageset_set_high_and_batch(zone, pcp);
}
static void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu);
}
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
void __init setup_per_cpu_pageset(void)
for_each_populated_zone(zone)
setup_zone_pageset(zone);
Yasunori Goto
committed
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
Yasunori Goto
committed
size_t alloc_size;
/*
* The per-page waitqueue mechanism uses hashed waitqueues
* per zone.
*/
Yasunori Goto
committed
zone->wait_table_hash_nr_entries =
wait_table_hash_nr_entries(zone_size_pages);
zone->wait_table_bits =
wait_table_bits(zone->wait_table_hash_nr_entries);
Yasunori Goto
committed
alloc_size = zone->wait_table_hash_nr_entries
* sizeof(wait_queue_head_t);
if (!slab_is_available()) {
Yasunori Goto
committed
zone->wait_table = (wait_queue_head_t *)
memblock_virt_alloc_node_nopanic(
alloc_size, zone->zone_pgdat->node_id);
Yasunori Goto
committed
} else {
/*
* This case means that a zone whose size was 0 gets new memory
* via memory hot-add.
* But it may be the case that a new node was hot-added. In
* this case vmalloc() will not be able to use this new node's
* memory - this wait_table must be initialized to use this new
* node itself as well.
* To use this new node's memory, further consideration will be
* necessary.
*/
zone->wait_table = vmalloc(alloc_size);
Yasunori Goto
committed
}
if (!zone->wait_table)
return -ENOMEM;
for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
init_waitqueue_head(zone->wait_table + i);
Yasunori Goto
committed
return 0;
static __meminit void zone_pcp_init(struct zone *zone)
/*
* per cpu subsystem is not up at this point. The following code
* relies on the ability of the linker to provide the
* offset of a (static) per cpu variable into the per cpu area.
*/
zone->pageset = &boot_pageset;
if (populated_zone(zone))
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
int __meminit init_currently_empty_zone(struct zone *zone,
Yasunori Goto
committed
unsigned long zone_start_pfn,
unsigned long size)
{
struct pglist_data *pgdat = zone->zone_pgdat;
Yasunori Goto
committed
int ret;
ret = zone_wait_table_init(zone, size);
if (ret)
return ret;
pgdat->nr_zones = zone_idx(zone) + 1;
zone->zone_start_pfn = zone_start_pfn;
mminit_dprintk(MMINIT_TRACE, "memmap_init",
"Initialising map node %d zone %lu pfns %lu -> %lu\n",
pgdat->node_id,
(unsigned long)zone_idx(zone),
zone_start_pfn, (zone_start_pfn + size));
zone_init_free_lists(zone);
Yasunori Goto
committed
return 0;
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
int __meminit __early_pfn_to_nid(unsigned long pfn,
struct mminit_pfnnid_cache *state)
unsigned long start_pfn, end_pfn;
if (state->last_start <= pfn && pfn < state->last_end)
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != -1) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
/**
* free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
* @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
* If an architecture guarantees that all ranges registered contain no holes
* and may be freed, this this function may be used instead of calling
* memblock_free_early_nid() manually.
void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)