Newer
Older
if (unlikely(!nc->va)) {
refill:
Alexander Duyck
committed
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = size;
nc->offset = size;
}
offset = nc->offset - fragsz;
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
offset = size - fragsz;
}
nc->pagecnt_bias--;
nc->offset = offset;
return nc->va + offset;
}
Alexander Duyck
committed
EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
Alexander Duyck
committed
void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
__free_pages_ok(page, compound_order(page));
}
Alexander Duyck
committed
EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
unsigned long alloc_end = addr + (PAGE_SIZE << order);
unsigned long used = addr + PAGE_ALIGN(size);
split_page(virt_to_page((void *)addr), order);
while (used < alloc_end) {
free_page(used);
used += PAGE_SIZE;
}
}
return (void *)addr;
}
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
* @gfp_mask: GFP flags for the allocation
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
* allocate memory in power-of-two pages.
*
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
* alloc_pages_exact_nid - allocate an exact number of physically-contiguous
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
* @gfp_mask: GFP flags for the allocation
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
}
/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
*
* Release the memory allocated by a previous call to alloc_pages_exact.
*/
void free_pages_exact(void *virt, size_t size)
{
unsigned long addr = (unsigned long)virt;
unsigned long end = addr + PAGE_ALIGN(size);
while (addr < end) {
free_page(addr);
addr += PAGE_SIZE;
}
}
EXPORT_SYMBOL(free_pages_exact);
/**
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
* nr_free_zone_pages() counts the number of counts pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
static unsigned long nr_free_zone_pages(int offset)
struct zoneref *z;
struct zone *zone;
/* Just pick one node, since fallback list is circular */
unsigned long sum = 0;
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
unsigned long size = zone->managed_pages;
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
/**
* nr_free_buffer_pages - count number of pages beyond high watermark
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
unsigned long nr_free_buffer_pages(void)
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
/**
* nr_free_pagecache_pages - count number of pages beyond high watermark
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
unsigned long nr_free_pagecache_pages(void)
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
Christoph Lameter
committed
static inline void show_node(struct zone *zone)
if (IS_ENABLED(CONFIG_NUMA))
Andy Whitcroft
committed
printk("Node %d ", zone_to_nid(zone));
long si_mem_available(void)
{
long available;
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
struct zone *zone;
int lru;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
for_each_zone(zone)
wmark_low += zone->watermark[WMARK_LOW];
/*
* Estimate the amount of memory available for userspace allocations,
* without causing swapping.
*/
available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
* start swapping. Assume at least half of the page cache, or the
* low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
available += pagecache;
/*
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
available += global_page_state(NR_SLAB_RECLAIMABLE) -
min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
if (available < 0)
available = 0;
return available;
}
EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
EXPORT_SYMBOL(si_meminfo);
#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
int zone_type; /* needs to be signed */
unsigned long managed_pages = 0;
unsigned long managed_highpages = 0;
unsigned long free_highpages = 0;
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (is_highmem(zone)) {
managed_highpages += zone->managed_pages;
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
#else
val->totalhigh = managed_highpages;
val->freehigh = free_highpages;
#endif
* Determine whether the node should be displayed or not, depending on whether
* SHOW_MEM_FILTER_NODES was passed to show_free_areas().
static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
{
if (!(flags & SHOW_MEM_FILTER_NODES))
return false;
/*
* no node mask - aka implicit memory numa policy. Do not bother with
* the synchronization - read_mems_allowed_begin - because we do not
* have to be precise here.
*/
if (!nodemask)
nodemask = &cpuset_current_mems_allowed;
return !node_isset(nid, *nodemask);
}
static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
[MIGRATE_MOVABLE] = 'M',
[MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_HIGHATOMIC] = 'H',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
};
char tmp[MIGRATE_TYPES + 1];
char *p = tmp;
int i;
for (i = 0; i < MIGRATE_TYPES; i++) {
if (type & (1 << i))
*p++ = types[i];
}
*p = '\0';
printk(KERN_CONT "(%s) ", tmp);
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
*
* Bits in @filter:
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
void show_free_areas(unsigned int filter, nodemask_t *nodemask)
unsigned long free_pcp = 0;
for_each_populated_zone(zone) {
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
global_node_page_state(NR_INACTIVE_ANON),
global_node_page_state(NR_ISOLATED_ANON),
global_node_page_state(NR_ACTIVE_FILE),
global_node_page_state(NR_INACTIVE_FILE),
global_node_page_state(NR_ISOLATED_FILE),
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
printk("Node %d"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" isolated(anon):%lukB"
" isolated(file):%lukB"
" dirty:%lukB"
" writeback:%lukB"
" shmem:%lukB"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
" shmem_thp: %lukB"
" shmem_pmdmapped: %lukB"
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
K(node_page_state(pgdat, NR_ACTIVE_ANON)),
K(node_page_state(pgdat, NR_INACTIVE_ANON)),
K(node_page_state(pgdat, NR_ACTIVE_FILE)),
K(node_page_state(pgdat, NR_INACTIVE_FILE)),
K(node_page_state(pgdat, NR_UNEVICTABLE)),
K(node_page_state(pgdat, NR_ISOLATED_ANON)),
K(node_page_state(pgdat, NR_ISOLATED_FILE)),
K(node_page_state(pgdat, NR_FILE_MAPPED)),
K(node_page_state(pgdat, NR_FILE_DIRTY)),
K(node_page_state(pgdat, NR_WRITEBACK)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
* HPAGE_PMD_NR),
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_SHMEM)),
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
node_page_state(pgdat, NR_PAGES_SCANNED),
!pgdat_reclaimable(pgdat) ? "yes" : "no");
}
for_each_populated_zone(zone) {
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
free_pcp = 0;
for_each_online_cpu(cpu)
free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
printk(KERN_CONT
"%s"
" free:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
" active_anon:%lukB"
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
" writepending:%lukB"
" mlocked:%lukB"
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
KOSAKI Motohiro
committed
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
printk(KERN_CONT "\n");
for_each_populated_zone(zone) {
unsigned int order;
unsigned long nr[MAX_ORDER], flags, total = 0;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
printk(KERN_CONT "%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
int type;
nr[order] = area->nr_free;
total += nr[order] << order;
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
if (!list_empty(&area->free_list[type]))
types[order] |= 1 << type;
}
for (order = 0; order < MAX_ORDER; order++) {
printk(KERN_CONT "%lu*%lukB ",
nr[order], K(1UL) << order);
if (nr[order])
show_migration_types(types[order]);
}
printk(KERN_CONT "= %lukB\n", K(total));
hugetlb_show_meminfo();
printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
Christoph Lameter
committed
*
* Add all populated zones of a node to the zonelist.
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
int nr_zones)
Christoph Lameter
committed
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
zone = pgdat->node_zones + zone_type;
Mel Gorman
committed
if (managed_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
/*
* zonelist_order:
* 0 = automatic detection of better ordering.
* 1 = order by ([node] distance, -zonetype)
* 2 = order by (-zonetype, [node] distance)
*
* If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
* the same zonelist. So only NUMA can configure this param.
*/
#define ZONELIST_ORDER_DEFAULT 0
#define ZONELIST_ORDER_NODE 1
#define ZONELIST_ORDER_ZONE 2
/* zonelist order in the kernel.
* set_zonelist_order() will set this to NODE or ZONE.
*/
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN 16
char numa_zonelist_order[16] = "default";
/*
* interface for configure zonelist ordering.
* command line option "numa_zonelist_order"
* = "[dD]efault - default, automatic configuration.
* = "[nN]ode - order by node locality, then by zone within node
* = "[zZ]one - order by zone, then by locality within zone
*/
static int __parse_numa_zonelist_order(char *s)
{
if (*s == 'd' || *s == 'D') {
user_zonelist_order = ZONELIST_ORDER_DEFAULT;
} else if (*s == 'n' || *s == 'N') {
user_zonelist_order = ZONELIST_ORDER_NODE;
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
}
static __init int setup_numa_zonelist_order(char *s)
{
Volodymyr G. Lukiianyk
committed
int ret;
if (!s)
return 0;
ret = __parse_numa_zonelist_order(s);
if (ret == 0)
strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
int ret;
static DEFINE_MUTEX(zl_order_mutex);
mutex_lock(&zl_order_mutex);
if (write) {
if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
ret = -EINVAL;
goto out;
}
strcpy(saved_string, (char *)table->data);
}
ret = proc_dostring(table, write, buffer, length, ppos);
if (write) {
int oldval = user_zonelist_order;
ret = __parse_numa_zonelist_order((char *)table->data);
if (ret) {
/*
* bogus value. restore saved string
*/
strncpy((char *)table->data, saved_string,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
Haicheng Li
committed
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
Haicheng Li
committed
mutex_unlock(&zonelists_mutex);
}
out:
mutex_unlock(&zl_order_mutex);
return ret;
Christoph Lameter
committed
#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
* find_next_best_node - find the next node that should appear in a given node's fallback list
* @node: node whose fallback list we're appending
* @used_node_mask: nodemask_t of already used nodes
*
* We use a number of factors to determine which is the next node that should
* appear on a given node's fallback list. The node should not have appeared
* already in @node's fallback list, and it should be the next closest node
* according to the distance array (which contains arbitrary distance values
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
* It returns -1 if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
node_set(node, *used_node_mask);
return node;
}
Lai Jiangshan
committed
for_each_node_state(n, N_MEMORY) {
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
/* Use the distance array to find the distance */
val = node_distance(node, n);
/* Penalize nodes under us ("prefer the next node") */
val += (n < node);
tmp = cpumask_of_node(n);
if (!cpumask_empty(tmp))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
val += node_load[n];
if (val < min_val) {
min_val = val;
best_node = n;
}
}
if (best_node >= 0)
node_set(best_node, *used_node_mask);
return best_node;
}
/*
* Build zonelists ordered by node and zones within node.
* This results in maximum locality--normal zone overflows into local
* DMA zone, if any--but risks exhausting DMA zone.
*/
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build gfp_thisnode zonelists
*/
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
int j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Build zonelists ordered by zone and nodes within zones.
* This results in conserving DMA zone[s] until all Normal memory is
* exhausted, but results in overflowing to remote node while memory
* may still exist in local DMA zone.
*/
static int node_order[MAX_NUMNODES];
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
int pos, j, node;
int zone_type; /* needs to be signed */
struct zone *z;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
pos = 0;
for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
for (j = 0; j < nr_nodes; j++) {
node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
Mel Gorman
committed
if (managed_zone(z)) {
zoneref_set_zone(z,
&zonelist->_zonerefs[pos++]);
check_highest_zone(zone_type);
}
}
}
zonelist->_zonerefs[pos].zone = NULL;
zonelist->_zonerefs[pos].zone_idx = 0;
#if defined(CONFIG_64BIT)
/*
* Devices that require DMA32/DMA are relatively rare and do not justify a
* penalty to every machine in case the specialised case applies. Default
* to Node-ordering on 64-bit NUMA machines
*/
static int default_zonelist_order(void)
{
return ZONELIST_ORDER_NODE;
}
#else
/*
* On 32-bit, the Normal zone needs to be preserved for allocations accessible
* by the kernel. If processes running on node 0 deplete the low memory zone
* then reclaim will occur more frequency increasing stalls and potentially
* be easier to OOM if a large percentage of the zone is under writeback or
* dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
* Hence, default to zone ordering on 32-bit.
*/
static int default_zonelist_order(void)
{
return ZONELIST_ORDER_ZONE;
}
#endif /* CONFIG_64BIT */
static void set_zonelist_order(void)
{
if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
current_zonelist_order = default_zonelist_order();
else
current_zonelist_order = user_zonelist_order;
}
static void build_zonelists(pg_data_t *pgdat)
{
int local_node, prev_node;
struct zonelist *zonelist;
unsigned int order = current_zonelist_order;
for (i = 0; i < MAX_ZONELISTS; i++) {
zonelist->_zonerefs[0].zone = NULL;
zonelist->_zonerefs[0].zone_idx = 0;
}
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
Christoph Lameter
committed
load = nr_online_nodes;
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
node_order[i++] = node; /* remember order */
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
build_zonelists_in_zone_order(pgdat, i);
build_thisnode_zonelists(pgdat);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
* I.e., first node id of first zone in arg node's generic zonelist.
* Used for initializing percpu 'numa_mem', which is used primarily
* for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
*/
int local_memory_node(int node)
{
struct zoneref *z;
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
return z->zone->node;
}
#endif
static void setup_min_unmapped_ratio(void);
static void setup_min_slab_ratio(void);
static void set_zonelist_order(void)
{
current_zonelist_order = ZONELIST_ORDER_ZONE;
}
static void build_zonelists(pg_data_t *pgdat)
int node, local_node;
enum zone_type j;
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
j = build_zonelists_node(pgdat, zonelist, 0);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo N)
*/
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
/*
* Boot pageset table. One per cpu which is going to be used for all
* zones and all nodes. The parameters will be set in such a way
* that an item put on a list will immediately be handed over to
* the buddy list. This is safe since pageset manipulation is done
* with interrupts disabled.
*
* The boot_pagesets must be kept even after bootup is complete for
* unused processors and/or zones. They do play a role for bootstrapping
* hotplugged processors.
*
* zoneinfo_show() and maybe other functions do
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static void setup_zone_pageset(struct zone *zone);
Haicheng Li
committed
/*
* Global mutex to protect against size modification of zonelists
* as well as to serialize pageset setup for the new populated zone.
*/
DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *data)
Yasunori Goto
committed
int nid;
pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
if (self && !node_online(self->node_id)) {
build_zonelists(self);
}
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
/*
* Initialize the boot_pagesets that are going to be used