Skip to content
Snippets Groups Projects
page_alloc.c 193 KiB
Newer Older
	if (order >= MAX_ORDER) {
		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 * We also sanity check to catch abuse of atomic reserves being used by
	 * callers that are not in atomic context.
	 */
	if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
				(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
		gfp_mask &= ~__GFP_ATOMIC;

David Rientjes's avatar
David Rientjes committed
	 * If this allocation cannot block and it is for a specific node, then
	 * fail early.  There's no need to wakeup kswapd or retry for a
	 * speculative node-specific allocation.
	if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
Linus Torvalds's avatar
Linus Torvalds committed

	 * OK, we're below the kswapd watermark and have kicked background
	 * reclaim. Now things get more complex, so set up alloc_flags according
	 * to how we want to proceed.
	alloc_flags = gfp_to_alloc_flags(gfp_mask);
Linus Torvalds's avatar
Linus Torvalds committed

	/*
	 * Find the true preferred zone if the allocation is unconstrained by
	 * cpusets.
	 */
	if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
		struct zoneref *preferred_zoneref;
		preferred_zoneref = first_zones_zonelist(ac->zonelist,
				ac->high_zoneidx, NULL, &ac->preferred_zone);
		ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
	/* This is the last chance, in general, before the goto nopage. */
	page = get_page_from_freelist(gfp_mask, order,
				alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
	if (page)
		goto got_pg;
Linus Torvalds's avatar
Linus Torvalds committed

	/* Allocate without watermarks if the context allows */
	if (alloc_flags & ALLOC_NO_WATERMARKS) {
		/*
		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
		 * the allocation is high priority and these type of
		 * allocations are system rather than user orientated
		 */
		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
		page = get_page_from_freelist(gfp_mask, order,
						ALLOC_NO_WATERMARKS, ac);
		if (page)
			goto got_pg;
	/* Caller is not willing to reclaim, we can't balance anything */
	if (!can_direct_reclaim) {
		 * All existing users of the __GFP_NOFAIL are blockable, so warn
		 * of any new users that actually allow this type of allocation
		 * to fail.
		 */
		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
Linus Torvalds's avatar
Linus Torvalds committed
		goto nopage;
Linus Torvalds's avatar
Linus Torvalds committed

	/* Avoid recursion of direct reclaim */
	if (current->flags & PF_MEMALLOC) {
		/*
		 * __GFP_NOFAIL request from this context is rather bizarre
		 * because we cannot reclaim anything and only can loop waiting
		 * for somebody to do a work for us.
		 */
		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
			cond_resched();
			goto retry;
		}
	/* Avoid allocations with no watermarks from looping endlessly */
	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
		goto nopage;

	/*
	 * Try direct compaction. The first pass is asynchronous. Subsequent
	 * attempts after direct reclaim are synchronous
	 */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
					migration_mode,
					&contended_compaction,
	/* Checks for THP-specific high-order allocations */
		/*
		 * If compaction is deferred for high-order allocations, it is
		 * because sync compaction recently failed. If this is the case
		 * and the caller requested a THP allocation, we do not want
		 * to heavily disrupt the system, so we fail the allocation
		 * instead of entering direct reclaim.
		 */
		if (deferred_compaction)
			goto nopage;

		/*
		 * In all zones where compaction was attempted (and not
		 * deferred or skipped), lock contention has been detected.
		 * For THP allocation we do not want to disrupt the others
		 * so we fallback to base pages instead.
		 */
		if (contended_compaction == COMPACT_CONTENDED_LOCK)
			goto nopage;

		/*
		 * If compaction was aborted due to need_resched(), we do not
		 * want to further increase allocation latency, unless it is
		 * khugepaged trying to collapse.
		 */
		if (contended_compaction == COMPACT_CONTENDED_SCHED
			&& !(current->flags & PF_KTHREAD))
			goto nopage;
	}
	/*
	 * It can become very expensive to allocate transparent hugepages at
	 * fault, so use asynchronous memory compaction for THP unless it is
	 * khugepaged trying to collapse.
	 */
	if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
		migration_mode = MIGRATE_SYNC_LIGHT;

	/* Try direct reclaim and then allocating */
	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
							&did_some_progress);
Linus Torvalds's avatar
Linus Torvalds committed

	/* Do not loop if specifically requested */
	if (gfp_mask & __GFP_NORETRY)
		goto noretry;

	/* Keep reclaiming pages as long as there is reasonable progress */
	pages_reclaimed += did_some_progress;
	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
		/* Wait for some write requests to complete then retry */
		wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
	/* Reclaim has failed us, start killing things */
	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
	if (page)
		goto got_pg;

	/* Retry as long as the OOM killer is making progress */
	if (did_some_progress)
		goto retry;

noretry:
	/*
	 * High-order allocations do not necessarily loop after
	 * direct reclaim and reclaim/compaction depends on compaction
	 * being called after reclaim so call directly if necessary
	 */
	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
					    ac, migration_mode,
					    &contended_compaction,
					    &deferred_compaction);
	if (page)
		goto got_pg;
Linus Torvalds's avatar
Linus Torvalds committed
nopage:
	warn_alloc_failed(gfp_mask, order, NULL);
Linus Torvalds's avatar
Linus Torvalds committed
got_pg:
Linus Torvalds's avatar
Linus Torvalds committed
}

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct zoneref *preferred_zoneref;
	struct page *page = NULL;
	unsigned int cpuset_mems_cookie;
	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = {
		.high_zoneidx = gfp_zone(gfp_mask),
		.nodemask = nodemask,
		.migratetype = gfpflags_to_migratetype(gfp_mask),
	};
	gfp_mask &= gfp_allowed_mask;

	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
David Rientjes's avatar
David Rientjes committed
	 * of __GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;

	cpuset_mems_cookie = read_mems_allowed_begin();
	/* We set it here, as __alloc_pages_slowpath might have changed it */
	ac.zonelist = zonelist;

	/* Dirty zone balancing only done in the fast path */
	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/* The preferred zone is used for statistics later */
	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
				ac.nodemask ? : &cpuset_current_mems_allowed,
				&ac.preferred_zone);
	if (!ac.preferred_zone)
	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
	alloc_mask = gfp_mask|__GFP_HARDWALL;
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (unlikely(!page)) {
		/*
		 * Runtime PM, block IO and its error handling path
		 * can deadlock because I/O on the device might not
		 * complete.
		 */
		alloc_mask = memalloc_noio_flags(gfp_mask);
		page = __alloc_pages_slowpath(alloc_mask, order, &ac);
	if (kmemcheck_enabled && page)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
	/*
	 * When updating a task's mems_allowed, it is possible to race with
	 * parallel threads in such a way that an allocation can fail while
	 * the mask is being updated. If a page allocation is about to fail,
	 * check if the cpuset changed during allocation and if so, retry.
	 */
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
Linus Torvalds's avatar
Linus Torvalds committed
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
Linus Torvalds's avatar
Linus Torvalds committed

/*
 * Common helper functions.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct page *page;

	/*
	 * __get_free_pages() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

Linus Torvalds's avatar
Linus Torvalds committed
	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);

unsigned long get_zeroed_page(gfp_t gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
{
	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
Linus Torvalds's avatar
Linus Torvalds committed
}
EXPORT_SYMBOL(get_zeroed_page);

void __free_pages(struct page *page, unsigned int order)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (put_page_testzero(page)) {
Linus Torvalds's avatar
Linus Torvalds committed
		if (order == 0)
Linus Torvalds's avatar
Linus Torvalds committed
		else
			__free_pages_ok(page, order);
	}
}

EXPORT_SYMBOL(__free_pages);

void free_pages(unsigned long addr, unsigned int order)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (addr != 0) {
Nick Piggin's avatar
Nick Piggin committed
		VM_BUG_ON(!virt_addr_valid((void *)addr));
Linus Torvalds's avatar
Linus Torvalds committed
		__free_pages(virt_to_page((void *)addr), order);
	}
}

EXPORT_SYMBOL(free_pages);

/*
 * Page Fragment:
 *  An arbitrary-length arbitrary-offset area of memory which resides
 *  within a 0 or higher order page.  Multiple fragments within that page
 *  are individually refcounted, in the page's reference counter.
 *
 * The page_frag functions below provide a simple allocation framework for
 * page fragments.  This is used by the network stack and network device
 * drivers to provide a backing region of memory for use as either an
 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
 */
static struct page *__page_frag_refill(struct page_frag_cache *nc,
				       gfp_t gfp_mask)
{
	struct page *page = NULL;
	gfp_t gfp = gfp_mask;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
		    __GFP_NOMEMALLOC;
	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
				PAGE_FRAG_CACHE_MAX_ORDER);
	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
#endif
	if (unlikely(!page))
		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);

	nc->va = page ? page_address(page) : NULL;

	return page;
}

void *__alloc_page_frag(struct page_frag_cache *nc,
			unsigned int fragsz, gfp_t gfp_mask)
{
	unsigned int size = PAGE_SIZE;
	struct page *page;
	int offset;

	if (unlikely(!nc->va)) {
refill:
		page = __page_frag_refill(nc, gfp_mask);
		if (!page)
			return NULL;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
		/* if size can vary use size else just use PAGE_SIZE */
		size = nc->size;
#endif
		/* Even if we own the page, we do not use atomic_set().
		 * This would break get_page_unless_zero() users.
		 */
		atomic_add(size - 1, &page->_count);

		/* reset page count bias and offset to start of new frag */
		nc->pfmemalloc = page_is_pfmemalloc(page);
		nc->pagecnt_bias = size;
		nc->offset = size;
	}

	offset = nc->offset - fragsz;
	if (unlikely(offset < 0)) {
		page = virt_to_page(nc->va);

		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
			goto refill;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
		/* if size can vary use size else just use PAGE_SIZE */
		size = nc->size;
#endif
		/* OK, page count is 0, we can safely set it */
		atomic_set(&page->_count, size);

		/* reset page count bias and offset to start of new frag */
		nc->pagecnt_bias = size;
		offset = size - fragsz;
	}

	nc->pagecnt_bias--;
	nc->offset = offset;

	return nc->va + offset;
}
EXPORT_SYMBOL(__alloc_page_frag);

/*
 * Frees a page fragment allocated out of either a compound or order 0 page.
 */
void __free_page_frag(void *addr)
{
	struct page *page = virt_to_head_page(addr);

	if (unlikely(put_page_testzero(page)))
		__free_pages_ok(page, compound_order(page));
}
EXPORT_SYMBOL(__free_page_frag);

 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
 * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
 * equivalent to alloc_pages.
 * It should be used when the caller would like to use kmalloc, but since the
 * allocation is large, it has to fall back to the page allocator.
 */
struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	page = alloc_pages(gfp_mask, order);
	if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
		__free_pages(page, order);
		page = NULL;
	}
	return page;
}

struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	page = alloc_pages_node(nid, gfp_mask, order);
	if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
		__free_pages(page, order);
		page = NULL;
	}
	return page;
}

/*
 * __free_kmem_pages and free_kmem_pages will free pages allocated with
 * alloc_kmem_pages.
void __free_kmem_pages(struct page *page, unsigned int order)
	memcg_kmem_uncharge(page, order);
	__free_pages(page, order);
}

void free_kmem_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_kmem_pages(virt_to_page((void *)addr), order);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
		size_t size)
{
	if (addr) {
		unsigned long alloc_end = addr + (PAGE_SIZE << order);
		unsigned long used = addr + PAGE_ALIGN(size);

		split_page(virt_to_page((void *)addr), order);
		while (used < alloc_end) {
			free_page(used);
			used += PAGE_SIZE;
		}
	}
	return (void *)addr;
}

/**
 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * This function is similar to alloc_pages(), except that it allocates the
 * minimum number of pages to satisfy the request.  alloc_pages() can only
 * allocate memory in power-of-two pages.
 *
 * This function is also limited by MAX_ORDER.
 *
 * Memory allocated by this function must be released by free_pages_exact().
 */
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
	unsigned int order = get_order(size);
	unsigned long addr;

	addr = __get_free_pages(gfp_mask, order);
	return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);

/**
 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
 *			   pages on a node.
 * @nid: the preferred node ID where memory should be allocated
 * @size: the number of bytes to allocate
 * @gfp_mask: GFP flags for the allocation
 *
 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
 * back.
 */
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
	unsigned int order = get_order(size);
	struct page *p = alloc_pages_node(nid, gfp_mask, order);
	if (!p)
		return NULL;
	return make_alloc_exact((unsigned long)page_address(p), order, size);
}

/**
 * free_pages_exact - release memory allocated via alloc_pages_exact()
 * @virt: the value returned by alloc_pages_exact.
 * @size: size of allocation, same value as passed to alloc_pages_exact().
 *
 * Release the memory allocated by a previous call to alloc_pages_exact.
 */
void free_pages_exact(void *virt, size_t size)
{
	unsigned long addr = (unsigned long)virt;
	unsigned long end = addr + PAGE_ALIGN(size);

	while (addr < end) {
		free_page(addr);
		addr += PAGE_SIZE;
	}
}
EXPORT_SYMBOL(free_pages_exact);

/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of counts pages which are beyond the
 * high watermark within all zones at or below a given zone index.  For each
 * zone, the number of pages is calculated as:
 *     managed_pages - high_pages
static unsigned long nr_free_zone_pages(int offset)
Linus Torvalds's avatar
Linus Torvalds committed
{
	/* Just pick one node, since fallback list is circular */
	unsigned long sum = 0;
Linus Torvalds's avatar
Linus Torvalds committed

	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed

	for_each_zone_zonelist(zone, z, zonelist, offset) {
		unsigned long size = zone->managed_pages;
		unsigned long high = high_wmark_pages(zone);
		if (size > high)
			sum += size - high;
/**
 * nr_free_buffer_pages - count number of pages beyond high watermark
 *
 * nr_free_buffer_pages() counts the number of pages which are beyond the high
 * watermark within ZONE_DMA and ZONE_NORMAL.
Linus Torvalds's avatar
Linus Torvalds committed
 */
unsigned long nr_free_buffer_pages(void)
Linus Torvalds's avatar
Linus Torvalds committed
{
Al Viro's avatar
Al Viro committed
	return nr_free_zone_pages(gfp_zone(GFP_USER));
Linus Torvalds's avatar
Linus Torvalds committed
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
Linus Torvalds's avatar
Linus Torvalds committed

/**
 * nr_free_pagecache_pages - count number of pages beyond high watermark
 *
 * nr_free_pagecache_pages() counts the number of pages which are beyond the
 * high watermark within all zones.
Linus Torvalds's avatar
Linus Torvalds committed
 */
unsigned long nr_free_pagecache_pages(void)
Linus Torvalds's avatar
Linus Torvalds committed
{
Mel Gorman's avatar
Mel Gorman committed
	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
Linus Torvalds's avatar
Linus Torvalds committed
}

static inline void show_node(struct zone *zone)
Linus Torvalds's avatar
Linus Torvalds committed
{
	if (IS_ENABLED(CONFIG_NUMA))
Linus Torvalds's avatar
Linus Torvalds committed
}

void si_meminfo(struct sysinfo *val)
{
	val->totalram = totalram_pages;
	val->sharedram = global_page_state(NR_SHMEM);
	val->freeram = global_page_state(NR_FREE_PAGES);
Linus Torvalds's avatar
Linus Torvalds committed
	val->bufferram = nr_blockdev_pages();
	val->totalhigh = totalhigh_pages;
	val->freehigh = nr_free_highpages();
	val->mem_unit = PAGE_SIZE;
}

EXPORT_SYMBOL(si_meminfo);

#ifdef CONFIG_NUMA
void si_meminfo_node(struct sysinfo *val, int nid)
{
	int zone_type;		/* needs to be signed */
	unsigned long managed_pages = 0;
Linus Torvalds's avatar
Linus Torvalds committed
	pg_data_t *pgdat = NODE_DATA(nid);

	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
		managed_pages += pgdat->node_zones[zone_type].managed_pages;
	val->totalram = managed_pages;
	val->sharedram = node_page_state(nid, NR_SHMEM);
	val->freeram = node_page_state(nid, NR_FREE_PAGES);
	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
			NR_FREE_PAGES);
#else
	val->totalhigh = 0;
	val->freehigh = 0;
#endif
Linus Torvalds's avatar
Linus Torvalds committed
	val->mem_unit = PAGE_SIZE;
}
#endif

 * Determine whether the node should be displayed or not, depending on whether
 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
bool skip_free_areas_node(unsigned int flags, int nid)
	unsigned int cpuset_mems_cookie;

	if (!(flags & SHOW_MEM_FILTER_NODES))
		goto out;

		cpuset_mems_cookie = read_mems_allowed_begin();
		ret = !node_isset(nid, cpuset_current_mems_allowed);
	} while (read_mems_allowed_retry(cpuset_mems_cookie));
Linus Torvalds's avatar
Linus Torvalds committed
#define K(x) ((x) << (PAGE_SHIFT-10))

static void show_migration_types(unsigned char type)
{
	static const char types[MIGRATE_TYPES] = {
		[MIGRATE_UNMOVABLE]	= 'U',
		[MIGRATE_MOVABLE]	= 'M',
		[MIGRATE_RECLAIMABLE]	= 'E',
		[MIGRATE_HIGHATOMIC]	= 'H',
#ifdef CONFIG_CMA
		[MIGRATE_CMA]		= 'C',
#endif
#ifdef CONFIG_MEMORY_ISOLATION
		[MIGRATE_ISOLATE]	= 'I',
	};
	char tmp[MIGRATE_TYPES + 1];
	char *p = tmp;
	int i;

	for (i = 0; i < MIGRATE_TYPES; i++) {
		if (type & (1 << i))
			*p++ = types[i];
	}

	*p = '\0';
	printk("(%s) ", tmp);
}

Linus Torvalds's avatar
Linus Torvalds committed
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 *
 * Bits in @filter:
 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
 *   cpuset.
Linus Torvalds's avatar
Linus Torvalds committed
 */
void show_free_areas(unsigned int filter)
Linus Torvalds's avatar
Linus Torvalds committed
{
	unsigned long free_pcp = 0;
Linus Torvalds's avatar
Linus Torvalds committed
	struct zone *zone;

	for_each_populated_zone(zone) {
		if (skip_free_areas_node(filter, zone_to_nid(zone)))
		for_each_online_cpu(cpu)
			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
		" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
		" free:%lu free_pcp:%lu free_cma:%lu\n",
		global_page_state(NR_ACTIVE_ANON),
		global_page_state(NR_INACTIVE_ANON),
		global_page_state(NR_ISOLATED_ANON),
		global_page_state(NR_ACTIVE_FILE),
		global_page_state(NR_INACTIVE_FILE),
		global_page_state(NR_ISOLATED_FILE),
		global_page_state(NR_UNEVICTABLE),
		global_page_state(NR_SLAB_RECLAIMABLE),
		global_page_state(NR_SLAB_UNRECLAIMABLE),
		global_page_state(NR_FILE_MAPPED),
		global_page_state(NR_SHMEM),
		global_page_state(NR_PAGETABLE),
		global_page_state(NR_BOUNCE),
		global_page_state(NR_FREE_PAGES),
		free_pcp,
		global_page_state(NR_FREE_CMA_PAGES));
Linus Torvalds's avatar
Linus Torvalds committed

	for_each_populated_zone(zone) {
Linus Torvalds's avatar
Linus Torvalds committed
		int i;

		if (skip_free_areas_node(filter, zone_to_nid(zone)))

		free_pcp = 0;
		for_each_online_cpu(cpu)
			free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;

Linus Torvalds's avatar
Linus Torvalds committed
		show_node(zone);
		printk("%s"
			" free:%lukB"
			" min:%lukB"
			" low:%lukB"
			" high:%lukB"
			" active_anon:%lukB"
			" inactive_anon:%lukB"
			" active_file:%lukB"
			" inactive_file:%lukB"
			" unevictable:%lukB"
			" isolated(anon):%lukB"
			" isolated(file):%lukB"
Linus Torvalds's avatar
Linus Torvalds committed
			" present:%lukB"
			" mlocked:%lukB"
			" dirty:%lukB"
			" writeback:%lukB"
			" mapped:%lukB"
			" shmem:%lukB"
			" slab_reclaimable:%lukB"
			" slab_unreclaimable:%lukB"
			" pagetables:%lukB"
			" unstable:%lukB"
			" bounce:%lukB"
			" free_pcp:%lukB"
			" local_pcp:%ukB"
			" free_cma:%lukB"
Linus Torvalds's avatar
Linus Torvalds committed
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
			K(zone_page_state(zone, NR_FREE_PAGES)),
			K(min_wmark_pages(zone)),
			K(low_wmark_pages(zone)),
			K(high_wmark_pages(zone)),
			K(zone_page_state(zone, NR_ACTIVE_ANON)),
			K(zone_page_state(zone, NR_INACTIVE_ANON)),
			K(zone_page_state(zone, NR_ACTIVE_FILE)),
			K(zone_page_state(zone, NR_INACTIVE_FILE)),
			K(zone_page_state(zone, NR_UNEVICTABLE)),
			K(zone_page_state(zone, NR_ISOLATED_ANON)),
			K(zone_page_state(zone, NR_ISOLATED_FILE)),
Linus Torvalds's avatar
Linus Torvalds committed
			K(zone->present_pages),
			K(zone->managed_pages),
			K(zone_page_state(zone, NR_MLOCK)),
			K(zone_page_state(zone, NR_FILE_DIRTY)),
			K(zone_page_state(zone, NR_WRITEBACK)),
			K(zone_page_state(zone, NR_FILE_MAPPED)),
			K(zone_page_state(zone, NR_SHMEM)),
			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
			zone_page_state(zone, NR_KERNEL_STACK) *
				THREAD_SIZE / 1024,
			K(zone_page_state(zone, NR_PAGETABLE)),
			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
			K(zone_page_state(zone, NR_BOUNCE)),
			K(free_pcp),
			K(this_cpu_read(zone->pageset->pcp.count)),
			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
			K(zone_page_state(zone, NR_PAGES_SCANNED)),
			(!zone_reclaimable(zone) ? "yes" : "no")
Linus Torvalds's avatar
Linus Torvalds committed
			);
		printk("lowmem_reserve[]:");
		for (i = 0; i < MAX_NR_ZONES; i++)
Linus Torvalds's avatar
Linus Torvalds committed
		printk("\n");
	}

	for_each_populated_zone(zone) {
		unsigned int order;
		unsigned long nr[MAX_ORDER], flags, total = 0;
		unsigned char types[MAX_ORDER];
Linus Torvalds's avatar
Linus Torvalds committed

		if (skip_free_areas_node(filter, zone_to_nid(zone)))
Linus Torvalds's avatar
Linus Torvalds committed
		show_node(zone);
		printk("%s: ", zone->name);

		spin_lock_irqsave(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			struct free_area *area = &zone->free_area[order];
			int type;

			nr[order] = area->nr_free;
			total += nr[order] << order;

			types[order] = 0;
			for (type = 0; type < MIGRATE_TYPES; type++) {
				if (!list_empty(&area->free_list[type]))
					types[order] |= 1 << type;
			}
Linus Torvalds's avatar
Linus Torvalds committed
		}
		spin_unlock_irqrestore(&zone->lock, flags);
		for (order = 0; order < MAX_ORDER; order++) {
			printk("%lu*%lukB ", nr[order], K(1UL) << order);
			if (nr[order])
				show_migration_types(types[order]);
		}
Linus Torvalds's avatar
Linus Torvalds committed
		printk("= %lukB\n", K(total));
	}

	hugetlb_show_meminfo();

	printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));

Linus Torvalds's avatar
Linus Torvalds committed
	show_swap_cache_info();
}

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone = zone;
	zoneref->zone_idx = zone_idx(zone);
}

Linus Torvalds's avatar
Linus Torvalds committed
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
Linus Torvalds's avatar
Linus Torvalds committed
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
Linus Torvalds's avatar
Linus Torvalds committed
{
	enum zone_type zone_type = MAX_NR_ZONES;
		zone_type--;
		zone = pgdat->node_zones + zone_type;
			zoneref_set_zone(zone,
				&zonelist->_zonerefs[nr_zones++]);
			check_highest_zone(zone_type);
Linus Torvalds's avatar
Linus Torvalds committed
		}
	} while (zone_type);

/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

/* zonelist order in the kernel.
 * set_zonelist_order() will set this to NODE or ZONE.
 */
static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};


Linus Torvalds's avatar
Linus Torvalds committed
#ifdef CONFIG_NUMA
/* The value user specified ....changed by config */
static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
/* string for sysctl */
#define NUMA_ZONELIST_ORDER_LEN	16
char numa_zonelist_order[16] = "default";

/*
 * interface for configure zonelist ordering.
 * command line option "numa_zonelist_order"
 *	= "[dD]efault	- default, automatic configuration.
 *	= "[nN]ode 	- order by node locality, then by zone within node
 *	= "[zZ]one      - order by zone, then by locality within zone
 */

static int __parse_numa_zonelist_order(char *s)
{
	if (*s == 'd' || *s == 'D') {
		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
	} else if (*s == 'n' || *s == 'N') {
		user_zonelist_order = ZONELIST_ORDER_NODE;
	} else if (*s == 'z' || *s == 'Z') {
		user_zonelist_order = ZONELIST_ORDER_ZONE;
	} else {
		printk(KERN_WARNING
			"Ignoring invalid numa_zonelist_order value:  "
			"%s\n", s);
		return -EINVAL;
	}
	return 0;
}

static __init int setup_numa_zonelist_order(char *s)
{
	int ret;

	if (!s)
		return 0;

	ret = __parse_numa_zonelist_order(s);
	if (ret == 0)
		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);

	return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);

/*
 * sysctl handler for numa_zonelist_order
 */
int numa_zonelist_order_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *length,
		loff_t *ppos)
{
	char saved_string[NUMA_ZONELIST_ORDER_LEN];
	int ret;
	static DEFINE_MUTEX(zl_order_mutex);
	mutex_lock(&zl_order_mutex);
	if (write) {
		if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
			ret = -EINVAL;
			goto out;
		}
		strcpy(saved_string, (char *)table->data);
	}
	ret = proc_dostring(table, write, buffer, length, ppos);
	if (write) {
		int oldval = user_zonelist_order;

		ret = __parse_numa_zonelist_order((char *)table->data);
		if (ret) {
			/*
			 * bogus value.  restore saved string
			 */
			strncpy((char *)table->data, saved_string,
				NUMA_ZONELIST_ORDER_LEN);
			user_zonelist_order = oldval;
		} else if (oldval != user_zonelist_order) {
			mutex_lock(&zonelists_mutex);
			build_all_zonelists(NULL, NULL);