		if (likely(!cold))
			list_add(&page->lru, list);
		else
			list_add_tail(&page->lru, list);
		list = &page->lru;
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
					      -(1 << order));
	}
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return i;
}

#ifdef CONFIG_NUMA
/*
 * Called from the vmstat counter updater to drain pagesets of this
 * currently executing processor on remote nodes after they have
 * expired.
 *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	unsigned long flags;
	int to_drain, batch;

	local_irq_save(flags);
	batch = READ_ONCE(pcp->batch);
	to_drain = min(pcp->count, batch);
	if (to_drain > 0) {
		free_pcppages_bulk(zone, to_drain, pcp);
		pcp->count -= to_drain;
	}
	local_irq_restore(flags);
}
#endif

/*
 * Drain pcplists of the indicated processor and zone.
 *
 * The processor must either be the current processor and the
 * thread pinned to the current processor or a processor that
 * is not online.
 */
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
	unsigned long flags;
	struct per_cpu_pageset *pset;
	struct per_cpu_pages *pcp;

	local_irq_save(flags);
	pset = per_cpu_ptr(zone->pageset, cpu);

	pcp = &pset->pcp;
	if (pcp->count) {
		free_pcppages_bulk(zone, pcp->count, pcp);
		pcp->count = 0;
	}
	local_irq_restore(flags);
}
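
/*
 * Illustrative sketch only (not code used by this file): reading how many
 * order-0 pages a given CPU currently caches for a zone uses the same
 * per-cpu lookup as drain_pages_zone() above:
 *
 *	struct per_cpu_pageset *pset = per_cpu_ptr(zone->pageset, cpu);
 *	int cached = pset->pcp.count;
 *
 * Draining is only safe when that CPU is the pinned current CPU or is
 * offline, because pcp->count and the lists are updated without a lock.
 */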
/*
 * Drain pcplists of all zones on the indicated processor.
 *
 * The processor must either be the current processor and the
 * thread pinned to the current processor or a processor that
 * is not online.
 */
static void drain_pages(unsigned int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone) {
		drain_pages_zone(cpu, zone);
	}
}

/*
 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
 *
 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
 * the single zone's pages.
 */
void drain_local_pages(struct zone *zone)
{
	int cpu = smp_processor_id();

	if (zone)
		drain_pages_zone(cpu, zone);
	else
		drain_pages(cpu);
}

/*
 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
 *
 * When zone parameter is non-NULL, spill just the single zone's pages.
 *
 * Note that this code is protected against sending an IPI to an offline
 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
 * nothing keeps CPUs from showing up after we populated the cpumask and
 * before the call to on_each_cpu_mask().
 */
void drain_all_pages(struct zone *zone)
{
	int cpu;

	/*
	 * Allocate in the BSS so we won't require allocation in
	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
	 */
	static cpumask_t cpus_with_pcps;

	/*
	 * We don't care about racing with CPU hotplug event
	 * as offline notification will cause the notified
	 * cpu to drain that CPU pcps and on_each_cpu_mask
	 * disables preemption as part of its processing
	 */
	for_each_online_cpu(cpu) {
		struct per_cpu_pageset *pcp;
		struct zone *z;
		bool has_pcps = false;

		if (zone) {
			pcp = per_cpu_ptr(zone->pageset, cpu);
			if (pcp->pcp.count)
				has_pcps = true;
		} else {
			for_each_populated_zone(z) {
				pcp = per_cpu_ptr(z->pageset, cpu);
				if (pcp->pcp.count) {
					has_pcps = true;
					break;
				}
			}
		}

		if (has_pcps)
			cpumask_set_cpu(cpu, &cpus_with_pcps);
		else
			cpumask_clear_cpu(cpu, &cpus_with_pcps);
	}
	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
								zone, 1);
}

#ifdef CONFIG_HIBERNATION

void mark_free_pages(struct zone *zone)
{
	unsigned long pfn, max_zone_pfn;
	unsigned long flags;
	unsigned int order, t;
	struct page *page;

	if (zone_is_empty(zone))
		return;

	spin_lock_irqsave(&zone->lock, flags);
	max_zone_pfn = zone_end_pfn(zone);
	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);

			if (page_zone(page) != zone)
				continue;

			if (!swsusp_page_is_forbidden(page))
				swsusp_unset_page_free(page);
		}

	for_each_migratetype_order(order, t) {
		list_for_each_entry(page,
				&zone->free_area[order].free_list[t], lru) {
			unsigned long i;

			pfn = page_to_pfn(page);
			for (i = 0; i < (1UL << order); i++)
				swsusp_set_page_free(pfn_to_page(pfn + i));
		}
	}
	spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_PM */

/*
 * Free a 0-order page
 * cold == true ? free a cold page : free a hot page
 */
void free_hot_cold_page(struct page *page, bool cold)
{
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
	unsigned long flags;
	unsigned long pfn = page_to_pfn(page);
	int migratetype;

	if (!free_pages_prepare(page, 0))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	set_pcppage_migratetype(page, migratetype);
	local_irq_save(flags);
	__count_vm_event(PGFREE);
	/*
	 * We only track unmovable, reclaimable and movable on pcp lists.
	 * Free ISOLATE pages back to the allocator because they are being
	 * offlined but treat RESERVE as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
	 * excessively into the page allocator
	 */
	if (migratetype >= MIGRATE_PCPTYPES) {
		if (unlikely(is_migrate_isolate(migratetype))) {
			free_one_page(zone, page, pfn, 0, migratetype);
			goto out;
		}
		migratetype = MIGRATE_MOVABLE;
	}

	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	if (!cold)
		list_add(&page->lru, &pcp->lists[migratetype]);
	else
		list_add_tail(&page->lru, &pcp->lists[migratetype]);
	pcp->count++;
	if (pcp->count >= pcp->high) {
		unsigned long batch = READ_ONCE(pcp->batch);
		free_pcppages_bulk(zone, batch, pcp);
		pcp->count -= batch;
	}

out:
	local_irq_restore(flags);
}
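
/*
 * Usage sketch (illustrative): a page expected to be re-used soon is freed
 * "hot" so it lands at the head of the pcplist and is handed out first,
 * while a page whose cache lines are likely cold is freed "cold" and
 * queued at the tail:
 *
 *	free_hot_cold_page(page, false);	(hot)
 *	free_hot_cold_page(page, true);		(cold, e.g. dropped readahead)
 */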

/*
 * Free a list of 0-order pages
 */
void free_hot_cold_page_list(struct list_head *list, bool cold)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		trace_mm_page_free_batched(page, cold);
		free_hot_cold_page(page, cold);
	}
}
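
/*
 * Illustrative sketch: callers typically batch pages on a private list,
 * linked through page->lru, and release them in one pass:
 *
 *	LIST_HEAD(pages_to_free);
 *	list_add(&page->lru, &pages_to_free);
 *	free_hot_cold_page_list(&pages_to_free, true);
 */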

/*
 * split_page takes a non-compound higher-order page, and splits it into
 * n (1<<order) sub-pages: page[0..n]
 * Each sub-page must be freed individually.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
void split_page(struct page *page, unsigned int order)
{
	int i;
	gfp_t gfp_mask;

	VM_BUG_ON_PAGE(PageCompound(page), page);
	VM_BUG_ON_PAGE(!page_count(page), page);

#ifdef CONFIG_KMEMCHECK
	/*
	 * Split shadow pages too, because free(page[0]) would
	 * otherwise free the whole shadow.
	 */
	if (kmemcheck_page_is_tracked(page))
		split_page(virt_to_page(page[0].shadow), order);
#endif

	gfp_mask = get_page_owner_gfp(page);
	set_page_owner(page, 0, gfp_mask);
	for (i = 1; i < (1 << order); i++) {
		set_page_refcounted(page + i);
		set_page_owner(page + i, 0, gfp_mask);
	}
}
EXPORT_SYMBOL_GPL(split_page);
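
/*
 * Usage sketch (illustrative): after a higher-order allocation,
 * split_page() turns the tail pages into independent order-0 pages that
 * can be freed one at a time:
 *
 *	struct page *page = alloc_pages(GFP_KERNEL, 2);	(four pages)
 *	if (page) {
 *		split_page(page, 2);
 *		__free_page(page + 3);	(free just the last sub-page)
 *	}
 */
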
int __isolate_free_page(struct page *page, unsigned int order)
{
	unsigned long watermark;
	struct zone *zone;
	int mt;

	BUG_ON(!PageBuddy(page));

	zone = page_zone(page);
	mt = get_pageblock_migratetype(page);

	if (!is_migrate_isolate(mt)) {
		/* Obey watermarks as if the page was being allocated */
		watermark = low_wmark_pages(zone) + (1 << order);
		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
			return 0;

		__mod_zone_freepage_state(zone, -(1UL << order), mt);
	}

	/* Remove page from free list */
	list_del(&page->lru);
	zone->free_area[order].nr_free--;
	rmv_page_order(page);
	set_page_owner(page, order, __GFP_MOVABLE);
	/* Set the pageblock if the isolated page is at least a pageblock */
	if (order >= pageblock_order - 1) {
		struct page *endpage = page + (1 << order) - 1;
		for (; page < endpage; page += pageblock_nr_pages) {
			int mt = get_pageblock_migratetype(page);
			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
				set_pageblock_migratetype(page,
							  MIGRATE_MOVABLE);
		}
	}

	return (1UL << order);
}

/*
 * Similar to split_page except the page is already free. As this is only
 * being used for migration, the migratetype of the block also changes.
 * As this is called with interrupts disabled, the caller is responsible
 * for calling arch_alloc_page() and kernel_map_pages() after interrupts
 * are enabled.
 *
 * Note: this is probably too low level an operation for use in drivers.
 * Please consult with lkml before using this in your driver.
 */
int split_free_page(struct page *page)
{
	unsigned int order;
	int nr_pages;

	order = page_order(page);

	nr_pages = __isolate_free_page(page, order);
	if (!nr_pages)
		return 0;

	/* Split into individual pages */
	set_page_refcounted(page);
	split_page(page, order);
	return nr_pages;
}

/*
 * Update NUMA hit/miss statistics
 *
 * Must be called with interrupts disabled.
 *
 * When __GFP_OTHER_NODE is set assume the node of the preferred
 * zone is the local node. This is useful for daemons who allocate
 * memory on behalf of other processes.
 */
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
								gfp_t flags)
{
#ifdef CONFIG_NUMA
	int local_nid = numa_node_id();
	enum zone_stat_item local_stat = NUMA_LOCAL;

	if (unlikely(flags & __GFP_OTHER_NODE)) {
		local_stat = NUMA_OTHER;
		local_nid = preferred_zone->node;
	}

	if (z->node == local_nid) {
		__inc_zone_state(z, NUMA_HIT);
		__inc_zone_state(z, local_stat);
	} else {
		__inc_zone_state(z, NUMA_MISS);
		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
	}
#endif
}
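
/*
 * Example (illustrative): a task running on node 0 whose preferred zone is
 * on node 0 but whose page comes from a node 1 zone bumps NUMA_MISS on the
 * node 1 zone and NUMA_FOREIGN on the preferred zone; an allocation
 * satisfied on node 0 bumps NUMA_HIT and NUMA_LOCAL instead.
 */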

/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, int alloc_flags, int migratetype)
{
	unsigned long flags;
	struct page *page;
	bool cold = ((gfp_flags & __GFP_COLD) != 0);

	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
			page = list_last_entry(list, struct page, lru);
		else
			page = list_first_entry(list, struct page, lru);

		__dec_zone_state(zone, NR_ALLOC_BATCH);
		list_del(&page->lru);
		pcp->count--;
	} else {
		/*
		 * We most definitely don't want callers attempting to
		 * allocate greater than order-1 page units with __GFP_NOFAIL.
		 */
		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
		spin_lock_irqsave(&zone->lock, flags);

		page = NULL;
		if (alloc_flags & ALLOC_HARDER) {
			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
			if (page)
				trace_mm_page_alloc_zone_locked(page, order, migratetype);
		}
		if (!page)
			page = __rmqueue(zone, order, migratetype);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
		__mod_zone_freepage_state(zone, -(1 << order),
					  get_pcppage_migratetype(page));
	}

	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);

	VM_BUG_ON_PAGE(bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}

#ifdef CONFIG_FAIL_PAGE_ALLOC

static struct {
	struct fault_attr attr;

	bool ignore_gfp_highmem;
	bool ignore_gfp_reclaim;
	u32 min_order;
} fail_page_alloc = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_gfp_reclaim = true,
	.ignore_gfp_highmem = true,
	.min_order = 1,
};

static int __init setup_fail_page_alloc(char *str)
{
	return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	if (order < fail_page_alloc.min_order)
		return false;
	if (gfp_mask & __GFP_NOFAIL)
		return false;
	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
		return false;
	if (fail_page_alloc.ignore_gfp_reclaim &&
			(gfp_mask & __GFP_DIRECT_RECLAIM))
		return false;

	return should_fail(&fail_page_alloc.attr, 1 << order);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_page_alloc_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
					&fail_page_alloc.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);
	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
				&fail_page_alloc.ignore_gfp_reclaim))
		goto fail;
	if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
				&fail_page_alloc.ignore_gfp_highmem))
		goto fail;
	if (!debugfs_create_u32("min-order", mode, dir,
				&fail_page_alloc.min_order))
		goto fail;

	return 0;
fail:
	debugfs_remove_recursive(dir);

	return -ENOMEM;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
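
/*
 * Runtime sketch (illustrative, using the generic attributes created by
 * fault_create_debugfs_attr() plus the ones added above):
 *
 *	echo 10 > /sys/kernel/debug/fail_page_alloc/probability
 *	echo -1 > /sys/kernel/debug/fail_page_alloc/times
 *	echo 1  > /sys/kernel/debug/fail_page_alloc/min-order
 */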

#else /* CONFIG_FAIL_PAGE_ALLOC */

static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
	return false;
}

#endif /* CONFIG_FAIL_PAGE_ALLOC */

/*
 * Return true if free base pages are above 'mark'. For high-order checks it
 * will return true if the order-0 watermark is reached and there is at least
 * one free page of a suitable size. Checking now avoids taking the zone lock
 * to check in the allocation paths if no pages are free.
 */
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags,
			long free_pages)
{
	long min = mark;
	int o;
	const int alloc_harder = (alloc_flags & ALLOC_HARDER);

	/* free_pages may go negative - that's OK */
	free_pages -= (1 << order) - 1;
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;

	/*
	 * If the caller does not have rights to ALLOC_HARDER then subtract
	 * the high-atomic reserves. This will over-estimate the size of the
	 * atomic reserve but it avoids a search.
	 */
	if (likely(!alloc_harder))
		free_pages -= z->nr_reserved_highatomic;
	else
		min -= min / 4;
#ifdef CONFIG_CMA
	/* If allocation can't use CMA areas don't use free CMA pages */
	if (!(alloc_flags & ALLOC_CMA))
		free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif

	/*
	 * Check watermarks for an order-0 allocation request. If these
	 * are not met, then a high-order request also cannot go ahead
	 * even if a suitable page happened to be free.
	 */
	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
		return false;

	/* If this is an order-0 request then the watermark is fine */
	if (!order)
		return true;

	/* For a high-order request, check at least one suitable page is free */
	for (o = order; o < MAX_ORDER; o++) {
		struct free_area *area = &z->free_area[o];
		int mt;

		if (!area->nr_free)
			continue;

		if (alloc_harder)
			return true;

		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
			if (!list_empty(&area->free_list[mt]))
				return true;
		}

#ifdef CONFIG_CMA
		if ((alloc_flags & ALLOC_CMA) &&
		    !list_empty(&area->free_list[MIGRATE_CMA])) {
			return true;
		}
#endif
	}
	return false;
}

bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
		      int classzone_idx, int alloc_flags)
{
	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
					zone_page_state(z, NR_FREE_PAGES));
}
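
/*
 * Worked example (illustrative numbers): for an order-2 request against a
 * low watermark of 128 pages with ALLOC_HIGH set, __zone_watermark_ok()
 * first discounts free_pages by (1 << 2) - 1 = 3 and halves min to 64; the
 * request only proceeds to the per-order free_area scan if
 * free_pages - 3 > 64 + lowmem_reserve[classzone_idx].
 */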

bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx)
{
	long free_pages = zone_page_state(z, NR_FREE_PAGES);

	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);

	return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
								free_pages);
}

#ifdef CONFIG_NUMA
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
	return local_zone->node == zone->node;
}

static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
				RECLAIM_DISTANCE;
}
#else	/* CONFIG_NUMA */
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
	return true;
}

static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
	return true;
}
#endif	/* CONFIG_NUMA */

static void reset_alloc_batches(struct zone *preferred_zone)
{
	struct zone *zone = preferred_zone->zone_pgdat->node_zones;

	do {
		mod_zone_page_state(zone, NR_ALLOC_BATCH,
			high_wmark_pages(zone) - low_wmark_pages(zone) -
			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
		clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
	} while (zone++ != preferred_zone);
}

/*
 * get_page_from_freelist goes through the zonelist trying to allocate
 * a page.
 */
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zonelist *zonelist = ac->zonelist;
	struct zoneref *z;
	struct page *page = NULL;
	struct zone *zone;
	int nr_fair_skipped = 0;
	bool zonelist_rescan;

zonelist_scan:
	zonelist_rescan = false;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
								ac->nodemask) {
		unsigned long mark;

		if (cpusets_enabled() &&
			(alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed(zone, gfp_mask))
				continue;
		/*
		 * Distribute pages in proportion to the individual
		 * zone size to ensure fair page aging.  The zone a
		 * page was allocated in should have no effect on the
		 * time the page has in memory before being reclaimed.
		 */
		if (alloc_flags & ALLOC_FAIR) {
			if (!zone_local(ac->preferred_zone, zone))
				break;
			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
				nr_fair_skipped++;
				continue;
			}
		}
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a zone that is within its dirty
		 * limit, such that no single zone holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the zone's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * This may look like it could increase pressure on
		 * lower zones by failing allocations in higher zones
		 * before they are full.  But the pages that do spill
		 * over are limited as the lower zones are protected
		 * by this very same mechanism.  It should not become
		 * a practical burden to them.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-zone dirty limit in the slowpath
		 * (spread_dirty_pages unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * zones are together not big enough to reach the
		 * global limit.  The proper fix for these situations
		 * will require awareness of zones in the
		 * dirty-throttling and the flusher threads.
		 */
		if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
			continue;

		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
		if (!zone_watermark_ok(zone, order, mark,
				       ac->classzone_idx, alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;

			if (zone_reclaim_mode == 0 ||
			    !zone_allows_reclaim(ac->preferred_zone, zone))
				continue;

			ret = zone_reclaim(zone, gfp_mask, order);
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac->classzone_idx, alloc_flags))
					goto try_this_zone;

				continue;
			}
		}

try_this_zone:
		page = buffered_rmqueue(ac->preferred_zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			if (prep_new_page(page, order, gfp_mask, alloc_flags))
				goto try_this_zone;

			/*
			 * If this is a high-order atomic allocation then check
			 * if the pageblock should be reserved for the future
			 */
			if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
				reserve_highatomic_pageblock(page, zone, order);

			return page;
		}
	}

	/*
	 * The first pass makes sure allocations are spread fairly within the
	 * local node.  However, the local node might have free pages left
	 * after the fairness batches are exhausted, and remote zones haven't
	 * even been considered yet.  Try once more without fairness, and
	 * include remote zones now, before entering the slowpath and waking
	 * kswapd: prefer spilling to a remote zone over swapping locally.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		alloc_flags &= ~ALLOC_FAIR;
		if (nr_fair_skipped) {
			zonelist_rescan = true;
			reset_alloc_batches(ac->preferred_zone);
		}
		if (nr_online_nodes > 1)
			zonelist_rescan = true;
	}

	if (zonelist_rescan)
		goto zonelist_scan;

	return NULL;
}

/*
 * Large machines with many possible nodes should not always dump per-node
 * meminfo in irq context.
 */
static inline bool should_suppress_show_mem(void)
{
	bool ret = false;

#if NODES_SHIFT > 8
	ret = in_interrupt();
#endif
	return ret;
}

static DEFINE_RATELIMIT_STATE(nopage_rs,
		DEFAULT_RATELIMIT_INTERVAL,
		DEFAULT_RATELIMIT_BURST);

void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
{
	unsigned int filter = SHOW_MEM_FILTER_NODES;

	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
	    debug_guardpage_minorder() > 0)
		return;

	/*
	 * This documents exceptions given to allocations in certain
	 * contexts that are allowed to allocate outside current's set
	 * of allowed nodes.
	 */
	if (!(gfp_mask & __GFP_NOMEMALLOC))
		if (test_thread_flag(TIF_MEMDIE) ||
		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
			filter &= ~SHOW_MEM_FILTER_NODES;
	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
		filter &= ~SHOW_MEM_FILTER_NODES;

	if (fmt) {
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);

		vaf.fmt = fmt;
		vaf.va = &args;

		pr_warn("%pV", &vaf);

	pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
		current->comm, order, gfp_mask, &gfp_mask);
	dump_stack();
	if (!should_suppress_show_mem())
		show_mem(filter);
}

static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
	const struct alloc_context *ac, unsigned long *did_some_progress)
{
	struct oom_control oc = {
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.gfp_mask = gfp_mask,
		.order = order,
	};
	struct page *page;

	*did_some_progress = 0;

	/*
	 * Acquire the oom lock.  If that fails, somebody else is
	 * making progress for us.
	 */
	if (!mutex_trylock(&oom_lock)) {
		*did_some_progress = 1;
		schedule_timeout_uninterruptible(1);
		return NULL;
	}
	/*
	 * Go through the zonelist yet one more time, keep very high watermark
	 * here, this is only to catch a parallel oom killing, we must fail if
	 * we're still under heavy pressure.
	 */
	page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
					ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
	if (page)
		goto out;

	if (!(gfp_mask & __GFP_NOFAIL)) {
		/* Coredumps can quickly deplete all memory reserves */
		if (current->flags & PF_DUMPCORE)
			goto out;
		/* The OOM killer will not help higher order allocs */
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			goto out;
		/* The OOM killer does not needlessly kill tasks for lowmem */
		if (ac->high_zoneidx < ZONE_NORMAL)
			goto out;
		if (pm_suspended_storage())
			goto out;
		/*
		 * XXX: GFP_NOFS allocations should rather fail than rely on
		 * other requests to make forward progress.
		 * We are in an unfortunate situation where out_of_memory cannot
		 * do much for this context but let's try it to at least get
		 * access to memory reserved if the current task is killed (see
		 * out_of_memory). Once filesystems are ready to handle allocation
		 * failures more gracefully we should just bail out here.
		 */

		/* The OOM killer may not free memory on a specific node */
		if (gfp_mask & __GFP_THISNODE)
			goto out;
	}
	/* Exhausted what can be done so it's blamo time */
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
		*did_some_progress = 1;

		if (gfp_mask & __GFP_NOFAIL) {
			page = get_page_from_freelist(gfp_mask, order,
					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
			/*
			 * fallback to ignore cpuset restriction if our nodes
			 * are depleted
			 */
			if (!page)
				page = get_page_from_freelist(gfp_mask, order,
					ALLOC_NO_WATERMARKS, ac);
		}
	}
out:
	mutex_unlock(&oom_lock);
	return page;
}

#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
		int alloc_flags, const struct alloc_context *ac,
		enum migrate_mode mode, int *contended_compaction,
		bool *deferred_compaction)
{
	unsigned long compact_result;
	struct page *page;

	if (!order)
		return NULL;

	current->flags |= PF_MEMALLOC;
	compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
						mode, contended_compaction);
	current->flags &= ~PF_MEMALLOC;
	switch (compact_result) {
	case COMPACT_DEFERRED:
		*deferred_compaction = true;
		/* fall-through */
	case COMPACT_SKIPPED:
		return NULL;
	default:
		break;
	}
	/*
	 * At least in one zone compaction wasn't deferred or skipped, so let's
	 * count a compaction stall
	 */
	count_vm_event(COMPACTSTALL);
	page = get_page_from_freelist(gfp_mask, order,
					alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
	if (page) {
		struct zone *zone = page_zone(page);
		zone->compact_blockskip_flush = false;
		compaction_defer_reset(zone, order, true);
		count_vm_event(COMPACTSUCCESS);
		return page;