Newer
Older
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1; current_order >= order;
--current_order) {
for (i = 0;; i++) {
migratetype = fallbacks[start_migratetype][i];
Mel Gorman
committed
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
area = &(zone->free_area[current_order]);
if (list_empty(&area->free_list[migratetype]))
continue;
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;
/*
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
* aggressive about taking ownership of free pages
*
* On the other hand, never change migration
* type of MIGRATE_CMA pageblocks nor move CMA
* pages on different free lists. We don't
* want unmovable pages to be allocated from
* MIGRATE_CMA areas.
if (!is_migrate_cma(migratetype) &&
(unlikely(current_order >= pageblock_order / 2) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
page_group_by_mobility_disabled)) {
int pages;
pages = move_freepages_block(zone, page,
start_migratetype);
/* Claim the whole block if over half of it is free */
Mel Gorman
committed
if (pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page,
start_migratetype);
migratetype = start_migratetype;
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
Mel Gorman
committed
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order &&
!is_migrate_cma(migratetype))
Mel Gorman
committed
change_pageblock_range(page, current_order,
start_migratetype);
expand(zone, page, order, current_order, area,
is_migrate_cma(migratetype)
? migratetype : start_migratetype);
Mel Gorman
committed
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, migratetype);
return page;
}
}
Mel Gorman
committed
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
int migratetype)
Mel Gorman
committed
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
Mel Gorman
committed
page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
* is used because __rmqueue_smallest is an inline function
* and we want just one call site
*/
if (!page) {
migratetype = MIGRATE_RESERVE;
goto retry_reserve;
}
}
Mel Gorman
committed
trace_mm_page_alloc_zone_locked(page, order, migratetype);
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
int mt = migratetype, i;
struct page *page = __rmqueue(zone, order, migratetype);
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
* list and the list head then moves forward. From the callers
* perspective, the linked list is ordered by page number in
* some conditions. This is useful for IO devices that can
* merge IO requests if the physical pages are ordered
* properly.
*/
if (likely(cold == 0))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
if (IS_ENABLED(CONFIG_CMA)) {
mt = get_pageblock_migratetype(page);
if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
mt = migratetype;
}
set_page_private(page, mt);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
Christoph Lameter
committed
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*
* Note that this function must be called with the thread pinned to
* a single processor.
Christoph Lameter
committed
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
local_irq_save(flags);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
KOSAKI Motohiro
committed
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
}
local_irq_restore(flags);
/*
* Drain pages of the indicated processor.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages(unsigned int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pages *pcp;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
David Rientjes
committed
if (pcp->count) {
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
}
local_irq_restore(flags);
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
void drain_local_pages(void *arg)
{
drain_pages(smp_processor_id());
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* Note that this code is protected against sending an IPI to an offline
* CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
* on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
* nothing keeps CPUs from showing up after we populated the cpumask and
* before the call to on_each_cpu_mask().
*/
void drain_all_pages(void)
{
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
int cpu;
struct per_cpu_pageset *pcp;
struct zone *zone;
/*
* Allocate in the BSS so we wont require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
bool has_pcps = false;
for_each_populated_zone(zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count) {
has_pcps = true;
break;
}
}
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
#ifdef CONFIG_HIBERNATION
unsigned long pfn, max_zone_pfn;
unsigned long flags;
struct list_head *curr;
if (!zone->spanned_pages)
return;
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
for_each_migratetype_order(order, t) {
list_for_each(curr, &zone->free_area[order].free_list[t]) {
pfn = page_to_pfn(list_entry(curr, struct page, lru));
for (i = 0; i < (1UL << order); i++)
swsusp_set_page_free(pfn_to_page(pfn + i));
#endif /* CONFIG_PM */
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
if (unlikely(wasMlocked))
free_page_mlock(page);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat RESERVE as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(migratetype == MIGRATE_ISOLATE)) {
free_one_page(zone, page, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add_tail(&page->lru, &pcp->lists[migratetype]);
list_add(&page->lru, &pcp->lists[migratetype]);
free_pcppages_bulk(zone, pcp->batch, pcp);
out:
/*
* Free a list of 0-order pages
*/
void free_hot_cold_page_list(struct list_head *list, int cold)
{
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
}
}
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
void split_page(struct page *page, unsigned int order)
{
int i;
VM_BUG_ON(PageCompound(page));
VM_BUG_ON(!page_count(page));
#ifdef CONFIG_KMEMCHECK
/*
* Split shadow pages too, because free(page[0]) would
* otherwise free the whole shadow.
*/
if (kmemcheck_page_is_tracked(page))
split_page(virt_to_page(page[0].shadow), order);
#endif
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
/*
* Similar to split_page except the page is already free. As this is only
* being used for migration, the migratetype of the block also changes.
* As this is called with interrupts disabled, the caller is responsible
* for calling arch_alloc_page() and kernel_map_page() after interrupts
* are enabled.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
int split_free_page(struct page *page)
{
unsigned int order;
unsigned long watermark;
struct zone *zone;
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
order = page_order(page);
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
/* Split into individual pages */
set_page_refcounted(page);
split_page(page, order);
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
}
return 1 << order;
}
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
struct list_head *list;
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
if (cold)
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
if (prep_new_page(page, order, gfp_flags))
failed:
local_irq_restore(flags);
return NULL;
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#ifdef CONFIG_FAIL_PAGE_ALLOC
struct fault_attr attr;
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
.ignore_gfp_highmem = 1,
.min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
{
return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
if (order < fail_page_alloc.min_order)
if (gfp_mask & __GFP_NOFAIL)
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
return should_fail(&fail_page_alloc.attr, 1 << order);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_page_alloc_debugfs(void)
{
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
if (IS_ERR(dir))
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
&fail_page_alloc.ignore_gfp_wait))
goto fail;
if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem))
goto fail;
if (!debugfs_create_u32("min-order", mode, dir,
&fail_page_alloc.min_order))
goto fail;
return 0;
fail:
debugfs_remove_recursive(dir);
}
late_initcall(fail_page_alloc_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#else /* CONFIG_FAIL_PAGE_ALLOC */
static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
* Return true if free pages are above 'mark'. This takes into account the order
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
free_pages -= (1 << order) - 1;
if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;
if (free_pages <= min)
return false;
return true;
}
#ifdef CONFIG_MEMORY_ISOLATION
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
if (unlikely(zone->nr_pageblock_isolate))
return zone->nr_pageblock_isolate * pageblock_nr_pages;
return 0;
}
#else
static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
{
return 0;
}
#endif
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
/*
* If the zone has MIGRATE_ISOLATE type free pages, we should consider
* it. nr_zone_isolate_freepages is never accurate so kswapd might not
* sleep although it could do so. But this is more desirable for memory
* hotplug than sleeping which can cause a livelock in the direct
* reclaim path.
*/
free_pages -= nr_zone_isolate_freepages(z);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
#ifdef CONFIG_NUMA
/*
* zlc_setup - Setup for "zonelist cache". Uses cached zone data to
* skip over zones that are not allowed by the cpuset, or that have
* been recently (in last second) found to be nearly full. See further
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
*
* If the fullzones BITMAP in the zonelist cache is stale (more than
* a second since last zap'd) then we zap it out (clear its bits.)
*
* We hold off even calling zlc_setup, until after we've checked the
* first zone in the zonelist, on the theory that most allocations will
* be satisfied from that first zone, so best to examine that zone as
* quickly as we can.
*/
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
nodemask_t *allowednodes; /* zonelist_cache approximation */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return NULL;
if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
&node_states[N_HIGH_MEMORY];
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
return allowednodes;
}
/*
* Given 'z' scanning a zonelist, run a couple of quick checks to see
* if it is worth looking at further for free memory:
* 1) Check that the zone isn't thought to be full (doesn't have its
* bit set in the zonelist_cache fullzones BITMAP).
* 2) Check that the zones node (obtained from the zonelist_cache
* z_to_n[] mapping) is allowed in the passed in allowednodes mask.
* Return true (non-zero) if zone is worth looking at further, or
* else return false (zero) if it is not.
*
* This check -ignores- the distinction between various watermarks,
* such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
* found to be full for any variation of these watermarks, it will
* be considered full for up to one second by all requests, unless
* we are so low on memory on all allowed nodes that we are forced
* into the second scan of the zonelist.
*
* In the second scan we ignore this zonelist cache and exactly
* apply the watermarks to all zones, even it is slower to do so.
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return 1;
i = z - zonelist->_zonerefs;
n = zlc->z_to_n[i];
/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}
/*
* Given 'z' scanning a zonelist, set the corresponding bit in
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
i = z - zonelist->_zonerefs;
set_bit(i, zlc->fullzones);
}
/*
* clear all zones full, called after direct reclaim makes progress so that
* a zone that was recently full is not skipped over for up to a second
*/
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
return NULL;
}
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
return 1;
}
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
#endif /* CONFIG_NUMA */
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
struct zoneref *z;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
/*
* When allocating a page cache page for writing, we
* want to get it from a zone that is within its dirty
* limit, such that no single zone holds more than its
* proportional share of globally allowed dirty pages.
* The dirty limits take into account the zone's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
* This may look like it could increase pressure on
* lower zones by failing allocations in higher zones
* before they are full. But the pages that do spill
* over are limited as the lower zones are protected
* by this very same mechanism. It should not become
* a practical burden to them.
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
* (ALLOC_WMARK_LOW unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0)
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
David Rientjes
committed
/*
* Large machines with many possible nodes should not always dump per-node
* meminfo in irq context.
*/
static inline bool should_suppress_show_mem(void)
{
bool ret = false;
#if NODES_SHIFT > 8
ret = in_interrupt();
#endif
return ret;
}
static DEFINE_RATELIMIT_STATE(nopage_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
debug_guardpage_minorder() > 0)
return;
/*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
if (test_thread_flag(TIF_MEMDIE) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
struct va_format vaf;
va_list args;
vaf.fmt = fmt;
vaf.va = &args;
pr_warn("%pV", &vaf);
pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
current->comm, order, gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
}
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long did_some_progress,
unsigned long pages_reclaimed)
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)