Newer
Older
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, int cold)
struct page *page = __rmqueue(zone, order, migratetype);
/*
* Split buddy pages returned by expand() are received here
* in physical page order. The page is added to the callers and
* list and the list head then moves forward. From the callers
* perspective, the linked list is ordered by page number in
* some conditions. This is useful for IO devices that can
* merge IO requests if the physical pages are ordered
* properly.
*/
if (likely(cold == 0))
list_add(&page->lru, list);
else
list_add_tail(&page->lru, list);
set_page_private(page, migratetype);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
Christoph Lameter
committed
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*
* Note that this function must be called with the thread pinned to
* a single processor.
Christoph Lameter
committed
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
local_irq_save(flags);
if (pcp->count >= pcp->batch)
to_drain = pcp->batch;
else
to_drain = pcp->count;
free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
local_irq_restore(flags);
/*
* Drain pages of the indicated processor.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages(unsigned int cpu)
for_each_populated_zone(zone) {
struct per_cpu_pages *pcp;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
David Rientjes
committed
if (pcp->count) {
free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
}
local_irq_restore(flags);
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*/
void drain_local_pages(void *arg)
{
drain_pages(smp_processor_id());
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator
*/
void drain_all_pages(void)
{
on_each_cpu(drain_local_pages, NULL, 1);
#ifdef CONFIG_HIBERNATION
unsigned long pfn, max_zone_pfn;
unsigned long flags;
struct list_head *curr;
if (!zone->spanned_pages)
return;
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
for_each_migratetype_order(order, t) {
list_for_each(curr, &zone->free_area[order].free_list[t]) {
pfn = page_to_pfn(list_entry(curr, struct page, lru));
for (i = 0; i < (1UL << order); i++)
swsusp_set_page_free(pfn_to_page(pfn + i));
#endif /* CONFIG_PM */
void free_hot_cold_page(struct page *page, int cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
int wasMlocked = __TestClearPageMlocked(page);
migratetype = get_pageblock_migratetype(page);
set_page_private(page, migratetype);
if (unlikely(wasMlocked))
free_page_mlock(page);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
* offlined but treat RESERVE as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(migratetype == MIGRATE_ISOLATE)) {
free_one_page(zone, page, 0, migratetype);
goto out;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add_tail(&page->lru, &pcp->lists[migratetype]);
list_add(&page->lru, &pcp->lists[migratetype]);
free_pcppages_bulk(zone, pcp->batch, pcp);
out:
/*
* Free a list of 0-order pages
*/
void free_hot_cold_page_list(struct list_head *list, int cold)
{
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
trace_mm_page_free_batched(page, cold);
free_hot_cold_page(page, cold);
}
}
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
* Each sub-page must be freed individually.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
void split_page(struct page *page, unsigned int order)
{
int i;
VM_BUG_ON(PageCompound(page));
VM_BUG_ON(!page_count(page));
#ifdef CONFIG_KMEMCHECK
/*
* Split shadow pages too, because free(page[0]) would
* otherwise free the whole shadow.
*/
if (kmemcheck_page_is_tracked(page))
split_page(virt_to_page(page[0].shadow), order);
#endif
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
/*
* Similar to split_page except the page is already free. As this is only
* being used for migration, the migratetype of the block also changes.
* As this is called with interrupts disabled, the caller is responsible
* for calling arch_alloc_page() and kernel_map_page() after interrupts
* are enabled.
*
* Note: this is probably too low level an operation for use in drivers.
* Please consult with lkml before using this in your driver.
*/
int split_free_page(struct page *page)
{
unsigned int order;
unsigned long watermark;
struct zone *zone;
BUG_ON(!PageBuddy(page));
zone = page_zone(page);
order = page_order(page);
/* Obey watermarks as if the page was being allocated */
watermark = low_wmark_pages(zone) + (1 << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return 0;
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
/* Split into individual pages */
set_page_refcounted(page);
split_page(page, order);
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
}
return 1 << order;
}
/*
* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, int order, gfp_t gfp_flags,
int migratetype)
struct list_head *list;
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
migratetype, cold);
if (unlikely(list_empty(list)))
if (cold)
page = list_entry(list->prev, struct page, lru);
else
page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
if (unlikely(gfp_flags & __GFP_NOFAIL)) {
/*
* __GFP_NOFAIL is not to be used in new code.
*
* All __GFP_NOFAIL callers should be fixed so that they
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
* allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
WARN_ON_ONCE(order > 1);
page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
if (prep_new_page(page, order, gfp_flags))
failed:
local_irq_restore(flags);
return NULL;
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
#define ALLOC_WMARK_LOW WMARK_LOW
#define ALLOC_WMARK_HIGH WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
#define ALLOC_HARDER 0x10 /* try to alloc harder */
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#ifdef CONFIG_FAIL_PAGE_ALLOC
struct fault_attr attr;
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
.ignore_gfp_highmem = 1,
.min_order = 1,
};
static int __init setup_fail_page_alloc(char *str)
{
return setup_fault_attr(&fail_page_alloc.attr, str);
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
return 0;
if (gfp_mask & __GFP_NOFAIL)
return 0;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
return 0;
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
return 0;
return should_fail(&fail_page_alloc.attr, 1 << order);
}
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
static int __init fail_page_alloc_debugfs(void)
{
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
if (IS_ERR(dir))
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
&fail_page_alloc.ignore_gfp_wait))
goto fail;
if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem))
goto fail;
if (!debugfs_create_u32("min-order", mode, dir,
&fail_page_alloc.min_order))
goto fail;
return 0;
fail:
debugfs_remove_recursive(dir);
}
late_initcall(fail_page_alloc_debugfs);
#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
#else /* CONFIG_FAIL_PAGE_ALLOC */
static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return 0;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
* Return true if free pages are above 'mark'. This takes into account the order
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
free_pages -= (1 << order) + 1;
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;
if (free_pages <= min)
return false;
return true;
}
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
#ifdef CONFIG_NUMA
/*
* zlc_setup - Setup for "zonelist cache". Uses cached zone data to
* skip over zones that are not allowed by the cpuset, or that have
* been recently (in last second) found to be nearly full. See further
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
* If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
*
* If the zonelist cache is not available for this zonelist, does
* nothing and returns NULL.
*
* If the fullzones BITMAP in the zonelist cache is stale (more than
* a second since last zap'd) then we zap it out (clear its bits.)
*
* We hold off even calling zlc_setup, until after we've checked the
* first zone in the zonelist, on the theory that most allocations will
* be satisfied from that first zone, so best to examine that zone as
* quickly as we can.
*/
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
nodemask_t *allowednodes; /* zonelist_cache approximation */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return NULL;
if (time_after(jiffies, zlc->last_full_zap + HZ)) {
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
zlc->last_full_zap = jiffies;
}
allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
&cpuset_current_mems_allowed :
&node_states[N_HIGH_MEMORY];
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
return allowednodes;
}
/*
* Given 'z' scanning a zonelist, run a couple of quick checks to see
* if it is worth looking at further for free memory:
* 1) Check that the zone isn't thought to be full (doesn't have its
* bit set in the zonelist_cache fullzones BITMAP).
* 2) Check that the zones node (obtained from the zonelist_cache
* z_to_n[] mapping) is allowed in the passed in allowednodes mask.
* Return true (non-zero) if zone is worth looking at further, or
* else return false (zero) if it is not.
*
* This check -ignores- the distinction between various watermarks,
* such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
* found to be full for any variation of these watermarks, it will
* be considered full for up to one second by all requests, unless
* we are so low on memory on all allowed nodes that we are forced
* into the second scan of the zonelist.
*
* In the second scan we ignore this zonelist cache and exactly
* apply the watermarks to all zones, even it is slower to do so.
* We are low on memory in the second scan, and should leave no stone
* unturned looking for a free page.
*/
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
int n; /* node that zone *z is on */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return 1;
i = z - zonelist->_zonerefs;
n = zlc->z_to_n[i];
/* This zone is worth trying if it is allowed but not full */
return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}
/*
* Given 'z' scanning a zonelist, set the corresponding bit in
* zlc->fullzones, so that subsequent attempts to allocate a page
* from that zone don't waste time re-examining it.
*/
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
int i; /* index of *z in zonelist zones */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
i = z - zonelist->_zonerefs;
set_bit(i, zlc->fullzones);
}
/*
* clear all zones full, called after direct reclaim makes progress so that
* a zone that was recently full is not skipped over for up to a second
*/
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
struct zonelist_cache *zlc; /* cached zonelist speedup info */
zlc = zonelist->zlcache_ptr;
if (!zlc)
return;
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
return NULL;
}
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
nodemask_t *allowednodes)
{
return 1;
}
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
#endif /* CONFIG_NUMA */
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
struct zoneref *z;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
!cpuset_zone_allowed_softwall(zone, gfp_mask))
continue;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0)
goto this_zone_full;
/*
* As we may have just activated ZLC, check if the first
* eligible zone has failed zone_reclaim recently.
*/
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
David Rientjes
committed
/*
* Large machines with many possible nodes should not always dump per-node
* meminfo in irq context.
*/
static inline bool should_suppress_show_mem(void)
{
bool ret = false;
#if NODES_SHIFT > 8
ret = in_interrupt();
#endif
return ret;
}
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
static DEFINE_RATELIMIT_STATE(nopage_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
/*
* This documents exceptions given to allocations in certain
* contexts that are allowed to allocate outside current's set
* of allowed nodes.
*/
if (!(gfp_mask & __GFP_NOMEMALLOC))
if (test_thread_flag(TIF_MEMDIE) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
struct va_format vaf;
va_list args;
vaf.fmt = fmt;
vaf.va = &args;
pr_warn("%pV", &vaf);
pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
current->comm, order, gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
}
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed)
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
return 0;
/*
* In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
* means __GFP_NOFAIL, but that may not be true in other
* implementations.
*/
if (order <= PAGE_ALLOC_COSTLY_ORDER)
return 1;
/*
* For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
* specified, then we retry until we no longer reclaim any pages
* (above), or we've reclaimed an order of pages at least as
* large as the allocation's order. In both cases, if the
* allocation still fails, we stop retrying.
*/
if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
return 1;
/*
* Don't let big-order allocations loop unless the caller
* explicitly requests that.
*/
if (gfp_mask & __GFP_NOFAIL)
return 1;
return 0;
}
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
struct page *page;
/* Acquire the OOM killer lock for the zones in zonelist */
if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
schedule_timeout_uninterruptible(1);
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET,
preferred_zone, migratetype);
goto out;
if (!(gfp_mask & __GFP_NOFAIL)) {
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (high_zoneidx < ZONE_NORMAL)
goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
* The caller should handle page allocation failure by itself if
* it specifies __GFP_THISNODE.
* Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
*/
if (gfp_mask & __GFP_THISNODE)
goto out;
}
/* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order, nodemask);
out:
clear_zonelist_oom(zonelist, gfp_mask);
return page;
}
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
Mel Gorman
committed
int migratetype, unsigned long *did_some_progress,
bool sync_migration)
{
struct page *page;
Mel Gorman
committed
if (!order || compaction_deferred(preferred_zone))
return NULL;
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
Mel Gorman
committed
nodemask, sync_migration);
current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
if (page) {
Mel Gorman
committed
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
count_vm_event(COMPACTSUCCESS);
return page;
}
/*
* It's bad if compaction run occurs and fails.
* The most likely reason is that pages exist,
* but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
Mel Gorman
committed
defer_compaction(preferred_zone);
cond_resched();
}
return NULL;
}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
Mel Gorman
committed
int migratetype, unsigned long *did_some_progress,
bool sync_migration)
{
return NULL;
}
#endif /* CONFIG_COMPACTION */
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
int migratetype, unsigned long *did_some_progress)
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
current->flags &= ~PF_MEMALLOC;
cond_resched();
if (unlikely(!(*did_some_progress)))
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
if (NUMA_BUILD)
zlc_clear_zones_full(zonelist);
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists. Drain them and try again
*/
if (!page && !drained) {
drain_all_pages();
drained = true;
goto retry;
}
return page;
}
* This is called in the allocator slow-path if the allocation request is of
* sufficient urgency to ignore watermarks and take other desperate measures
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
int migratetype)
{
struct page *page;
do {