Newer
Older
#define MAX_COMPACT_RETRIES 16
#ifdef CONFIG_COMPACTION
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
struct page *page = NULL;
unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
Mel Gorman
committed
return NULL;
psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
Vlastimil Babka
committed
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
*/
count_vm_event(COMPACTSTALL);
/* Prep a captured page if available */
if (page)
prep_new_page(page, order, gfp_mask, alloc_flags);
/* Try get a page from the freelist if available */
if (!page)
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
Vlastimil Babka
committed
if (page) {
struct zone *zone = page_zone(page);
Vlastimil Babka
committed
zone->compact_blockskip_flush = false;
compaction_defer_reset(zone, order, true);
count_vm_event(COMPACTSUCCESS);
return page;
}
Vlastimil Babka
committed
/*
* It's bad if compaction run occurs and fails. The most likely reason
* is that pages exist, but not enough to satisfy watermarks.
*/
count_vm_event(COMPACTFAIL);
Mel Gorman
committed
Vlastimil Babka
committed
cond_resched();
return NULL;
}
Vlastimil Babka
committed
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
Vlastimil Babka
committed
{
int max_retries = MAX_COMPACT_RETRIES;
int min_priority;
bool ret = false;
int retries = *compaction_retries;
enum compact_priority priority = *compact_priority;
Vlastimil Babka
committed
if (!order)
return false;
if (compaction_made_progress(compact_result))
(*compaction_retries)++;
Vlastimil Babka
committed
/*
* compaction considers all the zone as desperately out of memory
* so it doesn't really make much sense to retry except when the
* failure could be caused by insufficient priority
*/
if (compaction_failed(compact_result))
goto check_priority;
Vlastimil Babka
committed
/*
* compaction was skipped because there are not enough order-0 pages
* to work with, so we retry only if it looks like reclaim can help.
*/
if (compaction_needs_reclaim(compact_result)) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
Vlastimil Babka
committed
/*
* make sure the compaction wasn't deferred or didn't bail out early
* due to locks contention before we declare that we should give up.
* But the next retry should use a higher priority if allowed, so
* we don't just keep bailing out endlessly.
Vlastimil Babka
committed
*/
if (compaction_withdrawn(compact_result)) {
goto check_priority;
Vlastimil Babka
committed
/*
Michal Hocko
committed
* !costly requests are much more important than __GFP_RETRY_MAYFAIL
Vlastimil Babka
committed
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
* would need much more detailed feedback from compaction to
* make a better decision.
*/
if (order > PAGE_ALLOC_COSTLY_ORDER)
max_retries /= 4;
if (*compaction_retries <= max_retries) {
ret = true;
goto out;
}
Vlastimil Babka
committed
/*
* Make sure there are attempts at the highest priority if we exhausted
* all retries or failed at the lower priorities.
*/
check_priority:
min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
if (*compact_priority > min_priority) {
(*compact_priority)--;
*compaction_retries = 0;
out:
trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
return ret;
Vlastimil Babka
committed
}
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
*compact_result = COMPACT_SKIPPED;
return NULL;
}
Michal Hocko
committed
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
struct zone *zone;
struct zoneref *z;
if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
return false;
/*
* There are setups with compaction disabled which would prefer to loop
* inside the allocator rather than hit the oom killer prematurely.
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
ac->highest_zoneidx, alloc_flags))
return true;
}
Vlastimil Babka
committed
#endif /* CONFIG_COMPACTION */
static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
static bool __need_fs_reclaim(gfp_t gfp_mask)
{
gfp_mask = current_gfp_context(gfp_mask);
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
/* this guy won't enter reclaim */
return false;
/* We're only interested __GFP_FS allocations for now */
if (!(gfp_mask & __GFP_FS))
return false;
if (gfp_mask & __GFP_NOLOCKDEP)
return false;
return true;
}
void __fs_reclaim_acquire(void)
{
lock_map_acquire(&__fs_reclaim_map);
}
void __fs_reclaim_release(void)
{
lock_map_release(&__fs_reclaim_map);
}
void fs_reclaim_acquire(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
int progress;
unsigned int noreclaim_flag;
unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
cond_resched();
return progress;
}
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
return NULL;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
* Shrink them them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
drained = true;
goto retry;
}
return page;
}
static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type highest_zoneidx = ac->highest_zoneidx;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
last_pgdat = zone->zone_pgdat;
}
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
Mateusz Nosek
committed
/*
* __GFP_HIGH is assumed to be the same as ALLOC_HIGH
* and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
* to save two branches.
*/
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
Mateusz Nosek
committed
BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
Mel Gorman
committed
* set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
Mateusz Nosek
committed
alloc_flags |= (__force int)
(gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
Mel Gorman
committed
if (gfp_mask & __GFP_ATOMIC) {
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
* comment for __cpuset_node_allowed().
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
#ifdef CONFIG_CMA
if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}
static bool oom_reserves_allowed(struct task_struct *tsk)
if (!tsk_is_oom_victim(tsk))
return false;
/*
* !MMU doesn't have oom reaper so give access to memory reserves
* only to the thread with TIF_MEMDIE set
*/
if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
return false;
return true;
}
/*
* Distinguish requests which really need access to full memory
* reserves from oom victims which can live with a portion of it
*/
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
return 0;
if (gfp_mask & __GFP_MEMALLOC)
return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
return ALLOC_NO_WATERMARKS;
if (!in_interrupt()) {
if (current->flags & PF_MEMALLOC)
return ALLOC_NO_WATERMARKS;
else if (oom_reserves_allowed(current))
return ALLOC_OOM;
}
return 0;
}
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
return !!__gfp_pfmemalloc_flags(gfp_mask);
/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
*
* We give up when we either have tried MAX_RECLAIM_RETRIES in a row
* without success, or when we couldn't even meet the watermark if we
* reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
* their order will become available due to high fragmentation so
* always increment the no progress counter for them
*/
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
*no_progress_loops = 0;
else
(*no_progress_loops)++;
/*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
/* Before OOM, exhaust highatomic_reserve */
return unreserve_highatomic_pageblock(ac, true);
/*
* Keep reclaiming pages while there is a chance this will lead
* somewhere. If none of the target zones can satisfy our allocation
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
ac->highest_zoneidx, ac->nodemask) {
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
* Would the allocation succeed if we reclaimed all
* reclaimable pages?
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
* an IO to complete to slow down the reclaim and
* prevent from pre mature OOM
*/
if (!did_some_progress) {
unsigned long write_pending;
write_pending = zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
}
ret = true;
goto out;
out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
* recognize that a particular WQ is congested if the worker thread is
* looping without ever sleeping. Therefore we have to do a short sleep
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
schedule_timeout_uninterruptible(1);
else
cond_resched();
return ret;
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{
/*
* It's possible that cpuset's mems_allowed and the nodemask from
* mempolicy don't intersect. This should be normally dealt with by
* policy_nodemask(), but it's possible to race with cpuset update in
* such a way the check therein was true, and then it became false
* before we got our cpuset_mems_cookie here.
* This assumes that for all allocations, ac->nodemask can come only
* from MPOL_BIND mempolicy (whose documented semantics is to be ignored
* when it does not intersect with the cpuset restrictions) or the
* caller can deal with a violated nodemask.
*/
if (cpusets_enabled() && ac->nodemask &&
!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
ac->nodemask = NULL;
return true;
}
/*
* When updating a task's mems_allowed or mempolicy nodemask, it is
* possible to race with parallel threads in such a way that our
* allocation can fail while the mask is being updated. If we are about
* to fail, check if the cpuset changed during allocation and if so,
* retry.
*/
if (read_mems_allowed_retry(cpuset_mems_cookie))
return true;
return false;
}
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
Mel Gorman
committed
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
Vlastimil Babka
committed
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;
Mel Gorman
committed
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
Vlastimil Babka
committed
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
Vlastimil Babka
committed
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
Vlastimil Babka
committed
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes some THP page fault allocations
*/
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
* or is prohibited because it recently failed at this
* order, fail immediately unless the allocator has
* requested compaction and reclaim retry.
*
* Reclaim is
* - potentially very expensive because zones are far
* below their low watermarks or this is part of very
* bursty high order allocations,
* - not guaranteed to help because isolate_freepages()
* may not iterate over freed pages as part of its
* linear scan, and
* - unlikely to make entire pageblocks free on its
* own.
*/
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;
Mel Gorman
committed
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
Mel Gorman
committed
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
Mel Gorman
committed
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
Mel Gorman
committed
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
Mel Gorman
committed
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
Michal Hocko
committed
* __GFP_RETRY_MAYFAIL
Michal Hocko
committed
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 &&
Michal Hocko
committed
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
/* Deal with possible cpuset update races before we start OOM killing */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
Tetsuo Handa
committed
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
/* Deal with possible cpuset update races before we fail */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
if (!ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
Mel Gorman
committed
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
*alloc_flags |= ALLOC_CMA;
/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
Mel Gorman
committed
/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
}
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
finalise_ac(gfp_mask, &ac);
Mel Gorman
committed
Mel Gorman
committed
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
Mel Gorman
committed
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
Mel Gorman
committed
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
ac.nodemask = nodemask;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge_page(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
EXPORT_SYMBOL(__alloc_pages_nodemask);
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
* you need to access high mem.
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
struct page *page;
page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
__free_pages_ok(page, order);
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
}
void free_pages(unsigned long addr, unsigned int order)
__free_pages(virt_to_page((void *)addr), order);
}
}
EXPORT_SYMBOL(free_pages);
/*
* Page Fragment:
* An arbitrary-length arbitrary-offset area of memory which resides
* within a 0 or higher order page. Multiple fragments within that page
* are individually refcounted, in the page's reference counter.
*
* The page_frag functions below provide a simple allocation framework for
* page fragments. This is used by the network stack and network device
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
Alexander Duyck
committed
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
__GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
#endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
nc->va = page ? page_address(page) : NULL;
return page;
}
Alexander Duyck
committed
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
free_the_page(page, compound_order(page));
Alexander Duyck
committed
EXPORT_SYMBOL(__page_frag_cache_drain);
Alexander Duyck
committed
void *page_frag_alloc(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
int offset;
if (unlikely(!nc->va)) {
refill:
Alexander Duyck
committed
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */