Newer
Older
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
*compact_result = COMPACT_SKIPPED;
return NULL;
}
Michal Hocko
committed
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
struct zone *zone;
struct zoneref *z;
if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
return false;
/*
* There are setups with compaction disabled which would prefer to loop
* inside the allocator rather than hit the oom killer prematurely.
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
ac_classzone_idx(ac), alloc_flags))
return true;
}
Vlastimil Babka
committed
#endif /* CONFIG_COMPACTION */
static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
static bool __need_fs_reclaim(gfp_t gfp_mask)
{
gfp_mask = current_gfp_context(gfp_mask);
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
/* this guy won't enter reclaim */
return false;
/* We're only interested __GFP_FS allocations for now */
if (!(gfp_mask & __GFP_FS))
return false;
if (gfp_mask & __GFP_NOLOCKDEP)
return false;
return true;
}
void __fs_reclaim_acquire(void)
{
lock_map_acquire(&__fs_reclaim_map);
}
void __fs_reclaim_release(void)
{
lock_map_release(&__fs_reclaim_map);
}
void fs_reclaim_acquire(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
static int
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
int progress;
unsigned int noreclaim_flag;
unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
cond_resched();
return progress;
}
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
return NULL;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
* Shrink them them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
drained = true;
goto retry;
}
return page;
}
static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
enum zone_type high_zoneidx = ac->high_zoneidx;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
last_pgdat = zone->zone_pgdat;
}
static inline unsigned int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
Mel Gorman
committed
* set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
Mel Gorman
committed
if (gfp_mask & __GFP_ATOMIC) {
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
if (!(gfp_mask & __GFP_NOMEMALLOC))
alloc_flags |= ALLOC_HARDER;
* Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
* comment for __cpuset_node_allowed().
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
alloc_flags |= ALLOC_KSWAPD;
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
}
static bool oom_reserves_allowed(struct task_struct *tsk)
if (!tsk_is_oom_victim(tsk))
return false;
/*
* !MMU doesn't have oom reaper so give access to memory reserves
* only to the thread with TIF_MEMDIE set
*/
if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
return false;
return true;
}
/*
* Distinguish requests which really need access to full memory
* reserves from oom victims which can live with a portion of it
*/
static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
{
if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
return 0;
if (gfp_mask & __GFP_MEMALLOC)
return ALLOC_NO_WATERMARKS;
if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
return ALLOC_NO_WATERMARKS;
if (!in_interrupt()) {
if (current->flags & PF_MEMALLOC)
return ALLOC_NO_WATERMARKS;
else if (oom_reserves_allowed(current))
return ALLOC_OOM;
}
return 0;
}
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
return !!__gfp_pfmemalloc_flags(gfp_mask);
/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
*
* We give up when we either have tried MAX_RECLAIM_RETRIES in a row
* without success, or when we couldn't even meet the watermark if we
* reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
struct alloc_context *ac, int alloc_flags,
bool did_some_progress, int *no_progress_loops)
{
struct zone *zone;
struct zoneref *z;
bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
* their order will become available due to high fragmentation so
* always increment the no progress counter for them
*/
if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
*no_progress_loops = 0;
else
(*no_progress_loops)++;
/*
* Make sure we converge to OOM if we cannot make any progress
* several times in the row.
*/
if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
/* Before OOM, exhaust highatomic_reserve */
return unreserve_highatomic_pageblock(ac, true);
/*
* Keep reclaiming pages while there is a chance this will lead
* somewhere. If none of the target zones can satisfy our allocation
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
* Would the allocation succeed if we reclaimed all
* reclaimable pages?
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac_classzone_idx(ac), alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
* an IO to complete to slow down the reclaim and
* prevent from pre mature OOM
*/
if (!did_some_progress) {
unsigned long write_pending;
write_pending = zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
}
ret = true;
goto out;
out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
* recognize that a particular WQ is congested if the worker thread is
* looping without ever sleeping. Therefore we have to do a short sleep
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
schedule_timeout_uninterruptible(1);
else
cond_resched();
return ret;
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
static inline bool
check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
{
/*
* It's possible that cpuset's mems_allowed and the nodemask from
* mempolicy don't intersect. This should be normally dealt with by
* policy_nodemask(), but it's possible to race with cpuset update in
* such a way the check therein was true, and then it became false
* before we got our cpuset_mems_cookie here.
* This assumes that for all allocations, ac->nodemask can come only
* from MPOL_BIND mempolicy (whose documented semantics is to be ignored
* when it does not intersect with the cpuset restrictions) or the
* caller can deal with a violated nodemask.
*/
if (cpusets_enabled() && ac->nodemask &&
!cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
ac->nodemask = NULL;
return true;
}
/*
* When updating a task's mems_allowed or mempolicy nodemask, it is
* possible to race with parallel threads in such a way that our
* allocation can fail while the mask is being updated. If we are about
* to fail, check if the cpuset changed during allocation and if so,
* retry.
*/
if (read_mems_allowed_retry(cpuset_mems_cookie))
return true;
return false;
}
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
Mel Gorman
committed
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
Vlastimil Babka
committed
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned long did_some_progress;
enum compact_priority compact_priority;
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
int reserve_flags;
Mel Gorman
committed
/*
* We also sanity check to catch abuse of atomic reserves being used by
* callers that are not in atomic context.
*/
if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
retry_cpuset:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
* kswapd needs to be woken up, and to avoid the cost of setting up
* alloc_flags precisely. So we do that now.
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
/*
* We need to recalculate the starting point for the zonelist iterator
* because we might have used different nodemask in the fast path, or
* there was a cpuset modification and we are retrying - otherwise we
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
* The adjusted alloc_flags might result in immediate success, so try
* that first
*/
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
/*
* For costly allocations, try direct compaction first, as it's likely
Vlastimil Babka
committed
* that we have enough base pages and don't need to reclaim. For non-
* movable high-order allocations, do that as well, as compaction will
* try prevent permanent fragmentation by migrating from blocks of the
* same migratetype.
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
Vlastimil Babka
committed
if (can_direct_reclaim &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
page = __alloc_pages_direct_compact(gfp_mask, order,
alloc_flags, ac,
INIT_COMPACT_PRIORITY,
&compact_result);
if (page)
goto got_pg;
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
*/
Vlastimil Babka
committed
if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If compaction is deferred for high-order allocations,
* it is because sync compaction recently failed. If
* this is the case and the caller requested a THP
* allocation, we do not want to heavily disrupt the
* system, so we fail the allocation instead of entering
* direct reclaim.
*/
if (compact_result == COMPACT_DEFERRED)
goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
* sync compaction could be very expensive, so keep
* using async compaction.
*/
compact_priority = INIT_COMPACT_PRIORITY;
}
}
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
alloc_flags = reserve_flags;
Mel Gorman
committed
/*
* Reset the nodemask and zonelist iterators if memory policies can be
* ignored. These allocations are high priority and system rather than
* user oriented.
Mel Gorman
committed
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
Mel Gorman
committed
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
Mel Gorman
committed
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim)
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
&did_some_progress);
if (page)
goto got_pg;
/* Try direct compaction and then allocating */
page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
compact_priority, &compact_result);
if (page)
goto got_pg;
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
goto nopage;
/*
* Do not retry costly high order allocations unless they are
Michal Hocko
committed
* __GFP_RETRY_MAYFAIL
Michal Hocko
committed
if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
/*
* It doesn't make any sense to retry for the compaction if the order-0
* reclaim is not able to make any progress because the current
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
if (did_some_progress > 0 &&
Michal Hocko
committed
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
/* Deal with possible cpuset update races before we start OOM killing */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
if (page)
goto got_pg;
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
(alloc_flags == ALLOC_OOM ||
Tetsuo Handa
committed
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
/* Retry as long as the OOM killer is making progress */
if (did_some_progress) {
no_progress_loops = 0;
/* Deal with possible cpuset update races before we fail */
if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
if (gfp_mask & __GFP_NOFAIL) {
/*
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
if (WARN_ON_ONCE(!can_direct_reclaim))
goto fail;
/*
* PF_MEMALLOC request from this context is rather bizarre
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
WARN_ON_ONCE(current->flags & PF_MEMALLOC);
/*
* non failing costly orders are a hard requirement which we
* are not prepared for much so let's warn about these users
* so that we can identify them and convert them to something
* else.
*/
WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
/*
* Help non-failing allocations by giving them access to memory
* reserves but do not use ALLOC_NO_WATERMARKS because this
* could deplete whole memory reserves which would just make
* the situation worse
*/
page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
if (page)
goto got_pg;
cond_resched();
goto retry;
}
fail:
warn_alloc(gfp_mask, ac->nodemask,
"page allocation failure: order:%u", order);
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
if (!ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
fs_reclaim_acquire(gfp_mask);
fs_reclaim_release(gfp_mask);
Mel Gorman
committed
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
*alloc_flags |= ALLOC_CMA;
/* Determine whether to spread dirty pages and what the first usable zone */
static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
Mel Gorman
committed
/*
* The preferred zone is used for statistics but crucially it is
* also used as the starting point for the zonelist iterator. It
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = { };
/*
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
if (unlikely(order >= MAX_ORDER)) {
WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
}
gfp_mask &= gfp_allowed_mask;
alloc_mask = gfp_mask;
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
finalise_ac(gfp_mask, &ac);
Mel Gorman
committed
Mel Gorman
committed
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
Mel Gorman
committed
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
goto out;
* Apply scoped allocation constraints. This is mainly about GFP_NOFS
* resp. GFP_NOIO which has to be inherited for all allocation requests
* from a particular context which has been marked by
* memalloc_no{fs,io}_{save,restore}.
alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
Mel Gorman
committed
/*
* Restore the original nodemask if it was potentially replaced with
* &cpuset_current_mems_allowed to optimize the fast-path attempt.
*/
if (unlikely(ac.nodemask != nodemask))
Mel Gorman
committed
ac.nodemask = nodemask;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
EXPORT_SYMBOL(__alloc_pages_nodemask);
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
* you need to access high mem.
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
struct page *page;
page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
__free_pages_ok(page, order);
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
}
void free_pages(unsigned long addr, unsigned int order)
__free_pages(virt_to_page((void *)addr), order);
}
}
EXPORT_SYMBOL(free_pages);
/*
* Page Fragment:
* An arbitrary-length arbitrary-offset area of memory which resides
* within a 0 or higher order page. Multiple fragments within that page
* are individually refcounted, in the page's reference counter.
*
* The page_frag functions below provide a simple allocation framework for
* page fragments. This is used by the network stack and network device
* drivers to provide a backing region of memory for use as either an
* sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
*/
Alexander Duyck
committed
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp_mask)
{
struct page *page = NULL;
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
__GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
#endif
if (unlikely(!page))
page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
nc->va = page ? page_address(page) : NULL;
return page;
}
Alexander Duyck
committed
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
if (page_ref_sub_and_test(page, count))
free_the_page(page, compound_order(page));
Alexander Duyck
committed
EXPORT_SYMBOL(__page_frag_cache_drain);
Alexander Duyck
committed
void *page_frag_alloc(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
int offset;
if (unlikely(!nc->va)) {
refill:
Alexander Duyck
committed
page = __page_frag_cache_refill(nc, gfp_mask);
if (!page)
return NULL;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
offset = nc->offset - fragsz;
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
nc->pagecnt_bias--;
nc->offset = offset;
return nc->va + offset;
}
Alexander Duyck
committed
EXPORT_SYMBOL(page_frag_alloc);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
*/
Alexander Duyck
committed
void page_frag_free(void *addr)
{
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
free_the_page(page, compound_order(page));
Alexander Duyck
committed
EXPORT_SYMBOL(page_frag_free);
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
unsigned long alloc_end = addr + (PAGE_SIZE << order);
unsigned long used = addr + PAGE_ALIGN(size);
split_page(virt_to_page((void *)addr), order);
while (used < alloc_end) {
free_page(used);
used += PAGE_SIZE;
}
}
return (void *)addr;
}
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
* allocate memory in power-of-two pages.
*
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
*
* Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
* alloc_pages_exact_nid - allocate an exact number of physically-contiguous
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
*
* Return: pointer to the allocated area or %NULL in case of error.
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
}
/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
*
* Release the memory allocated by a previous call to alloc_pages_exact.
*/
void free_pages_exact(void *virt, size_t size)
{
unsigned long addr = (unsigned long)virt;
unsigned long end = addr + PAGE_ALIGN(size);
while (addr < end) {
free_page(addr);
addr += PAGE_SIZE;
}
}
EXPORT_SYMBOL(free_pages_exact);
/**
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
* nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages