Newer
Older
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
* Manages the free list, the system allocates free pages here.
* Note that kmalloc() lives in slab.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
*/
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
Yasunori Goto
committed
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
Mel Gorman
committed
#include <trace/events/kmem.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/sched/rt.h>
Ingo Molnar
committed
#include <linux/sched/mm.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
Steven Rostedt (VMware)
committed
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/psi.h>
Vijay Balakrishna
committed
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
#include <asm/sections.h>
#include "shuffle.h"
David Hildenbrand
committed
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
/* No special request */
#define FPI_NONE ((__force fpi_t)0)
/*
* Skip free page reporting notification for the (possibly merged) page.
* This does not hinder free page reporting from grabbing the page,
* reporting it and marking it "reported" - it only skips notifying
* the free page reporting infrastructure about a newly freed page. For
* example, used when temporarily pulling a page from a freelist and
* putting it back unmodified.
*/
#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
/*
* Place the (possibly merged) page to the tail of the freelist. Will ignore
* page shuffling (relevant code - e.g., memory onlining - is expected to
* shuffle the whole zone).
*
* Note: No code should rely on this flag for correctness - it's purely
* to allow for optimizations when handing back either fresh pages
* (memory onlining) or untouched pages (page isolation, free page
* reporting).
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
/*
* Don't poison memory with KASAN (only for the tag-based modes).
* During boot, all non-reserved memblock memory is exposed to page_alloc.
* Poisoning all that memory lengthens boot time, especially on systems with
* large amount of RAM. This flag is used to skip that poisoning.
* This is only done for the tag-based KASAN modes, as those are able to
* detect memory corruptions with the memory tags assigned by default.
* All memory allocated normally after boot gets poisoned as usual.
*/
#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
* defined in <linux/topology.h>.
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
/* work_structs for global per-cpu drains */
struct pcpu_drain {
struct zone *zone;
struct work_struct work;
};
static DEFINE_MUTEX(pcpu_drain_mutex);
static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif
/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);
atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
Alexander Potapenko
committed
EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
Alexander Potapenko
committed
EXPORT_SYMBOL(init_on_free);
Vlastimil Babka
committed
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
Alexander Potapenko
committed
/*
 * Handler registered for the "init_on_alloc" early parameter: parse @buf as
 * a boolean into _init_on_alloc_enabled_early.
 *
 * Returns whatever kstrtobool() returns.
 */
static int __init early_init_on_alloc(char *buf)
{
	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);
Vlastimil Babka
committed
static bool _init_on_free_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
Alexander Potapenko
committed
/*
 * Handler registered for the "init_on_free" early parameter: parse @buf as
 * a boolean into _init_on_free_enabled_early.
 *
 * Returns whatever kstrtobool() returns.
 */
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
* Also the migratetype set in the page does not necessarily match the pcplist
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
* other index - this ensures that it will be put on the correct CMA freelist.
*/
/* Read back the migratetype that set_pcppage_migratetype() stored in page->index. */
static inline int get_pcppage_migratetype(struct page *page)
{
return page->index;
}
/* Stash @migratetype in page->index so get_pcppage_migratetype() can return it later. */
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
page->index = migratetype;
}
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
* they should always be called with system_transition_mutex held
* (gfp_allowed_mask also should only be modified with system_transition_mutex
* held, unless the suspend/hibernate code is guaranteed not to run in parallel
* with that modification).
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
WARN_ON(!mutex_is_locked(&system_transition_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
}
void pm_restrict_gfp_mask(void)
WARN_ON(!mutex_is_locked(&system_transition_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
Mel Gorman
committed
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
bool pm_suspended_storage(void)
{
Mel Gorman
committed
if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
return false;
return true;
}
#endif /* CONFIG_PM_SLEEP */
/*
 * pageblock_order is a run-time variable only when the huge page size is
 * variable.  The conditional must end here so that the declarations that
 * follow are not tied to this option.
 */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256,
[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0,
[ZONE_MOVABLE] = 0,
Christoph Lameter
committed
};
/*
 * Printable zone names.  The initialisers are positional, so every
 * configured zone needs exactly one entry, in zone order.
 *
 * NOTE(review): the "HighMem" and "Movable" entries were absent from the
 * damaged text (an empty CONFIG_HIGHMEM conditional was left behind); they
 * are restored here to line up with the ZONE_HIGHMEM / ZONE_MOVABLE slots
 * used by sysctl_lowmem_reserve_ratio.  Confirm the exact strings.
 */
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};
/*
 * Printable names, one per migratetype slot.  The initialisers are
 * positional, so their order has to follow the numbering of the migratetype
 * constants (NOTE(review): that enum is not visible here - confirm).
 * The "CMA" and "Isolate" entries exist only when CONFIG_CMA and
 * CONFIG_MEMORY_ISOLATION, respectively, are enabled.
 */
const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
"HighAtomic",
#ifdef CONFIG_CMA
"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
"Isolate",
#endif
};
/*
 * Table of compound page destructors, indexed by the destructor id.
 * The hugetlb and transparent-hugepage entries are only present when the
 * corresponding config option is enabled.
 */
compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
int user_min_free_kbytes = -1;
Mel Gorman
committed
#ifdef CONFIG_DISCONTIGMEM
/*
* DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
* are not on separate NUMA nodes. Functionally this works but with
* watermark_boost_factor, it can reclaim prematurely as the ranges can be
* quite small. By default, do not boost watermarks on discontigmem as in
* many cases very high-order allocations like THP are likely to be
* unsupported and the premature reclaim offsets the advantage of long-term
* fragmentation avoidance.
*/
int watermark_boost_factor __read_mostly;
#else
Mel Gorman
committed
int watermark_boost_factor __read_mostly = 15000;
Mel Gorman
committed
#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
Christoph Lameter
committed
EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
Mel Gorman
committed
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
* Calling kasan_free_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
*
* Assuming that there will be no reference to those newly initialized
* pages before they are ever allocated, this should have no effect on
* KASAN memory tracking as the poison will be properly inserted at page
* allocation time. The only corner case is when pages are allocated by
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
static inline void kasan_free_nondeferred_pages(struct page *page, int order,
						bool init, fpi_t fpi_flags)
{
	/* Skip poisoning entirely while the deferred_pages key is still on. */
	if (static_branch_unlikely(&deferred_pages))
		return;
	/* FPI_SKIP_KASAN_POISON is honoured only when generic KASAN is off. */
	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
			(fpi_flags & FPI_SKIP_KASAN_POISON))
		return;
	kasan_free_pages(page, order, init);
}
Mel Gorman
committed
/*
 * Returns true if the struct page for the pfn is uninitialised, i.e. the
 * pfn's node is online and the pfn is at or beyond that node's
 * first_deferred_pfn.
 */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
	int nid = early_pfn_to_nid(pfn);

	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
		return true;

	return false;
}
/*
 * Returns true when the remaining initialisation should be deferred until
 * later in the boot cycle when it can be parallelised.
 *
 * @nid:     node being initialised
 * @pfn:     page frame about to be initialised
 * @end_pfn: end of the zone currently being initialised
 */
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	static unsigned long prev_end_pfn, nr_initialised;

	/*
	 * prev_end_pfn static that contains the end of previous zone
	 * No need to protect because called very early in boot before smp_init.
	 */
	if (prev_end_pfn != end_pfn) {
		prev_end_pfn = end_pfn;
		nr_initialised = 0;
	}

	/* Always populate low zones for address-constrained allocations */
	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
		return false;

	/* A deferral point was already chosen for this node. */
	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
		return true;

	/*
	 * We start only with one section of pages, more pages are added as
	 * needed until the rest of deferred pages are initialized.
	 */
	nr_initialised++;
	if ((nr_initialised > PAGES_PER_SECTION) &&
	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
		NODE_DATA(nid)->first_deferred_pfn = pfn;
		return true;
	}

	return false;
}
#else
/*
 * Variant for builds without CONFIG_DEFERRED_STRUCT_PAGE_INIT: hand the
 * pages to kasan_free_pages() unless FPI_SKIP_KASAN_POISON was passed and
 * generic KASAN is not enabled.
 */
static inline void kasan_free_nondeferred_pages(struct page *page, int order,
						bool init, fpi_t fpi_flags)
{
	if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
			(fpi_flags & FPI_SKIP_KASAN_POISON))
		return;
	kasan_free_pages(page, order, init);
}
/* Stub for builds without CONFIG_DEFERRED_STRUCT_PAGE_INIT: always reports false. */
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
Mel Gorman
committed
{
return false;
Mel Gorman
committed
}
#endif
/*
 * Return a pointer to the bitmap storing bits affecting a block of pages.
 * With CONFIG_SPARSEMEM the bitmap is obtained from the memory section that
 * @pfn falls into; otherwise it is the pageblock_flags of @page's zone.
 */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
/*
 * Translate @pfn into the index of the first bit of its pageblock's entry
 * in the bitmap returned by get_pageblock_bitmap().  The pfn is first made
 * relative to the start of its section (CONFIG_SPARSEMEM) or to the
 * pageblock-aligned start of @page's zone, then scaled to
 * NR_PAGEBLOCK_BITS bits per pageblock.
 */
static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
	pfn &= (PAGES_PER_SECTION-1);
#else
	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/*
 * Read the pageblock flag bits of the block containing @pfn.
 *
 * The bitmap word holding the entry is loaded, shifted down so that the
 * entry starts at bit 0, and ANDed with @mask (which is therefore expressed
 * relative to bit 0 of the entry).
 */
static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap;
	unsigned long bitidx, word_bitidx;
	unsigned long word;

	bitmap = get_pageblock_bitmap(page, pfn);
	bitidx = pfn_to_bitidx(page, pfn);
	word_bitidx = bitidx / BITS_PER_LONG;
	bitidx &= (BITS_PER_LONG-1);

	word = bitmap[word_bitidx];
	return (word >> bitidx) & mask;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
					unsigned long mask)
{
	/* Out-of-line wrapper around the always-inline helper. */
	return __get_pfnblock_flags_mask(page, pfn, mask);
}
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
Wei Yang
committed
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @flags: The flags to set
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * The bits selected by @mask are replaced with @flags in the bitmap word of
 * the pageblock, using a cmpxchg() retry loop so the other bits of that word
 * are left as found.
 */
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
					unsigned long pfn,
					unsigned long mask)
{
	unsigned long *bitmap, *slot;
	unsigned long bit, shift;
	unsigned long field_mask, field_val;
	unsigned long expected, seen;

	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));

	/* Locate the word, and the bit offset inside it, of this pageblock. */
	bitmap = get_pageblock_bitmap(page, pfn);
	bit = pfn_to_bitidx(page, pfn);
	slot = &bitmap[bit / BITS_PER_LONG];
	shift = bit & (BITS_PER_LONG-1);

	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);

	field_mask = mask << shift;
	field_val = flags << shift;

	/* Retry with the freshly observed word until the exchange succeeds. */
	expected = READ_ONCE(*slot);
	while ((seen = cmpxchg(slot, expected,
			       (expected & ~field_mask) | field_val)) != expected)
		expected = seen;
}
Mel Gorman
committed
void set_pageblock_migratetype(struct page *page, int migratetype)
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pfnblock_flags_mask(page, (unsigned long)migratetype,
Wei Yang
committed
page_to_pfn(page), MIGRATETYPE_MASK);
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
int ret = 0;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
Cody P Schafer
committed
unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
Cody P Schafer
committed
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
if (!zone_spans_pfn(zone, pfn))
ret = 1;
} while (zone_span_seqretry(zone, seq));
Cody P Schafer
committed
if (ret)
pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
pfn, zone_to_nid(zone), zone->name,
start_pfn, start_pfn + sp);
Cody P Schafer
committed
}
static int page_is_consistent(struct zone *zone, struct page *page)
{
if (!pfn_valid_within(page_to_pfn(page)))
return 0;
return 1;
}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
Matthias Kaehlcke
committed
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
if (!page_is_consistent(zone, page))
return 1;
Matthias Kaehlcke
committed
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
static void bad_page(struct page *page, const char *reason)
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
goto out;
}
if (nr_unshown) {
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
print_modules();
/* Leave bad fields for debug, except PageBuddy could make trouble */
page_mapcount_reset(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page" and have PG_head set.
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
* The first tail page's ->compound_dtor holds the offset in array of compound
* page destructors. See compound_page_dtors.
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
void free_compound_page(struct page *page)
mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page), FPI_NONE);
/*
 * Turn the 2^@order pages starting at @page into a compound page: mark the
 * head, tag every tail page, then record destructor, order, mapcount and
 * (where available) pincount on the head.
 *
 * NOTE(review): the tail loop only sets ->mapping.  Nothing visible here
 * links a tail page to its head or resets its refcount; such statements may
 * be missing from this text - confirm against the canonical source.
 */
void prep_compound_page(struct page *page, unsigned int order)
{
	int i;
	int nr_pages = 1 << order;

	__SetPageHead(page);
	for (i = 1; i < nr_pages; i++) {
		struct page *p = page + i;

		p->mapping = TAIL_MAPPING;
	}

	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	atomic_set(compound_mapcount_ptr(page), -1);
	if (hpage_pincount_available(page))
		atomic_set(compound_pincount_ptr(page), 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
/*
 * Handler for the "debug_pagealloc" early parameter: parse @buf as a boolean
 * into _debug_pagealloc_enabled_early and return kstrtobool()'s result.
 */
static int __init early_debug_pagealloc(char *buf)
{
return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
/*
 * Handler for the "debug_guardpage_minorder" early parameter.
 *
 * @buf must be a decimal number no larger than MAX_ORDER / 2; anything else
 * is rejected with an error message and the current value is left untouched.
 * Returns 0 in both cases.
 */
static int __init debug_guardpage_minorder_setup(char *buf)
{
	unsigned long res;

	if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
		pr_err("Bad debug_guardpage_minorder value\n");
		return 0;
	}
	_debug_guardpage_minorder = res;
	pr_info("Setting debug_guardpage_minorder to %lu\n", res);
	return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
__SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
/*
 * Undo set_page_guard(): clear the guard marking and page_private, and give
 * the 2^@order pages back to the zone's free-page accounting unless
 * @migratetype is an isolate type.  No-op when guard pages are disabled.
 */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);

	set_page_private(page, 0);
	if (!is_migrate_isolate(migratetype))
		__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
Vlastimil Babka
committed
/*
* Enable static keys related to various memory debugging and hardening options.
* Some override others, and depend on early params that are evaluated in the
* order of appearance. So we need to first gather the full picture of what was
* enabled, and then make decisions.
*/
void init_mem_debugging_and_hardening(void)
{
bool page_poisoning_requested = false;
#ifdef CONFIG_PAGE_POISONING
/*
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
if (page_poisoning_enabled() ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled())) {
static_branch_enable(&_page_poisoning_enabled);
page_poisoning_requested = true;
}
#endif
Vlastimil Babka
committed
if (_init_on_alloc_enabled_early) {
if (page_poisoning_requested)
Vlastimil Babka
committed
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
if (page_poisoning_requested)
Vlastimil Babka
committed
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
static_branch_enable(&_debug_pagealloc_enabled);
if (!debug_guardpage_minorder())
return;
static_branch_enable(&_debug_guardpage_enabled);
#endif
}
/*
 * Mark @page as a free buddy page of the given @order.
 *
 * NOTE(review): the statement storing @order was absent from the damaged
 * text.  It is restored as set_page_private(), matching how the rest of
 * this file records and clears a page's order - confirm.
 */
static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
* For recording whether a page is in the buddy system, we set PageBuddy.
* Setting, clearing, and testing PageBuddy is serialized by zone->lock.
* For recording page's order, we use page_private(page).
static inline bool page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
if (buddy_order(buddy) != order)
return false;
/*
* zone check is done late to avoid uselessly calculating
* zone/node ids for pages that could never merge.
*/
if (page_zone_id(page) != page_zone_id(buddy))
return false;
VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return true;
#ifdef CONFIG_COMPACTION
/*
 * Return the current task's capture_control if it may capture a page freed
 * into @zone, otherwise NULL.  Capture is possible only when the task has a
 * capture_control, is not a kernel thread, has not captured a page yet and
 * its compaction run targets @zone.
 */
static inline struct capture_control *task_capc(struct zone *zone)
{
	struct capture_control *capc = current->capture_control;

	if (!unlikely(capc))
		return NULL;
	if (current->flags & PF_KTHREAD)
		return NULL;
	if (capc->page)
		return NULL;
	if (capc->cc->zone != zone)
		return NULL;

	return capc;
}
/*
 * Hand @page straight to the compaction run described by @capc, if suitable.
 * Returns true when the page was stored in capc->page, false otherwise.
 */
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	/* Nobody is capturing. */
	if (!capc)
		return false;

	/* Only a page of exactly the requested order is of interest. */
	if (capc->cc->order != order)
		return false;

	/* Do not accidentally pollute CMA or isolated regions*/
	if (is_migrate_cma(migratetype) || is_migrate_isolate(migratetype))
		return false;

	/*
	 * Do not let lower order allocations pollute a movable pageblock.
	 * This might let an unmovable request use a reclaimable pageblock
	 * and vice-versa but no more than normal fallback logic which can
	 * have trouble finding a high-order free page.
	 */
	if (migratetype == MIGRATE_MOVABLE && order < pageblock_order)
		return false;

	capc->page = page;
	return true;
}
#else
/* Stub for builds without CONFIG_COMPACTION: there is never a capture_control. */
static inline struct capture_control *task_capc(struct zone *zone)
{
return NULL;
}
/* Stub for builds without CONFIG_COMPACTION: no page is ever captured. */
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
return false;
}
#endif /* CONFIG_COMPACTION */
/*
 * Put @page, which must not be on any other list, at the head of @zone's
 * free list for (@order, @migratetype) and count it as free.
 */
static inline void add_to_free_list(struct page *page, struct zone *zone,
				    unsigned int order, int migratetype)
{
	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}
/*
 * Put @page, which must not be on any other list, at the tail of @zone's
 * free list for (@order, @migratetype) and count it as free.
 */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
					 unsigned int order, int migratetype)
{
	list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]);
	zone->free_area[order].nr_free++;
}
/*
 * For a page that already sits on some list: re-queue it at the tail of the
 * (@order, @migratetype) free list, so it is not picked for allocation again
 * right away (e.g., optimization for memory onlining).  nr_free is left
 * unchanged.
 */
static inline void move_to_free_list(struct page *page, struct zone *zone,
				     unsigned int order, int migratetype)
{
	list_move_tail(&page->lru,
		       &zone->free_area[order].free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order)
{
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
list_del(&page->lru);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
}
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
/*
 * Predict whether the block being freed is likely to be merged again soon.
 *
 * Unless the block is already close to the largest order, look one level up:
 * if the buddy of the would-be merged (order + 1) block is itself free, more
 * coalescing is probably on its way, and the caller can queue the page at
 * the tail of the free list so that it is less likely to be reused before
 * the merge happens.
 */
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
		   struct page *page, unsigned int order)
{
	unsigned long parent_pfn, parent_buddy_pfn;
	struct page *parent, *parent_buddy;

	if (order >= MAX_ORDER - 2 || !pfn_valid_within(buddy_pfn))
		return false;

	/* The merged block starts at the lower of the two pfns. */
	parent_pfn = buddy_pfn & pfn;
	parent = page + (parent_pfn - pfn);

	parent_buddy_pfn = __find_buddy_pfn(parent_pfn, order + 1);
	parent_buddy = parent + (parent_buddy_pfn - parent_pfn);

	if (!pfn_valid_within(parent_buddy_pfn))
		return false;

	return page_is_buddy(parent, parent_buddy, order + 1);
}
/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table