Newer
Older
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
/* Print out the early node map */
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
Lai Jiangshan
committed
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
}
static int __init cmdline_parse_core(char *p, unsigned long *core)
{
unsigned long long coremem;
if (!p)
return -EINVAL;
coremem = memparse(p, &p);
*core = coremem >> PAGE_SHIFT;
/* Paranoid check that UL is enough for the coremem value */
WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
return 0;
}
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
*/
static int __init cmdline_parse_kernelcore(char *p)
{
/* parse kernelcore=mirror */
if (parse_option_str(p, "mirror")) {
mirrored_kernelcore = true;
return 0;
}
return cmdline_parse_core(p, &required_kernelcore);
}
/*
* movablecore=size sets the amount of memory for use for allocations that
* can be reclaimed or migrated.
*/
static int __init cmdline_parse_movablecore(char *p)
{
return cmdline_parse_core(p, &required_movablecore);
}
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
void adjust_managed_page_count(struct page *page, long count)
{
spin_lock(&managed_page_count_lock);
page_zone(page)->managed_pages += count;
totalram_pages += count;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages += count;
#endif
spin_unlock(&managed_page_count_lock);
}
EXPORT_SYMBOL(adjust_managed_page_count);
unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
void *pos;
unsigned long pages = 0;
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
if ((unsigned int)poison <= 0xFF)
memset(pos, poison, PAGE_SIZE);
free_reserved_page(virt_to_page(pos));
}
if (pages && s)
pr_info("Freeing %s memory: %ldK (%p - %p)\n",
s, pages << (PAGE_SHIFT - 10), start, end);
return pages;
}
EXPORT_SYMBOL(free_reserved_area);
Jiang Liu
committed
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
totalram_pages++;
page_zone(page)->managed_pages++;
Jiang Liu
committed
totalhigh_pages++;
}
#endif
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
physpages = get_num_physpages();
codesize = _etext - _stext;
datasize = _edata - _sdata;
rosize = __end_rodata - __start_rodata;
bss_size = __bss_stop - __bss_start;
init_data_size = __init_end - __init_begin;
init_code_size = _einittext - _sinittext;
/*
* Detect special cases and adjust section sizes accordingly:
* 1) .init.* may be embedded into .data sections
* 2) .init.text.* may be out of [__init_begin, __init_end],
* please refer to arch/tile/kernel/vmlinux.lds.S.
* 3) .rodata.* may be embedded into .text or .data sections.
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
if (start <= pos && pos < end && size > adj) \
size -= adj; \
} while (0)
adj_init_size(__init_begin, __init_end, init_data_size,
_sinittext, init_code_size);
adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
#undef adj_init_size
pr_info("Memory: %luK/%luK available "
"(%luK kernel code, %luK rwdata, %luK rodata, "
"%luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
", %luK highmem"
#endif
"%s%s)\n",
nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
totalcma_pages << (PAGE_SHIFT-10),
#ifdef CONFIG_HIGHMEM
totalhigh_pages << (PAGE_SHIFT-10),
#endif
str ? ", " : "", str ? str : "");
}
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
* smaller per-cpu batchsize.
*/
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
dma_reserve = new_dma_reserve;
}
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
* Spill the event counters of the dead processor
* into the current processors event counters.
* This artificially elevates the count of the current
* processor.
*/
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.
*
* This is only okay since the processor is dead and cannot
* race with what we are doing.
*/
Christoph Lameter
committed
cpu_vm_stats_fold(cpu);
}
return NOTIFY_OK;
}
void __init page_alloc_init(void)
{
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
Yaowei Bai
committed
* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
Mel Gorman
committed
long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
if (zone->lowmem_reserve[j] > max)
max = zone->lowmem_reserve[j];
}
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
if (max > zone->managed_pages)
max = zone->managed_pages;
zone->totalreserve_pages = max;
reserve_pages += max;
}
}
totalreserve_pages = reserve_pages;
}
/*
* setup_per_zone_lowmem_reserve - called whenever
Yaowei Bai
committed
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long managed_pages = zone->managed_pages;
if (sysctl_lowmem_reserve_ratio[idx] < 1)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
lower_zone->lowmem_reserve[j] = managed_pages /
managed_pages += lower_zone->managed_pages;
/* update totalreserve_pages */
calculate_totalreserve_pages();
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control asynch page reclaim, and so should
unsigned long min_pages;
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
/*
* If it's a lowmem zone, reserve a number of pages
zone->watermark[WMARK_MIN] = tmp;
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
spin_unlock_irqrestore(&zone->lock, flags);
/* update totalreserve_pages */
calculate_totalreserve_pages();
/**
* setup_per_zone_wmarks - called when min_free_kbytes changes
* or when memory is hot-{added|removed}
*
* Ensures that the watermark[min,low,high] values for each zone are set
* correctly with respect to min_free_kbytes.
*/
void setup_per_zone_wmarks(void)
{
mutex_lock(&zonelists_mutex);
__setup_per_zone_wmarks();
mutex_unlock(&zonelists_mutex);
}
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
* to be referenced again before it is swapped out.
*
* The inactive_anon ratio is the target ratio of ACTIVE_ANON to
* INACTIVE_ANON pages on this zone's LRU, maintained by the
* pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
* the anonymous pages are kept on the inactive list.
*
* total target max
* memory ratio inactive anon
* -------------------------------------
* 10MB 1 5MB
* 100MB 1 50MB
* 1GB 3 250MB
* 10GB 10 0.9GB
* 100GB 31 3GB
* 1TB 101 10GB
* 10TB 320 32GB
*/
static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
unsigned int gb, ratio;
/* Zone size in gigabytes */
gb = zone->managed_pages >> (30 - PAGE_SHIFT);
if (gb)
ratio = int_sqrt(10 * gb);
else
ratio = 1;
zone->inactive_ratio = ratio;
}
KOSAKI Motohiro
committed
static void __meminit setup_per_zone_inactive_ratio(void)
{
struct zone *zone;
for_each_zone(zone)
calculate_zone_inactive_ratio(zone);
/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
*
* 16MB: 512k
* 32MB: 724k
* 64MB: 1024k
* 128MB: 1448k
* 256MB: 2048k
* 512MB: 2896k
* 1024MB: 4096k
* 2048MB: 5792k
* 4096MB: 8192k
* 8192MB: 11584k
* 16384MB: 16384k
*/
int __meminit init_per_zone_wmark_min(void)
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (new_min_free_kbytes > user_min_free_kbytes) {
min_free_kbytes = new_min_free_kbytes;
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
} else {
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
}
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_inactive_ratio();
module_init(init_per_zone_wmark_min)
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
Christoph Lameter
committed
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
Christoph Lameter
committed
{
struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
Christoph Lameter
committed
if (rc)
return rc;
for_each_zone(zone)
zone->min_unmapped_pages = (zone->managed_pages *
Christoph Lameter
committed
sysctl_min_unmapped_ratio) / 100;
return 0;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
for_each_zone(zone)
zone->min_slab_pages = (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
Christoph Lameter
committed
#endif
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
* whenever sysctl_lowmem_reserve_ratio changes.
*
* The reserve ratio obviously has absolutely no relation with the
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
proc_dointvec_minmax(table, write, buffer, length, ppos);
setup_per_zone_lowmem_reserve();
return 0;
}
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
* pagelist can have before it gets flushed back to buddy allocator.
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_fraction;
int ret;
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_fraction = percpu_pagelist_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
if (percpu_pagelist_fraction &&
percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
percpu_pagelist_fraction = old_percpu_pagelist_fraction;
ret = -EINVAL;
goto out;
}
/* No change? */
if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
goto out;
for_each_populated_zone(zone) {
unsigned int cpu;
Cody P Schafer
committed
for_each_possible_cpu(cpu)
pageset_set_high_and_batch(zone,
per_cpu_ptr(zone->pageset, cpu));
mutex_unlock(&pcp_batch_high_lock);
}
int hashdist = HASHDIST_DEFAULT;
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
static int __init set_hashdist(char *str)
{
if (!str)
return 0;
hashdist = simple_strtoul(str, &str, 0);
return 1;
}
__setup("hashdist=", set_hashdist);
#endif
/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
* - limit is the number of hash buckets, not the total allocation size
*/
void *__init alloc_large_system_hash(const char *tablename,
unsigned long bucketsize,
unsigned long numentries,
int scale,
int flags,
unsigned int *_hash_shift,
unsigned int *_hash_mask,
unsigned long low_limit,
unsigned long high_limit)
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
/* It isn't necessary when PAGE_SIZE >= 1MB */
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
/* Make sure we've got at least a 0-order allocation.. */
if (unlikely(flags & HASH_SMALL)) {
/* Makes no sense without HASH_EARLY */
WARN_ON(!(flags & HASH_EARLY));
if (!(numentries >> *_hash_shift)) {
numentries = 1UL << *_hash_shift;
BUG_ON(!numentries);
}
} else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
numentries = roundup_pow_of_two(numentries);
/* limit allocation size to 1/16 total memory by default */
if (max == 0) {
max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
do_div(max, bucketsize);
}
max = min(max, 0x80000000ULL);
if (numentries < low_limit)
numentries = low_limit;
log2qty = ilog2(numentries);
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
else {
Eric Dumazet
committed
/*
* If bucketsize is not a power-of-two, we may free
Mel Gorman
committed
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
Eric Dumazet
committed
*/
if (get_order(size) < MAX_ORDER) {
Mel Gorman
committed
table = alloc_pages_exact(size, GFP_ATOMIC);
kmemleak_alloc(table, size, 1, GFP_ATOMIC);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
(1UL << log2qty),
ilog2(size) - PAGE_SHIFT,
size);
if (_hash_shift)
*_hash_shift = log2qty;
if (_hash_mask)
*_hash_mask = (1 << log2qty) - 1;
return table;
}
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
return __pfn_to_section(pfn)->pageblock_flags;
#else
return zone->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @end_bitidx: The last bit of interest to retrieve
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
zone = page_zone(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
bitidx += end_bitidx;
return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
}
/**
* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
* @end_bitidx: The last bit of interest
* @mask: mask of bits that the caller is interested in
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
unsigned long end_bitidx,
unsigned long mask)
{
struct zone *zone;
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
zone = page_zone(page);
bitmap = get_pageblock_bitmap(zone, pfn);
bitidx = pfn_to_bitidx(zone, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
bitidx += end_bitidx;
mask <<= (BITS_PER_LONG - bitidx - 1);
flags <<= (BITS_PER_LONG - bitidx - 1);
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
break;
word = old_word;
}
* This function checks whether pageblock includes unmovable pages or not.
* If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. It means you can't
* expect this function should be exact.
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
/*
* For avoiding noise data, lru_add_drain_all() should be called
* If ZONE_MOVABLE, the zone never contains unmovable pages
*/
if (zone_idx(zone) == ZONE_MOVABLE)
mt = get_pageblock_migratetype(page);
if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
pfn = page_to_pfn(page);
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
if (!pfn_valid_within(check))
continue;
page = pfn_to_page(check);
/*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages bacause we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
continue;
}
/*
* We can't use page_count without pin a page
* because another CPU can free compound page.
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
if (!atomic_read(&page->_count)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
}
/*
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
if (skip_hwpoisoned_pages && PageHWPoison(page))
continue;
if (!PageLRU(page))
found++;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
/*
* If the page is not RAM, page_count()should be 0.
* we don't need more check. This is an _used_ not-movable page.
*
* The problematic thing here is PG_reserved pages. PG_reserved
* is set to both of a memory hole page and a _used_ kernel
* page at boot.
*/
if (found > count)
}
bool is_pageblock_removable_nolock(struct page *page)
{
struct zone *zone;
unsigned long pfn;
/*
* We have to be careful here because we are iterating over memory
* sections which are not zone aware so we might end up outside of
* the zone but still within the section.
* We have to take care about the node as well. If the node is offline
* its NODE_DATA will be NULL - see page_zone.
if (!node_online(page_to_nid(page)))
return false;
zone = page_zone(page);
pfn = page_to_pfn(page);
if (!zone_spans_pfn(zone, pfn))
return false;
return !has_unmovable_pages(zone, page, 0, true);
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
pageblock_nr_pages) - 1);
}
static unsigned long pfn_max_align_up(unsigned long pfn)
{
return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
pageblock_nr_pages));
}
/* [start, end) must belong to a single zone. */
static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
unsigned long nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
migrate_prep();
while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
Vlastimil Babka
committed
pfn = isolate_migratepages_range(cc, pfn, end);
if (!pfn) {
ret = -EINTR;
break;
}
tries = 0;
} else if (++tries == 5) {
ret = ret < 0 ? ret : -EBUSY;
break;
}
nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
Minchan Kim
committed
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
NULL, 0, cc->mode, MR_CMA);
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
return ret;
}
return 0;
}
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
* @end: one-past-the-last PFN to allocate
* @migratetype: migratetype of the underlaying pageblocks (either
* #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
* in range must have the same migratetype and it must
* be either of the two.
*
* The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
* aligned, however it's the caller's responsibility to guarantee that
* we are the only thread that changes migrate type of pageblocks the
* pages fall in.
*
* The PFN range must belong to a single zone.
*
* Returns zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
{
unsigned long outer_start, outer_end;
unsigned int order;
int ret = 0;
struct compact_control cc = {
.nr_migratepages = 0,
.order = -1,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
};
INIT_LIST_HEAD(&cc.migratepages);
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
* work, we align the range to biggest of the two pages so
* that page allocator won't try to merge buddies from
* different pageblocks and change MIGRATE_ISOLATE to some
* other migration type.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that