Newer
Older
zone_pcp_init(zone);
/* For bootup, initialized properly in watermark setup */
mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
Yasunori Goto
committed
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
/*
* The zone's endpoints aren't required to be MAX_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
map = memblock_virt_alloc_node_nopanic(size,
pgdat->node_id);
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
Mel Gorman
committed
reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#else
start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
nid, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
#endif
free_area_init_core(pgdat);
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
*/
void __init setup_nr_node_ids(void)
unsigned int highest;
highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
/**
* node_map_pfn_alignment - determine the maximum internode alignment
*
* This function should be called after node map is populated and sorted.
* It calculates the maximum power of two alignment which can distinguish
* all the nodes.
*
* For example, if all nodes are 1GiB and aligned to 1GiB, the return value
* would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
* nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
* shifted, 1GiB is enough and this function will indicate so.
*
* This is used to test whether pfn -> nid mapping of the chosen memory
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
* Returns the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
int last_nid = -1;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
if (!start || last_nid < 0 || last_nid == nid) {
last_nid = nid;
last_end = end;
continue;
}
/*
* Start with a mask granular enough to pin-point to the
* start pfn and tick off bits one-by-one until it becomes
* too coarse to separate the current node from the last.
*/
mask = ~((1 << __ffs(start)) - 1);
while (mask && last_end <= (start & (mask << 1)))
mask <<= 1;
/* accumulate all internode masks */
accl_mask |= mask;
}
/* convert mask to number of pages */
return ~accl_mask + 1;
}
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
unsigned long min_pfn = ULONG_MAX;
unsigned long start_pfn;
int i;
for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
return min_pfn;
}
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
* It returns the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
return find_min_pfn_for_node(MAX_NUMNODES);
}
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
Lai Jiangshan
committed
* Populate N_MEMORY for calculating usable_nodes.
static unsigned long __init early_calculate_totalpages(void)
{
unsigned long totalpages = 0;
unsigned long start_pfn, end_pfn;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
unsigned long pages = end_pfn - start_pfn;
totalpages += pages;
if (pages)
Lai Jiangshan
committed
node_set_state(nid, N_MEMORY);
/*
* Find the PFN the Movable zone begins in each node. Kernel memory
* is spread evenly between nodes as long as the nodes have enough
* memory. When they don't, some nodes will have more kernelcore than
* others
*/
static void __init find_zone_movable_pfns_for_nodes(void)
{
int i, nid;
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
Lai Jiangshan
committed
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
Lai Jiangshan
committed
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
/* Need to find movable_zone earlier when movable_node is specified. */
find_usable_zone_for_movable();
/*
* If movable_node is specified, ignore kernelcore and movablecore
* options.
*/
if (movable_node_is_enabled()) {
for_each_memblock(memory, r) {
if (!memblock_is_hotpluggable(r))
continue;
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
}
goto out2;
}
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
/*
* If kernelcore=mirror is specified, ignore movablecore option
*/
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
for_each_memblock(memory, r) {
if (memblock_is_mirror(r))
continue;
nid = r->nid;
usable_startpfn = memblock_region_memory_base_pfn(r);
if (usable_startpfn < 0x100000) {
mem_below_4gb_not_mirrored = true;
continue;
}
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
}
if (mem_below_4gb_not_mirrored)
pr_warn("This configuration results in unmirrored kernel memory.");
goto out2;
}
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
* and movablecore are specified, then the value of kernelcore
* will be used for required_kernelcore if it's greater than
* what movablecore would have allowed.
*/
if (required_movablecore) {
unsigned long corepages;
/*
* Round-up so that ZONE_MOVABLE is at least as large as what
* was requested by the user
*/
required_movablecore =
roundup(required_movablecore, MAX_ORDER_NR_PAGES);
required_movablecore = min(totalpages, required_movablecore);
corepages = totalpages - required_movablecore;
required_kernelcore = max(required_kernelcore, corepages);
}
Xishi Qiu
committed
/*
* If kernelcore was not specified or kernelcore size is larger
* than totalpages, there is no ZONE_MOVABLE.
*/
if (!required_kernelcore || required_kernelcore >= totalpages)
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
Lai Jiangshan
committed
for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
/*
* Recalculate kernelcore_node if the division per node
* now exceeds what is necessary to satisfy the requested
* amount of memory for the kernel
*/
if (required_kernelcore < kernelcore_node)
kernelcore_node = required_kernelcore / usable_nodes;
/*
* As the map is walked, we track how much memory is usable
* by the kernel using kernelcore_remaining. When it is
* 0, the rest of the node is usable by ZONE_MOVABLE
*/
kernelcore_remaining = kernelcore_node;
/* Go through each range of PFNs within this node */
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
if (start_pfn >= end_pfn)
continue;
/* Account for what is only usable for kernelcore */
if (start_pfn < usable_startpfn) {
unsigned long kernel_pages;
kernel_pages = min(end_pfn, usable_startpfn)
- start_pfn;
kernelcore_remaining -= min(kernel_pages,
kernelcore_remaining);
required_kernelcore -= min(kernel_pages,
required_kernelcore);
/* Continue if range is now fully accounted */
if (end_pfn <= usable_startpfn) {
/*
* Push zone_movable_pfn to the end so
* that if we have to rebalance
* kernelcore across nodes, we will
* not double account here
*/
zone_movable_pfn[nid] = end_pfn;
continue;
}
start_pfn = usable_startpfn;
}
/*
* The usable PFN range for ZONE_MOVABLE is from
* start_pfn->end_pfn. Calculate size_pages as the
* number of pages used as kernelcore
*/
size_pages = end_pfn - start_pfn;
if (size_pages > kernelcore_remaining)
size_pages = kernelcore_remaining;
zone_movable_pfn[nid] = start_pfn + size_pages;
/*
* Some kernelcore has been met, update counts and
* break if the kernelcore for this node has been
*/
required_kernelcore -= min(required_kernelcore,
size_pages);
kernelcore_remaining -= size_pages;
if (!kernelcore_remaining)
break;
}
}
/*
* If there is still required_kernelcore, we do another pass with one
* less node in the count. This will push zone_movable_pfn[nid] further
* along on the nodes that still have memory until kernelcore is
*/
usable_nodes--;
if (usable_nodes && required_kernelcore > usable_nodes)
goto restart;
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
Lai Jiangshan
committed
node_states[N_MEMORY] = saved_node_state;
Lai Jiangshan
committed
/* Any regular or high memory on that node ? */
static void check_for_memory(pg_data_t *pgdat, int nid)
{
enum zone_type zone_type;
Lai Jiangshan
committed
if (N_MEMORY == N_NORMAL_MEMORY)
return;
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
Lai Jiangshan
committed
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
}
}
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
pr_info("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
pr_info(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
pr_cont("[mem %#018Lx-%#018Lx]\n",
(u64)arch_zone_lowest_possible_pfn[i]
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
/* Print out the early node map */
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
Lai Jiangshan
committed
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
}
static int __init cmdline_parse_core(char *p, unsigned long *core)
{
unsigned long long coremem;
if (!p)
return -EINVAL;
coremem = memparse(p, &p);
*core = coremem >> PAGE_SHIFT;
/* Paranoid check that UL is enough for the coremem value */
WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
return 0;
}
/*
* kernelcore=size sets the amount of memory for use for allocations that
* cannot be reclaimed or migrated.
*/
static int __init cmdline_parse_kernelcore(char *p)
{
/* parse kernelcore=mirror */
if (parse_option_str(p, "mirror")) {
mirrored_kernelcore = true;
return 0;
}
return cmdline_parse_core(p, &required_kernelcore);
}
/*
* movablecore=size sets the amount of memory for use for allocations that
* can be reclaimed or migrated.
*/
static int __init cmdline_parse_movablecore(char *p)
{
return cmdline_parse_core(p, &required_movablecore);
}
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
void adjust_managed_page_count(struct page *page, long count)
{
spin_lock(&managed_page_count_lock);
page_zone(page)->managed_pages += count;
totalram_pages += count;
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages += count;
#endif
spin_unlock(&managed_page_count_lock);
}
EXPORT_SYMBOL(adjust_managed_page_count);
unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
void *pos;
unsigned long pages = 0;
start = (void *)PAGE_ALIGN((unsigned long)start);
end = (void *)((unsigned long)end & PAGE_MASK);
for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
if ((unsigned int)poison <= 0xFF)
memset(pos, poison, PAGE_SIZE);
free_reserved_page(virt_to_page(pos));
}
if (pages && s)
pr_info("Freeing %s memory: %ldK (%p - %p)\n",
s, pages << (PAGE_SHIFT - 10), start, end);
return pages;
}
EXPORT_SYMBOL(free_reserved_area);
Jiang Liu
committed
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
totalram_pages++;
page_zone(page)->managed_pages++;
Jiang Liu
committed
totalhigh_pages++;
}
#endif
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
unsigned long init_code_size, init_data_size;
physpages = get_num_physpages();
codesize = _etext - _stext;
datasize = _edata - _sdata;
rosize = __end_rodata - __start_rodata;
bss_size = __bss_stop - __bss_start;
init_data_size = __init_end - __init_begin;
init_code_size = _einittext - _sinittext;
/*
* Detect special cases and adjust section sizes accordingly:
* 1) .init.* may be embedded into .data sections
* 2) .init.text.* may be out of [__init_begin, __init_end],
* please refer to arch/tile/kernel/vmlinux.lds.S.
* 3) .rodata.* may be embedded into .text or .data sections.
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
if (start <= pos && pos < end && size > adj) \
size -= adj; \
} while (0)
adj_init_size(__init_begin, __init_end, init_data_size,
_sinittext, init_code_size);
adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
#undef adj_init_size
pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
#endif
"%s%s)\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
#endif
}
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
* The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
* smaller per-cpu batchsize.
*/
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
dma_reserve = new_dma_reserve;
}
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
lru_add_drain_cpu(cpu);
drain_pages(cpu);
/*
* Spill the event counters of the dead processor
* into the current processors event counters.
* This artificially elevates the count of the current
* processor.
*/
/*
* Zero the differential counters of the dead processor
* so that the vm statistics are consistent.
*
* This is only okay since the processor is dead and cannot
* race with what we are doing.
*/
Christoph Lameter
committed
cpu_vm_stats_fold(cpu);
}
return NOTIFY_OK;
}
void __init page_alloc_init(void)
{
hotcpu_notifier(page_alloc_cpu_notify, 0);
}
Yaowei Bai
committed
* calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
{
struct pglist_data *pgdat;
unsigned long reserve_pages = 0;
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
Mel Gorman
committed
long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
if (zone->lowmem_reserve[j] > max)
max = zone->lowmem_reserve[j];
}
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
if (max > zone->managed_pages)
max = zone->managed_pages;
zone->totalreserve_pages = max;
reserve_pages += max;
}
}
totalreserve_pages = reserve_pages;
}
/*
* setup_per_zone_lowmem_reserve - called whenever
Yaowei Bai
committed
* sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long managed_pages = zone->managed_pages;
if (sysctl_lowmem_reserve_ratio[idx] < 1)
sysctl_lowmem_reserve_ratio[idx] = 1;
lower_zone = pgdat->node_zones + idx;
lower_zone->lowmem_reserve[j] = managed_pages /
managed_pages += lower_zone->managed_pages;
/* update totalreserve_pages */
calculate_totalreserve_pages();
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas control asynch page reclaim, and so should
unsigned long min_pages;
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
/*
* If it's a lowmem zone, reserve a number of pages
zone->watermark[WMARK_MIN] = tmp;
/*
* Set the kswapd watermarks distance according to the
* scale factor in proportion to available memory, but
* ensure a minimum size on small systems.
*/
tmp = max_t(u64, tmp >> 2,
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
spin_unlock_irqrestore(&zone->lock, flags);
/* update totalreserve_pages */
calculate_totalreserve_pages();
/**
* setup_per_zone_wmarks - called when min_free_kbytes changes
* or when memory is hot-{added|removed}
*
* Ensures that the watermark[min,low,high] values for each zone are set
* correctly with respect to min_free_kbytes.
*/
void setup_per_zone_wmarks(void)
{
mutex_lock(&zonelists_mutex);
__setup_per_zone_wmarks();
mutex_unlock(&zonelists_mutex);
}
/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
*
* 16MB: 512k
* 32MB: 724k
* 64MB: 1024k
* 128MB: 1448k
* 256MB: 2048k
* 512MB: 2896k
* 1024MB: 4096k
* 2048MB: 5792k
* 4096MB: 8192k
* 8192MB: 11584k
* 16384MB: 16384k
*/
int __meminit init_per_zone_wmark_min(void)
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (new_min_free_kbytes > user_min_free_kbytes) {
min_free_kbytes = new_min_free_kbytes;
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
} else {
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
}
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
return 0;
}
core_initcall(init_per_zone_wmark_min)
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call two helper functions whenever min_free_kbytes
* changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write) {
user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write)
setup_per_zone_wmarks();
return 0;
}
Christoph Lameter
committed
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
Christoph Lameter
committed
{
struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
Christoph Lameter
committed
if (rc)
return rc;
for_each_zone(zone)
zone->min_unmapped_pages = (zone->managed_pages *
Christoph Lameter
committed
sysctl_min_unmapped_ratio) / 100;
return 0;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
for_each_zone(zone)
zone->min_slab_pages = (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}
Christoph Lameter
committed
#endif
/*
* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
* whenever sysctl_lowmem_reserve_ratio changes.
*
* The reserve ratio obviously has absolutely no relation with the
* minimum watermarks. The lowmem reserve ratio can only make sense
* if in function of the boot time zone sizes.
*/