Newer
Older
* allocate from, they may fallback to other nodes when oom.
* Thus, it's possible for tsk to have allocated memory from
* nodes in mask.
*/
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
ret = nodes_intersects(mempolicy->v.nodes, *mask);
break;
default:
BUG();
}
out:
task_unlock(tsk);
return ret;
}
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
unsigned nid)
{
struct zonelist *zl;
struct page *page;
zl = node_zonelist(nid, gfp);
if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
* alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
* %GFP_KERNEL kernel allocations,
* %GFP_HIGHMEM highmem/user allocations,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
* @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into
* user space. Returns NULL when no page can be allocated.
*
* Should be called with the mm_sem of the vma hold.
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node)
struct mempolicy *pol;
struct page *page;
unsigned int cpuset_mems_cookie;
retry_cpuset:
pol = get_vma_policy(current, vma, addr);
cpuset_mems_cookie = get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
if (unlikely(mpol_needs_cond_ref(pol)))
__mpol_put(pol);
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
}
/**
* alloc_pages_current - Allocate pages.
*
* @gfp:
* %GFP_USER user allocation,
* %GFP_KERNEL kernel allocation,
* %GFP_HIGHMEM highmem allocation,
* %GFP_FS don't call back into a file system.
* %GFP_ATOMIC don't sleep.
* @order: Power of two of allocation size in pages. 0 is a single page.
*
* Allocate a page from the kernel page pool. When not in
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
*
* Don't call cpuset_update_task_memory_state() unless
* 1) it's ok to take cpuset_sem (can WAIT), and
* 2) allocating for current task (not interrupt).
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
Christoph Lameter
committed
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol));
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
return page;
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
if (IS_ERR(pol))
return PTR_ERR(pol);
dst->vm_policy = pol;
return 0;
}
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebinded by the other task(the task that changes
* cpuset's mems), so we needn't do rebind work for current task.
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
if (new->flags & MPOL_F_REBINDING)
mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
else
mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
if (a->mode != b->mode)
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
return !!nodes_equal(a->v.nodes, b->v.nodes);
return a->v.preferred_node == b->v.preferred_node;
}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
* They are protected by the sp->lock spinlock, which should be held
* for any accesses to the tree.
*/
/* lookup first element intersecting start-end */
/* Caller holds sp->lock */
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
struct rb_node *n = sp->root.rb_node;
while (n) {
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end)
n = n->rb_right;
else if (end <= p->start)
n = n->rb_left;
else
break;
}
if (!n)
return NULL;
for (;;) {
struct sp_node *w = NULL;
struct rb_node *prev = rb_prev(n);
if (!prev)
break;
w = rb_entry(prev, struct sp_node, nd);
if (w->end <= start)
break;
n = prev;
}
return rb_entry(n, struct sp_node, nd);
}
/* Insert a new shared policy into the list. */
/* Caller holds sp->lock */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
struct rb_node *parent = NULL;
struct sp_node *nd;
while (*p) {
parent = *p;
nd = rb_entry(parent, struct sp_node, nd);
if (new->start < nd->start)
p = &(*p)->rb_left;
else if (new->end > nd->end)
p = &(*p)->rb_right;
else
BUG();
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->mode : 0);
}
/* Find shared policy intersecting idx */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
struct mempolicy *pol = NULL;
struct sp_node *sn;
if (!sp->root.rb_node)
return NULL;
spin_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
spin_unlock(&sp->lock);
static void sp_free(struct sp_node *n)
{
mpol_put(n->policy);
kmem_cache_free(sn_cache, n);
}
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page - page to be checked
* @vma - vm area where page mapped
* @addr - virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id.
*
* Returns:
* -1 - not misplaced, page is in the right node
* node - node id where the page should be
*
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
int polnid = -1;
int ret = -1;
BUG_ON(!vma);
pol = get_vma_policy(current, vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
BUG_ON(addr >= vma->vm_end);
BUG_ON(addr < vma->vm_start);
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, vma, pgoff);
break;
case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
polnid = numa_node_id();
else
polnid = pol->v.preferred_node;
break;
case MPOL_BIND:
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
(void)first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes, &zone);
polnid = zone->node;
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
Mel Gorman
committed
if (pol->flags & MPOL_F_MORON) {
if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
Mel Gorman
committed
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
static void sp_node_init(struct sp_node *node, unsigned long start,
unsigned long end, struct mempolicy *pol)
{
node->start = start;
node->end = end;
node->policy = pol;
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
struct sp_node *n;
struct mempolicy *newpol;
n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
newpol = mpol_dup(pol);
if (IS_ERR(newpol)) {
kmem_cache_free(sn_cache, n);
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
sp_node_init(n, start, end, newpol);
return n;
}
/* Replace a policy range. */
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
restart:
spin_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
if (!n_new)
goto alloc_new;
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
spin_unlock(&sp->lock);
ret = 0;
err_out:
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
alloc_new:
spin_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
goto restart;
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
* This is called at get_inode() calls and we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
sp->root = RB_ROOT; /* empty tree == default mempolicy */
spin_lock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
goto free_scratch; /* no valid nodemask intersection */
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
goto put_new;
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
int err;
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
sz, npol ? npol->mode : -1,
npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
if (!new)
return -ENOMEM;
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
sp_free(new);
return err;
}
/* Free a backing policy store on inode delete. */
void mpol_free_shared_policy(struct shared_policy *p)
{
struct sp_node *n;
struct rb_node *next;
if (!p->root.rb_node)
return;
spin_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
spin_unlock(&p->lock);
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (nr_node_ids > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. "
"Configure with numa_balancing= or the "
"kernel.numa_balancing sysctl",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
static int __init setup_numabalancing(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
numabalancing_override = 1;
ret = 1;
} else if (!strcmp(str, "disable")) {
numabalancing_override = -1;
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse numa_balancing=\n");
return ret;
}
__setup("numa_balancing=", setup_numabalancing);
#else
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.v = { .preferred_node = nid, },
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
check_numabalancing_enable();
/* Reset policy of current process to default */
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
/*
* Parse and format mempolicy from/to strings
*/
* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
#ifdef CONFIG_TMPFS
/**
* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
* On success, returns 0, else 1
int mpol_parse_str(char *str, struct mempolicy **mpol)
struct mempolicy *new = NULL;
unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int err = 1;
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
if (!nodes_subset(nodes, node_states[N_MEMORY]))
} else
nodes_clear(nodes);
if (flags)
*flags++ = '\0'; /* terminate mode string */
for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
if (mode >= MPOL_MAX)
switch (mode) {
case MPOL_PREFERRED:
/*
* Insist on a nodelist of one node only
*/
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
}
break;
case MPOL_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
nodes = node_states[N_MEMORY];
case MPOL_LOCAL:
* Don't allow a nodelist; mpol_new() checks flags
if (nodelist)
mode = MPOL_PREFERRED;
case MPOL_DEFAULT:
/*
* Insist on a empty nodelist
*/
if (!nodelist)
err = 0;
goto out;
case MPOL_BIND:
/*
* Insist on a nodelist
*/
if (!nodelist)
goto out;
mode_flags = 0;
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
mode_flags |= MPOL_F_STATIC_NODES;
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
/*
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
if (mode != MPOL_PREFERRED)
new->v.nodes = nodes;
else if (nodelist)
new->v.preferred_node = first_node(nodes);
else
new->flags |= MPOL_F_LOCAL;
/*
* Save nodes for contextualization: this will be used to "clone"
* the mempolicy in a specific context [cpuset] at a later time.
*/
new->w.user_nodemask = nodes;
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
if (!err)
*mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */
/**
* mpol_to_str - format a mempolicy structure for printing
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
* Convert @pol into a string. If @buffer is too short, truncate the string.
* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
* longest flag, "relative", and to display at least a few node ids.
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
nodemask_t nodes = NODE_MASK_NONE;
unsigned short mode = MPOL_DEFAULT;
unsigned short flags = 0;
if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
mode = pol->mode;
flags = pol->flags;
}
switch (mode) {
case MPOL_DEFAULT:
break;
case MPOL_PREFERRED:
if (flags & MPOL_F_LOCAL)
node_set(pol->v.preferred_node, nodes);
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
WARN_ON_ONCE(1);
snprintf(p, maxlen, "unknown");
return;
p += snprintf(p, maxlen, "%s", policy_modes[mode]);
if (flags & MPOL_MODE_FLAGS) {
p += snprintf(p, buffer + maxlen - p, "=");
/*
* Currently, the only defined flags are mutually exclusive
*/
if (flags & MPOL_F_STATIC_NODES)
p += snprintf(p, buffer + maxlen - p, "static");
else if (flags & MPOL_F_RELATIVE_NODES)
p += snprintf(p, buffer + maxlen - p, "relative");
if (!nodes_empty(nodes)) {
p += snprintf(p, buffer + maxlen - p, ":");
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
}