Newer
Older
// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
* Author Balbir Singh <balbir@linux.vnet.ibm.com>
*
* Copyright 2007 OpenVZ SWsoft Inc
* Author: Pavel Emelianov <xemul@openvz.org>
*
* Memory thresholds
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Kernel Memory Controller
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
* Native page reclaim
* Charge lifetime sanitation
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/pagewalk.h>
Ingo Molnar
committed
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/vm_event_item.h>
KAMEZAWA Hiroyuki
committed
#include <linux/smp.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/poll.h>
#include <linux/seq_file.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
#include <linux/uaccess.h>
#include <trace/events/vmscan.h>
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
/* Active memory cgroup to use from an interrupt context */
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
Johannes Weiner
committed
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
bool cgroup_memory_noswap __read_mostly;
#define cgroup_memory_noswap 1
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
*/
struct mem_cgroup_tree_per_node {
struct rb_root rb_root;
spinlock_t lock;
};
struct mem_cgroup_tree {
struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
struct eventfd_ctx *eventfd;
};
/*
* cgroup_event represents events which userspace want to receive.
*/
* memcg which the event belongs to.
struct mem_cgroup *memcg;
/*
* eventfd to signal userspace about the event.
*/
struct eventfd_ctx *eventfd;
/*
* Each of these stored in a list by the cgroup.
*/
struct list_head list;
/*
* register_event() callback will be used to add new userspace
* waiter for changes related to this event. Use eventfd_signal()
* on eventfd to send notification to userspace.
*/
int (*register_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args);
/*
* unregister_event() callback will be called when userspace closes
* the eventfd or on cgroup removing. This callback must be set,
* if you want provide notification functionality.
*/
void (*unregister_event)(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd);
/*
* All fields below needed to unregister event when
* userspace closes eventfd.
*/
poll_table pt;
wait_queue_head_t *wqh;
wait_queue_entry_t wait;
struct work_struct remove;
};
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
#define MOVE_ANON 0x1U
#define MOVE_FILE 0x2U
#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mm_struct *mm;
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
*/
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
/* for encoding cft->private value on file */
enum res_type {
_MEM,
_MEMSWAP,
_OOM_TYPE,
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
static inline bool should_force_charge(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
}
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->vmpressure;
}
struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}
Kirill Tkhai
committed
#ifdef CONFIG_MEMCG_KMEM
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
extern spinlock_t css_set_lock;
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
/*
* At this point all allocated objects are freed, and
* objcg->nr_charged_bytes can't have an arbitrary byte value.
* However, it can be PAGE_SIZE or (x * PAGE_SIZE).
*
* The following sequence can lead to it:
* 1) CPU0: objcg == stock->cached_objcg
* 2) CPU1: we do a small allocation (e.g. 92 bytes),
* PAGE_SIZE bytes are charged
* 3) CPU1: a process from another memcg is allocating something,
* the stock if flushed,
* objcg->nr_charged_bytes = PAGE_SIZE - 92
* 5) CPU0: we do release this object,
* 92 bytes are added to stock->nr_bytes
* 6) CPU0: stock is flushed,
* 92 bytes are added to objcg->nr_charged_bytes
*
* In the result, nr_charged_bytes == PAGE_SIZE.
* This page will be uncharged in obj_cgroup_release().
*/
nr_bytes = atomic_read(&objcg->nr_charged_bytes);
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
spin_lock_irqsave(&css_set_lock, flags);
memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
__memcg_kmem_uncharge(memcg, nr_pages);
list_del(&objcg->list);
mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
}
static struct obj_cgroup *obj_cgroup_alloc(void)
{
struct obj_cgroup *objcg;
int ret;
objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
if (!objcg)
return NULL;
ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
GFP_KERNEL);
if (ret) {
kfree(objcg);
return NULL;
}
INIT_LIST_HEAD(&objcg->list);
return objcg;
}
static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
struct mem_cgroup *parent)
{
struct obj_cgroup *objcg, *iter;
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
spin_lock_irq(&css_set_lock);
/* Move active objcg to the parent's list */
xchg(&objcg->memcg, parent);
css_get(&parent->css);
list_add(&objcg->list, &parent->objcg_list);
/* Move already reparented objcgs to the parent's list */
list_for_each_entry(iter, &memcg->objcg_list, list) {
css_get(&parent->css);
xchg(&iter->memcg, parent);
css_put(&memcg->css);
}
list_splice(&memcg->objcg_list, &parent->objcg_list);
spin_unlock_irq(&css_set_lock);
percpu_ref_kill(&objcg->refcnt);
}
Roman Gushchin
committed
* This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
* The current size of the caches array is stored in memcg_nr_cache_ids. It
* will double each time we have to increase it.
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;
/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);
void memcg_get_cache_ids(void)
{
down_read(&memcg_cache_ids_sem);
}
void memcg_put_cache_ids(void)
{
up_read(&memcg_cache_ids_sem);
}
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
* the alloc/free process all the time. In a small machine, 4 kmem-limited
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
* MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
* cgrp_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}
static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
int size, int old_size)
{
struct memcg_shrinker_map *new, *old;
int nid;
lockdep_assert_held(&memcg_shrinker_map_mutex);
for_each_node(nid) {
old = rcu_dereference_protected(
mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
/* Not yet online memcg */
if (!old)
return 0;
new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
if (!new)
return -ENOMEM;
/* Set all old bits, clear all new bits */
memset(new->map, (int)0xff, old_size);
memset((void *)new->map + old_size, 0, size - old_size);
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
}
return 0;
}
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
struct mem_cgroup_per_node *pn;
struct memcg_shrinker_map *map;
int nid;
if (mem_cgroup_is_root(memcg))
return;
for_each_node(nid) {
pn = mem_cgroup_nodeinfo(memcg, nid);
map = rcu_dereference_protected(pn->shrinker_map, true);
if (map)
kvfree(map);
rcu_assign_pointer(pn->shrinker_map, NULL);
}
}
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
struct memcg_shrinker_map *map;
int nid, size, ret = 0;
if (mem_cgroup_is_root(memcg))
return 0;
mutex_lock(&memcg_shrinker_map_mutex);
size = memcg_shrinker_map_size;
for_each_node(nid) {
map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
if (!map) {
memcg_free_shrinker_maps(memcg);
ret = -ENOMEM;
break;
}
rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
}
mutex_unlock(&memcg_shrinker_map_mutex);
return ret;
}
int memcg_expand_shrinker_maps(int new_id)
{
int size, old_size, ret = 0;
struct mem_cgroup *memcg;
size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
old_size = memcg_shrinker_map_size;
if (size <= old_size)
return 0;
mutex_lock(&memcg_shrinker_map_mutex);
if (!root_mem_cgroup)
goto unlock;
for_each_mem_cgroup(memcg) {
if (mem_cgroup_is_root(memcg))
continue;
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
if (ret) {
mem_cgroup_iter_break(NULL, memcg);
}
unlock:
if (!ret)
memcg_shrinker_map_size = size;
mutex_unlock(&memcg_shrinker_map_mutex);
return ret;
}
Kirill Tkhai
committed
void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
struct memcg_shrinker_map *map;
rcu_read_lock();
map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
/* Pairs with smp mb in shrink_slab() */
smp_mb__before_atomic();
Kirill Tkhai
committed
set_bit(shrinker_id, map->map);
rcu_read_unlock();
}
}
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
*
* If memcg is bound to the default hierarchy, css of the memcg associated
* with @page is returned. The returned css remains associated with @page
* until it is released.
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
memcg = page->mem_cgroup;
Tejun Heo
committed
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
return &memcg->css;
}
/**
* page_cgroup_ino - return inode number of the memcg a page is charged to
* @page: the page
*
* Look up the closest online ancestor of the memory cgroup @page is charged to
* and return its inode number or 0 if @page is not charged to any cgroup. It
* is safe to call this function without holding a reference to @page.
*
* Note, this function is inherently racy, because there is nothing to prevent
* the cgroup inode from getting torn down and potentially reallocated a moment
* after page_cgroup_ino() returns, so it only should be used by callers that
* do not care (such as procfs interfaces).
*/
ino_t page_cgroup_ino(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long ino = 0;
rcu_read_lock();
Roman Gushchin
committed
memcg = page->mem_cgroup;
Roman Gushchin
committed
/*
* The lowest bit set means that memcg isn't a valid
* memcg pointer, but a obj_cgroups pointer.
* In this case the page is shared and doesn't belong
* to any specific memory cgroup.
*/
if ((unsigned long) memcg & 0x1UL)
memcg = NULL;
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
ino = cgroup_ino(memcg->css.cgroup);
rcu_read_unlock();
return ino;
}
static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
int nid = page_to_nid(page);
return memcg->nodeinfo[nid];
static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
return soft_limit_tree.rb_tree_per_node[nid];
static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
int nid = page_to_nid(page);
return soft_limit_tree.rb_tree_per_node[nid];
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
struct mem_cgroup_per_node *mz_node;
if (mz->on_tree)
return;
mz->usage_in_excess = new_usage_in_excess;
if (!mz->usage_in_excess)
return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_node,
if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
/*
* We can't avoid mem cgroups that are over their soft
* limit by the same amount
*/
else if (mz->usage_in_excess >= mz_node->usage_in_excess)
p = &(*p)->rb_right;
}
if (rightmost)
mctz->rb_rightmost = &mz->tree_node;
rb_link_node(&mz->tree_node, parent, p);
rb_insert_color(&mz->tree_node, &mctz->rb_root);
mz->on_tree = true;
}
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
{
if (!mz->on_tree)
return;
if (&mz->tree_node == mctz->rb_rightmost)
mctz->rb_rightmost = rb_prev(&mz->tree_node);
rb_erase(&mz->tree_node, &mctz->rb_root);
mz->on_tree = false;
}
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz)
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
Johannes Weiner
committed
__mem_cgroup_remove_exceeded(mz, mctz);
spin_unlock_irqrestore(&mctz->lock, flags);
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
excess = nr_pages - soft_limit;
return excess;
}
static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
mctz = soft_limit_tree_from_page(page);
if (!mctz)
return;
/*
* Necessary to update all ancestors when hierarchy is used.
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
mz = mem_cgroup_page_nodeinfo(memcg, page);
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
if (excess || mz->on_tree) {
unsigned long flags;
spin_lock_irqsave(&mctz->lock, flags);
/* if on-tree, remove it */
if (mz->on_tree)
Johannes Weiner
committed
__mem_cgroup_remove_exceeded(mz, mctz);
/*
* Insert again. mz->usage_in_excess will be updated.
* If excess is 0, no tree ops.
*/
Johannes Weiner
committed
__mem_cgroup_insert_exceeded(mz, mctz, excess);
spin_unlock_irqrestore(&mctz->lock, flags);
}
}
}
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
struct mem_cgroup_tree_per_node *mctz;
struct mem_cgroup_per_node *mz;
int nid;
mz = mem_cgroup_nodeinfo(memcg, nid);
mctz = soft_limit_tree_node(nid);
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
struct mem_cgroup_per_node *mz;
retry:
mz = NULL;
goto done; /* Nothing to reclaim from */
mz = rb_entry(mctz->rb_rightmost,
struct mem_cgroup_per_node, tree_node);
/*
* Remove the node now but someone else can add it back,
* we will to add it back at the end of reclaim to its correct
* position in the tree.
*/
Johannes Weiner
committed
__mem_cgroup_remove_exceeded(mz, mctz);
if (!soft_limit_excess(mz->memcg) ||
goto retry;
done:
return mz;
}
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
struct mem_cgroup_per_node *mz;
mz = __mem_cgroup_largest_soft_limit_node(mctz);
return mz;
}
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
* @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
* @val: delta to add to the counter, can be negative
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
long x, threshold = MEMCG_CHARGE_BATCH;
if (mem_cgroup_disabled())
return;
if (memcg_stat_item_in_bytes(idx))
threshold <<= PAGE_SHIFT;
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
if (unlikely(abs(x) > threshold)) {
struct mem_cgroup *mi;
/*
* Batch local counters to keep them in sync with
* the hierarchical ones.
*/
__this_cpu_add(memcg->vmstats_local->stat[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmstats[idx]);
x = 0;
}
__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
}
static struct mem_cgroup_per_node *
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
{
struct mem_cgroup *parent;
parent = parent_mem_cgroup(pn->memcg);
if (!parent)
return NULL;
return mem_cgroup_nodeinfo(parent, nid);
}
Roman Gushchin
committed
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
/* Update memcg */
__mod_memcg_state(memcg, idx, val);
Roman Gushchin
committed
/* Update lruvec */
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
if (vmstat_item_in_bytes(idx))
threshold <<= PAGE_SHIFT;
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
if (unlikely(abs(x) > threshold)) {
Roman Gushchin
committed
pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pi;
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
atomic_long_add(x, &pi->lruvec_stat[idx]);
x = 0;
}
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
Roman Gushchin
committed
/**
* __mod_lruvec_state - update lruvec memory statistics
* @lruvec: the lruvec
* @idx: the stat item
* @val: delta to add to the counter, can be negative
*
* The lruvec is the intersection of the NUMA node and a cgroup. This
* function updates the all three counters that are affected by a
* change of state at this level: per-node, per-cgroup, per-lruvec.
*/
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val)
{
/* Update node */
__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
/* Update memcg and lruvec */
if (!mem_cgroup_disabled())
__mod_memcg_lruvec_state(lruvec, idx, val);
}
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
memcg = mem_cgroup_from_obj(p);
/* Untracked pages have no memcg, no lruvec. Update only the node */
if (!memcg || memcg == root_mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
} else {
lruvec = mem_cgroup_lruvec(memcg, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
rcu_read_unlock();
}
void mod_memcg_obj_state(void *p, int idx, int val)
{
struct mem_cgroup *memcg;
rcu_read_lock();
memcg = mem_cgroup_from_obj(p);
if (memcg)
mod_memcg_state(memcg, idx, val);
rcu_read_unlock();
}
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
* @idx: the event item
* @count: the number of events that occured
*/
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
unsigned long x;
if (mem_cgroup_disabled())
return;
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;
/*
* Batch local counters to keep them in sync with
* the hierarchical ones.
*/
__this_cpu_add(memcg->vmstats_local->events[idx], x);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
atomic_long_add(x, &mi->vmevents[idx]);
x = 0;
}
__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
return atomic_long_read(&memcg->vmevents[event]);
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
long x = 0;
int cpu;
for_each_possible_cpu(cpu)
x += per_cpu(memcg->vmstats_local->events[event], cpu);
return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
int nr_pages)
KAMEZAWA Hiroyuki
committed
{
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
__count_memcg_events(memcg, PGPGOUT, 1);
nr_pages = -nr_pages; /* for event */
}
__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
KAMEZAWA Hiroyuki
committed
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
enum mem_cgroup_events_target target)
{
unsigned long val, next;
val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
case MEM_CGROUP_TARGET_THRESH:
next = val + THRESHOLDS_EVENTS_TARGET;
break;
case MEM_CGROUP_TARGET_SOFTLIMIT:
next = val + SOFTLIMIT_EVENTS_TARGET;
break;
default:
break;
}
__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
return true;
return false;
}
/*
* Check events in order.
*
*/
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
do_softlimit = mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
if (unlikely(do_softlimit))
mem_cgroup_update_tree(memcg, page);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
/*
* mm_update_next_owner() may clear mm->owner to NULL
* if it races with swapoff, page migration, etc.
* So this can be called with p == NULL.