Skip to content
Snippets Groups Projects
mmu_notifier.c 34.5 KiB
Newer Older
// SPDX-License-Identifier: GPL-2.0-only
Andrea Arcangeli's avatar
Andrea Arcangeli committed
/*
 *  linux/mm/mmu_notifier.c
 *
 *  Copyright (C) 2008  Qumranet, Inc.
 *  Copyright (C) 2008  SGI
 *             Christoph Lameter <cl@linux.com>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/interval_tree.h>
Andrea Arcangeli's avatar
Andrea Arcangeli committed
#include <linux/rcupdate.h>
#include <linux/sched.h>
DEFINE_STATIC_SRCU(srcu);
#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
	.name = "mmu_notifier_invalidate_range_start"
};
#endif

 * The mmu_notifier_subscriptions structure is allocated and installed in
 * mm->notifier_subscriptions inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
	/* all mmu notifiers registered in this mm are queued in this list */
	struct hlist_head list;
	/* to serialize the list modifications and hlist_unhashed */
	spinlock_t lock;
	unsigned long invalidate_seq;
	unsigned long active_invalidate_ranges;
	struct rb_root_cached itree;
	wait_queue_head_t wq;
	struct hlist_head deferred_list;
/*
 * This is a collision-retry read-side/write-side 'lock', a lot like a
 * seqcount, however this allows multiple write-sides to hold it at
 * once. Conceptually the write side is protecting the values of the PTEs in
 * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
 * writer exists.
 *
 * Note that the core mm creates nested invalidate_range_start()/end() regions
 * within the same thread, and runs invalidate_range_start()/end() in parallel
 * on multiple CPUs. This is designed to not reduce concurrency or block
 * progress on the mm side.
 *
 * As a secondary function, holding the full write side also serves to prevent
 * writers for the itree, this is an optimization to avoid extra locking
 * during invalidate_range_start/end notifiers.
 *
 * The write side has two states, fully excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == True (odd)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is not allowed to change
 *
 * And partially excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == False (even)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is allowed to change
 *
 * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
 *    seq |= 1  # Begin writing
 *    seq++     # Release the writing state
 *    seq & 1   # True if a writer exists
 *
 * The later state avoids some expensive work on inv_end in the common case of
 * no mmu_interval_notifier monitoring the VA.
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
	lockdep_assert_held(&subscriptions->lock);
	return subscriptions->invalidate_seq & 1;
}

static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
			 const struct mmu_notifier_range *range,
			 unsigned long *seq)
{
	struct interval_tree_node *node;
	struct mmu_interval_notifier *res = NULL;

	spin_lock(&subscriptions->lock);
	subscriptions->active_invalidate_ranges++;
	node = interval_tree_iter_first(&subscriptions->itree, range->start,
					range->end - 1);
	if (node) {
		res = container_of(node, struct mmu_interval_notifier,
				   interval_tree);
	}

	*seq = subscriptions->invalidate_seq;
	spin_unlock(&subscriptions->lock);
	return res;
}

static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
		  const struct mmu_notifier_range *range)
{
	struct interval_tree_node *node;

	node = interval_tree_iter_next(&interval_sub->interval_tree,
				       range->start, range->end - 1);
	if (!node)
		return NULL;
	return container_of(node, struct mmu_interval_notifier, interval_tree);
}

static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
	struct mmu_interval_notifier *interval_sub;
	struct hlist_node *next;

	spin_lock(&subscriptions->lock);
	if (--subscriptions->active_invalidate_ranges ||
	    !mn_itree_is_invalidating(subscriptions)) {
		spin_unlock(&subscriptions->lock);
		return;
	}

	/* Make invalidate_seq even */

	/*
	 * The inv_end incorporates a deferred mechanism like rtnl_unlock().
	 * Adds and removes are queued until the final inv_end happens then
	 * they are progressed. This arrangement for tree updates is used to
	 * avoid using a blocking lock during invalidate_range_start.
	 */
	hlist_for_each_entry_safe(interval_sub, next,
				  &subscriptions->deferred_list,
		if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
			interval_tree_insert(&interval_sub->interval_tree,
			interval_tree_remove(&interval_sub->interval_tree,
}

/**
 * mmu_interval_read_begin - Begin a read side critical section against a VA
 *                           range
 * interval_sub: The interval subscription
 *
 * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
 * collision-retry scheme similar to seqcount for the VA range under
 * subscription. If the mm invokes invalidation during the critical section
 * then mmu_interval_read_retry() will return true.
 *
 * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
 * require a blocking context.  The critical region formed by this can sleep,
 * and the required 'user_lock' can also be a sleeping lock.
 *
 * The caller is required to provide a 'user_lock' to serialize both teardown
 * and setup.
 *
 * The return value should be passed to mmu_interval_read_retry().
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
	struct mmu_notifier_subscriptions *subscriptions =
		interval_sub->mm->notifier_subscriptions;
	unsigned long seq;
	bool is_invalidating;

	/*
	 * If the subscription has a different seq value under the user_lock
	 * than we started with then it has collided.
	 * If the subscription currently has the same seq value as the
	 * subscriptions seq, then it is currently between
	 * invalidate_start/end and is colliding.
Loading
Loading full blame...