// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/mmu_notifier.c
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
*/
#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);
#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
.name = "mmu_notifier_invalidate_range_start"
};
#endif
/*
* The mmu_notifier_subscriptions structure is allocated and installed in
* mm->notifier_subscriptions inside the mm_take_all_locks() protected
* critical section and it's released only when mm_count reaches zero
* in mmdrop().
*/
struct mmu_notifier_subscriptions {
/* all mmu notifiers registered in this mm are queued in this list */
struct hlist_head list;
/* to serialize the list modifications and hlist_unhashed */
spinlock_t lock;
unsigned long invalidate_seq;
unsigned long active_invalidate_ranges;
struct rb_root_cached itree;
wait_queue_head_t wq;
struct hlist_head deferred_list;
};
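/*
* Rough illustration of how the fields above cooperate (based only on the
* code in this file): list-based notifiers hang off 'list' under 'lock',
* while interval notifiers live in 'itree'. An invalidation bumps
* 'active_invalidate_ranges' and may flip 'invalidate_seq' odd; itree
* inserts/removes that race with it are parked on 'deferred_list' and
* applied when the last invalidation ends, at which point any waiters on
* 'wq' are woken.
*/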
/*
* This is a collision-retry read-side/write-side 'lock', a lot like a
* seqcount, however this allows multiple write-sides to hold it at
* once. Conceptually the write side is protecting the values of the PTEs in
* this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any
* writer exists.
*
* Note that the core mm creates nested invalidate_range_start()/end() regions
* within the same thread, and runs invalidate_range_start()/end() in parallel
* on multiple CPUs. This is designed to not reduce concurrency or block
* progress on the mm side.
*
* As a secondary function, holding the full write side also serves to prevent
* writers for the itree, this is an optimization to avoid extra locking
* during invalidate_range_start/end notifiers.
*
* The write side has two states, fully excluded:
* - mm->active_invalidate_ranges != 0
* - subscriptions->invalidate_seq & 1 == True (odd)
* - some range on the mm_struct is being invalidated
* - the itree is not allowed to change
*
* And partially excluded:
* - mm->active_invalidate_ranges != 0
* - subscriptions->invalidate_seq & 1 == False (even)
* - some range on the mm_struct is being invalidated
* - the itree is allowed to change
*
* Operations on notifier_subscriptions->invalidate_seq (under spinlock):
* seq |= 1 # Begin writing
* seq++ # Release the writing state
* seq & 1 # True if a writer exists
*
* The latter state avoids some expensive work on inv_end in the common case of
* no mmu_interval_notifier monitoring the VA.
*/
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
lockdep_assert_held(&subscriptions->lock);
return subscriptions->invalidate_seq & 1;
}
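/*
* Worked example of the invalidate_seq protocol above, with hypothetical
* values, assuming one interval notifier is in the itree and both ranges
* overlap it:
*
*	idle		invalidate_seq == 10 (even), no active ranges
*	inv_start A	seq |= 1 -> 11 (odd), active_invalidate_ranges == 1
*	inv_start B	seq stays 11, active_invalidate_ranges == 2
*	inv_end A	count drops to 1, seq unchanged
*	inv_end B	seq++ -> 12 (even), deferred list applied, wq woken
*/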
static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range,
unsigned long *seq)
{
struct interval_tree_node *node;
struct mmu_interval_notifier *res = NULL;
spin_lock(&subscriptions->lock);
subscriptions->active_invalidate_ranges++;
node = interval_tree_iter_first(&subscriptions->itree, range->start,
range->end - 1);
if (node) {
subscriptions->invalidate_seq |= 1;
res = container_of(node, struct mmu_interval_notifier,
interval_tree);
}
*seq = subscriptions->invalidate_seq;
spin_unlock(&subscriptions->lock);
return res;
}
static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
const struct mmu_notifier_range *range)
{
struct interval_tree_node *node;
node = interval_tree_iter_next(&interval_sub->interval_tree,
range->start, range->end - 1);
if (!node)
return NULL;
return container_of(node, struct mmu_interval_notifier, interval_tree);
}
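/*
* The two helpers above are meant to be used together to visit every
* subscription overlapping a range while the write side is held, roughly
* (sketch only, the actual caller lives elsewhere in this file):
*
*	for (sub = mn_itree_inv_start_range(subs, range, &seq); sub;
*	     sub = mn_itree_inv_next(sub, range))
*		...deliver the invalidation to 'sub' with 'seq'...
*/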
static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
struct mmu_interval_notifier *interval_sub;
struct hlist_node *next;
spin_lock(&subscriptions->lock);
if (--subscriptions->active_invalidate_ranges ||
!mn_itree_is_invalidating(subscriptions)) {
spin_unlock(&subscriptions->lock);
return;
}
/* Make invalidate_seq even */
subscriptions->invalidate_seq++;
/*
* The inv_end incorporates a deferred mechanism like rtnl_unlock().
* Adds and removes are queued until the final inv_end happens then
* they are progressed. This arrangement for tree updates is used to
* avoid using a blocking lock during invalidate_range_start.
*/
hlist_for_each_entry_safe(interval_sub, next,
&subscriptions->deferred_list,
deferred_item) {
if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
interval_tree_insert(&interval_sub->interval_tree,
&subscriptions->itree);
else
interval_tree_remove(&interval_sub->interval_tree,
&subscriptions->itree);
hlist_del(&interval_sub->deferred_item);
}
spin_unlock(&subscriptions->lock);
wake_up_all(&subscriptions->wq);
}
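/*
* To illustrate the deferred update applied above: a subscription whose
* interval_tree.rb node is still empty (RB_EMPTY_NODE) was queued for
* insertion while an invalidation was in flight; one whose node is populated
* was queued for removal. Both wait on deferred_list until the final
* mn_itree_inv_end() applies them under the spinlock and wakes the wq.
*/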
/**
* mmu_interval_read_begin - Begin a read side critical section against a VA
* range
* @interval_sub: The interval subscription
*
* mmu_interval_read_begin()/mmu_interval_read_retry() implement a
* collision-retry scheme similar to seqcount for the VA range under
* subscription. If the mm invokes invalidation during the critical section
* then mmu_interval_read_retry() will return true.
*
* This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
* require a blocking context. The critical region formed by this can sleep,
* and the required 'user_lock' can also be a sleeping lock.
*
* The caller is required to provide a 'user_lock' to serialize both teardown
* and setup.
*
* The return value should be passed to mmu_interval_read_retry().
*/
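/*
* Minimal usage sketch (hypothetical driver code, not part of this file);
* 'drv->update_lock' stands in for the 'user_lock' described above:
*
*	again:
*		seq = mmu_interval_read_begin(&drv->notifier);
*
*		...walk CPU page tables / gather PFNs, may sleep...
*
*		take drv->update_lock
*		if (mmu_interval_read_retry(&drv->notifier, seq)) {
*			drop drv->update_lock
*			goto again;
*		}
*		...program the device page tables (SPTEs)...
*		drop drv->update_lock
*/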
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
{
struct mmu_notifier_subscriptions *subscriptions =
interval_sub->mm->notifier_subscriptions;
unsigned long seq;
bool is_invalidating;
/*
* If the subscription has a different seq value under the user_lock
* than we started with then it has collided.
*
* If the subscription currently has the same seq value as the
* subscriptions seq, then it is currently between
* invalidate_start/end and is colliding.