Newer
Older
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
* the set_pte_at() write.
*/
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte))
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto release;
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
put_page(page);
/*
* The mmap_sem must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
Kirill A. Shutemov
committed
{
struct vm_area_struct *vma = vmf->vma;
Kirill A. Shutemov
committed
int ret;
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
Kirill A. Shutemov
committed
if (unlikely(PageHWPoison(vmf->page))) {
Kirill A. Shutemov
committed
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf->page);
put_page(vmf->page);
Kirill A. Shutemov
committed
return VM_FAULT_HWPOISON;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
Kirill A. Shutemov
committed
else
VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
Kirill A. Shutemov
committed
return ret;
}
/*
* The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
* If we check pmd_trans_unstable() first we will trip the bad_pmd() check
* inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
* returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
*/
static int pmd_devmap_trans_unstable(pmd_t *pmd)
{
return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}
static int pte_alloc_one_map(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
goto map_pte;
if (vmf->prealloc_pte) {
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
spin_unlock(vmf->ptl);
goto map_pte;
}
atomic_long_inc(&vma->vm_mm->nr_ptes);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
return VM_FAULT_OOM;
}
map_pte:
/*
* If a huge pmd materialized under us just retry later. Use
* pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
* pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
* under us and then back to pmd_none, as a result of MADV_DONTNEED
* running immediately after a huge pmd fault in a different thread of
* this mm, in turn leading to a misleading pmd_trans_huge() retval.
* All we have to ensure is that it is a regular pmd that we can walk
* with pte_offset_map() and we can do that through an atomic read in
* C, which is what pmd_trans_unstable() provides.
if (pmd_devmap_trans_unstable(vmf->pmd))
return VM_FAULT_NOPAGE;
/*
* At this point we know that our vmf->pmd points to a page of ptes
* and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
* for the duration of the fault. If a racing MADV_DONTNEED runs and
* we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
* be valid and we will re-check to make sure the vmf->pte isn't
* pte_none() under vmf->ptl protection when we return to
* alloc_set_pte().
*/
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
return 0;
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
unsigned long haddr)
{
if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
(vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
return false;
if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
return false;
return true;
}
static void deposit_prealloc_pte(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
/*
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
atomic_long_inc(&vma->vm_mm->nr_ptes);
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i, ret;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
ret = VM_FAULT_FALLBACK;
page = compound_head(page);
/*
* Archs like ppc64 need additonal space to store information
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd)))
goto out;
for (i = 0; i < HPAGE_PMD_NR; i++)
flush_icache_page(vma, page + i);
entry = mk_huge_pmd(page, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
page_add_file_rmap(page, true);
/*
* deposit and withdraw with pmd lock held
*/
if (arch_needs_pgtable_deposit())
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, haddr, vmf->pmd);
/* fault is handled */
ret = 0;
count_vm_event(THP_FILE_MAPPED);
static int do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
}
#endif
* alloc_set_pte - setup new PTE entry for given page and add reverse page
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
* @memcg: memcg to charge page (only for private mappings)
* Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
* return.
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
/* THP on COW? */
VM_BUG_ON_PAGE(memcg, page);
if (!vmf->pte) {
ret = pte_alloc_one_map(vmf);
if (ret)
}
/* Re-check under ptl */
if (unlikely(!pte_none(*vmf->pte)))
return VM_FAULT_NOPAGE;
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, vmf->address, vmf->pte);
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
* @vmf: structure describing the fault
*
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
* addition. The function returns 0 on success, VM_FAULT_ code in case of
* error.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*/
int finish_fault(struct vm_fault *vmf)
{
struct page *page;
int ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
!(vmf->vma->vm_flags & VM_SHARED))
page = vmf->cow_page;
else
page = vmf->page;
/*
* check even for read faults because we might have lost our CoWed
* page
*/
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
ret = alloc_set_pte(vmf, vmf->memcg, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
static unsigned long fault_around_bytes __read_mostly =
rounddown_pow_of_two(65536);
#ifdef CONFIG_DEBUG_FS
static int fault_around_bytes_get(void *data, u64 *val)
*val = fault_around_bytes;
return 0;
}
/*
* fault_around_pages() and fault_around_mask() expects fault_around_bytes
* rounded down to nearest page order. It's what do_fault_around() expects to
* see.
*/
static int fault_around_bytes_set(void *data, u64 val)
if (val / PAGE_SIZE > PTRS_PER_PTE)
if (val > PAGE_SIZE)
fault_around_bytes = rounddown_pow_of_two(val);
else
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
static int __init fault_around_debugfs(void)
{
void *ret;
ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
&fault_around_bytes_fops);
pr_warn("Failed to create fault_around_bytes in debugfs");
return 0;
}
late_initcall(fault_around_debugfs);
#endif
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
/*
* do_fault_around() tries to map few pages around the fault address. The hope
* is that the pages will be needed soon and this will lower the number of
* faults to handle.
*
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
* This function is called with the page table lock taken. In the split ptlock
* case the page table lock only protects only those entries which belong to
* the page table corresponding to the fault address.
*
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
* fault_around_pages() defines how many pages we'll try to map.
* do_fault_around() expects it to return a power of two less than or equal to
* PTRS_PER_PTE.
*
* The virtual address of the area that we map is naturally aligned to the
* fault_around_pages() value (and therefore to page order). This way it's
* easier to guarantee that we don't cross page table boundaries.
*/
static int do_fault_around(struct vm_fault *vmf)
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
int off, ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
vmf->address = max(address & mask, vmf->vma->vm_start);
off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
* end_pgoff is either end of page table or end of vma
* or fault_around_pages() from start_pgoff, depending what is nearest.
((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
if (pmd_none(*vmf->pmd)) {
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
vmf->address);
if (!vmf->prealloc_pte)
smp_wmb(); /* See comment in __pte_alloc() */
vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
/* Huge page is mapped? Page fault is solved */
ret = VM_FAULT_NOPAGE;
goto out;
}
/* ->map_pages() haven't done anything useful. Cold page cache? */
goto out;
/* check if the page fault is solved */
vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
if (!pte_none(*vmf->pte))
ret = VM_FAULT_NOPAGE;
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->address = address;
vmf->pte = NULL;
return ret;
static int do_read_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
int ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
ret = do_fault_around(vmf);
if (ret)
return ret;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
static int do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
int ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
&vmf->memcg, false)) {
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
if (ret & VM_FAULT_DONE_COW)
return ret;
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
ret |= finish_fault(vmf);
unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
static int do_shared_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
Kirill A. Shutemov
committed
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
* Check if the backing address space wants to know that the page is
* about to become writable
if (vma->vm_ops->page_mkwrite) {
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
ret |= finish_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
unlock_page(vmf->page);
put_page(vmf->page);
fault_dirty_shared_page(vma, vmf->page);
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
ret = VM_FAULT_SIGBUS;
else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);
else
ret = do_shared_fault(vmf);
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
pte_free(vma->vm_mm, vmf->prealloc_pte);
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
return mpol_misplaced(page, vma, addr);
}
static int do_numa_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
bool migrated = false;
pte_t pte;
Aneesh Kumar K.V
committed
bool was_writable = pte_savedwrite(vmf->orig_pte);
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*/
vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
/*
* Make it present again, Depending on how arch implementes non
* accessible ptes, some can allow access by kernel mode.
*/
pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
pte = pte_modify(pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
Mel Gorman
committed
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
* the case where a mapping is writable but the process never writes
* to it but pte_write gets cleared during protection updates and
* pte_dirty has unpredictable behaviour between PTE scan updates,
* background writeback, dirty balancing and application behaviour.
if (!pte_write(pte))
/*
* Flag if the page is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == -1) {
put_page(page);
goto out;
}
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, vma, target_nid);
} else
flags |= TNF_MIGRATE_FAIL;
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
static inline int create_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
/* COW handled on pte level: split pmd */
VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}
static int create_huge_pud(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
return VM_FAULT_FALLBACK;
if (vmf->vma->vm_ops->huge_fault)
return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
return VM_FAULT_FALLBACK;
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
* concurrent faults).
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
static int handle_pte_fault(struct vm_fault *vmf)
/*
* Leave __pte_alloc() until later: because vm_ops->fault may
* want to allocate huge page, and if we expose page table
* for an instant, it will be difficult to retract from
* concurrent faults and from rmap lookups.
*/
} else {
/* See comment in pte_alloc_one_map() */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
* mmap_sem read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
* CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
* atomic accesses. The code below just needs a consistent
* view for the ifs and we later double check anyway with the
* ptl lock held. So here a barrier will do.
*/
barrier();
pte_unmap(vmf->pte);
vmf->pte = NULL;
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, entry)))
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
pte_unmap_unlock(vmf->pte, vmf->ptl);
}
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
.address = address & PAGE_MASK,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
struct mm_struct *mm = vma->vm_mm;
p4d = p4d_alloc(mm, pgd, address);
if (!p4d)
return VM_FAULT_OOM;
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pud_t orig_pud = *vmf.pud;
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
/* NUMA case for anonymous PUDs would go here */
if (dirty && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pud_set_accessed(&vmf, orig_pud);
return 0;
}
}
}
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
Kirill A. Shutemov
committed
if (!(ret & VM_FAULT_FALLBACK))
return ret;
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf, orig_pmd);
if ((vmf.flags & FAULT_FLAG_WRITE) &&
if (!(ret & VM_FAULT_FALLBACK))
return ret;
huge_pmd_set_accessed(&vmf, orig_pmd);
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
count_memcg_event_mm(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
* We've already handled the fast-path in-line.
*/
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
p4d_t *new = p4d_alloc_one(mm, address);
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
p4d_free(mm, new);