Newer
Older
do_set_pte(vma, address, fault_page, pte, true, false);
pte_unmap_unlock(pte, ptl);
if (set_page_dirty(fault_page))
dirtied = 1;
/*
* Take a local copy of the address_space - page.mapping may be zeroed
* by truncate after unlock_page(). The address_space itself remains
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
mapping = fault_page->mapping;
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
/* file_update_time outside page_lock */
if (vma->vm_file && !vma->vm_ops->page_mkwrite)
file_update_time(vma->vm_file);
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
if (!(vma->vm_flags & VM_SHARED))
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
flags |= FAULT_FLAG_NONLINEAR;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
print_bad_pte(vma, address, orig_pte, NULL);
}
pgoff = pte_to_pgoff(orig_pte);
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
if (!(vma->vm_flags & VM_SHARED))
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
return mpol_misplaced(page, vma, addr);
}
static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
{
bool migrated = false;
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*
* ptep_modify_prot_start is not called as this is clearing
* the _PAGE_NUMA bit and it is not really expected that there
* would be concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*ptep, pte))) {
pte_unmap_unlock(ptep, ptl);
goto out;
}
pte = pte_mknonnuma(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
page = vm_normal_page(vma, addr, pte);
if (!page) {
pte_unmap_unlock(ptep, ptl);
return 0;
}
BUG_ON(is_zero_pfn(page_to_pfn(page)));
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
* in general, RO pages shouldn't hurt as much anyway since
* they can be in shared cache state.
*/
if (!pte_write(pte))
flags |= TNF_NO_GROUP;
/*
* Flag if the page is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
put_page(page);
goto out;
}
/* Migrate to the requested node */
migrated = migrate_misplaced_page(page, vma, target_nid);
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
Kirill A. Shutemov
committed
static int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
* so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
* The code below just needs a consistent view for the ifs and
* we later double check anyway with the ptl lock held. So here
* a barrier will do.
*/
entry = *pte;
barrier();
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
pte, pmd, flags);
pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
if (pte_numa(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
unlock:
pte_unmap_unlock(pte, ptl);
}
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
Kirill A. Shutemov
committed
int ret = VM_FAULT_FALLBACK;
Kirill A. Shutemov
committed
ret = do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
barrier();
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
/*
* If the pmd is splitting, return and retry the
* the fault. Alternative: wait until the split
* is done, and goto retry.
*/
if (pmd_trans_splitting(orig_pmd))
return 0;
Linus Torvalds
committed
if (pmd_numa(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
Linus Torvalds
committed
if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
}
}
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
/*
* By the time we get here, we already hold the mm semaphore
*
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
ret = __handle_mm_fault(mm, vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
pud_t *new = pud_alloc_one(mm, address);
if (!new)
smp_wmb(); /* See comment in __pte_alloc */
if (pgd_present(*pgd)) /* Another has populated it */
else
pgd_populate(mm, pgd, new);
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
smp_wmb(); /* See comment in __pte_alloc */
if (pud_present(*pud)) /* Another has populated it */
else
pud_populate(mm, pud, new);
if (pgd_present(*pud)) /* Another has populated it */
else
pgd_populate(mm, pud, new);
static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
/* We cannot handle huge page PFN maps. Luckily they don't exist. */
if (pmd_huge(*pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!ptep)
goto out;
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
out:
return -EINVAL;
}
static inline int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
!(res = __follow_pte(mm, address, ptepp, ptlp)));
return res;
}
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* Returns zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
{
int ret = -EINVAL;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
pte_unmap_unlock(ptep, ptl);
return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
venkatesh.pallipadi@intel.com
committed
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
pte_t *ptep, pte;
spinlock_t *ptl;
venkatesh.pallipadi@intel.com
committed
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
venkatesh.pallipadi@intel.com
committed
goto out;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
*prot = pgprot_val(pte_pgprot(pte));
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
venkatesh.pallipadi@intel.com
committed
return ret;
}
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
int offset = addr & (PAGE_SIZE-1);
venkatesh.pallipadi@intel.com
committed
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
iounmap(maddr);
return len;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
* Access another process' address space as given in mm. If non-NULL, use the
* given task for page fault accounting.
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
void *old_buf = buf;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
break;
bytes = ret;
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
return buf - old_buf;
}
Andi Kleen
committed
* access_remote_vm - access another process' address space
* @mm: the mm_struct of the target address space
* @addr: start address to access
* @buf: source or destination buffer
* @len: number of bytes to transfer
* @write: whether the access is a write
*
* The caller must hold a reference on @mm.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write)
{
return __access_remote_vm(NULL, mm, addr, buf, len, write);
}
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr,
void *buf, int len, int write)
{
struct mm_struct *mm;
int ret;
mm = get_task_mm(tsk);
if (!mm)
return 0;
ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
mmput(mm);
return ret;
}
Andi Kleen
committed
/*
* Print the name of a VMA.
*/
void print_vma_addr(char *prefix, unsigned long ip)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
Ingo Molnar
committed
/*
* Do not print if we are in atomic
* contexts (in exception stacks, etc.):
*/
if (preempt_count())
return;
Andi Kleen
committed
down_read(&mm->mmap_sem);
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
Andi Kleen
committed
p = d_path(&f->f_path, buf, PAGE_SIZE);
Andi Kleen
committed
if (IS_ERR(p))
p = "?";
Andi Kleen
committed
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
}
}
up_read(&mm->mmap_sem);
Andi Kleen
committed
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void might_fault(void)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
* holding the mmap_sem, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
if (segment_eq(get_fs(), KERNEL_DS))
return;
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
if (in_atomic())
return;
__might_sleep(__FILE__, __LINE__, 0);
if (current->mm)
might_lock_read(¤t->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
#endif
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
{
int i;
struct page *p = page;
might_sleep();
for (i = 0; i < pages_per_huge_page;
i++, p = mem_map_next(p, page, i)) {
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
void clear_huge_page(struct page *page,
unsigned long addr, unsigned int pages_per_huge_page)
{
int i;
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
clear_user_highpage(page + i, addr + i * PAGE_SIZE);
}
}
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
struct page *dst_base = dst;
struct page *src_base = src;
for (i = 0; i < pages_per_huge_page; ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
pages_per_huge_page);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
Kirill A. Shutemov
committed
#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;
void __init ptlock_cache_init(void)
{
page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
SLAB_PANIC, NULL);
}
bool ptlock_alloc(struct page *page)
Kirill A. Shutemov
committed
{
spinlock_t *ptl;
ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
Kirill A. Shutemov
committed
if (!ptl)
return false;
page->ptl = ptl;
Kirill A. Shutemov
committed
return true;
}
void ptlock_free(struct page *page)
Kirill A. Shutemov
committed
{
kmem_cache_free(page_ptl_cachep, page->ptl);