Newer
Older
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
inc_mm_counter_fast(mm, MM_ANONPAGES);
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, page_table);
pte_unmap_unlock(page_table, ptl);
page_cache_release(page);
goto unlock;
* __do_fault() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
* the next page fault.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte neither mapped nor locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
struct page *dirty_page = NULL;
int page_mkwrite = 0;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY)))
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
return VM_FAULT_HWPOISON;
}
* For consistency in subsequent calls, make the faulted page always
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
if (unlikely(anon_vma_prepare(vma))) {
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
vma, address);
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
ret = VM_FAULT_OOM;
page_cache_release(page);
goto out;
}
charged = 1;
/*
* If the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable
*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(page);
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(page);
if (!page->mapping) {
ret = 0; /* retry the fault */
unlock_page(page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(page));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if FAULT_FLAG_WRITE is set, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
set_pte_at(mm, address, page_table, entry);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, page_table);
if (charged)
mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
anon = 1; /* no anon but release faulted_page */
pte_unmap_unlock(page_table, ptl);
if (dirty_page) {
struct address_space *mapping = page->mapping;
if (set_page_dirty(dirty_page))
page_mkwrite = 1;
unlock_page(dirty_page);
if (page_mkwrite && mapping) {
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
} else {
unlock_page(vmf.page);
if (anon)
page_cache_release(vmf.page);
unwritable_page:
page_cache_release(page);
return ret;
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
flags |= FAULT_FLAG_NONLINEAR;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
print_bad_pte(vma, address, orig_pte, NULL);
}
pgoff = pte_to_pgoff(orig_pte);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
pte, pmd, flags);
pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
unlock:
pte_unmap_unlock(pte, ptl);
}
/*
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
if (!vma->vm_ops)
return do_huge_pmd_anonymous_page(mm, vma, address,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
!pmd_trans_splitting(orig_pmd))
return do_huge_pmd_wp_page(mm, vma, address,
pmd, orig_pmd);
return 0;
}
}
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
if (unlikely(__pte_alloc(mm, vma, pmd, address)))
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
* from under us anymore at this point because we hold the mmap_sem
* read mode and khugepaged takes it in write mode. So now it's
* safe to run pte_offset_map().
*/
pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
pud_t *new = pud_alloc_one(mm, address);
if (!new)
smp_wmb(); /* See comment in __pte_alloc */
if (pgd_present(*pgd)) /* Another has populated it */
else
pgd_populate(mm, pgd, new);
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
smp_wmb(); /* See comment in __pte_alloc */
if (pud_present(*pud)) /* Another has populated it */
else
pud_populate(mm, pud, new);
if (pgd_present(*pud)) /* Another has populated it */
else
pgd_populate(mm, pud, new);
#endif /* __PAGETABLE_PMD_FOLDED */
int make_pages_present(unsigned long addr, unsigned long end)
{
int ret, len, write;
struct vm_area_struct * vma;
vma = find_vma(current->mm, addr);
if (!vma)
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
* and we would not want to dirty them for nothing.
*/
write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
return ret == len ? 0 : -EFAULT;
}
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
static struct vm_area_struct gate_vma;
static int __init gate_vma_init(void)
{
gate_vma.vm_mm = NULL;
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
gate_vma.vm_page_prot = __P101;
/*
* Make sure the vDSO gets into every core dump.
* Dumping its contents makes post-mortem fully interpretable later
* without matching up the same kernel and hardware config to see
* what PC values meant.
*/
gate_vma.vm_flags |= VM_ALWAYSDUMP;
return 0;
}
__initcall(gate_vma_init);
#endif
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
#else
return NULL;
#endif
}
int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
return 1;
#endif
return 0;
}
#endif /* __HAVE_ARCH_GATE_AREA */
static int __follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *ptep;
pgd = pgd_offset(mm, address);
if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
goto out;
pud = pud_offset(pgd, address);
if (pud_none(*pud) || unlikely(pud_bad(*pud)))
goto out;
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
/* We cannot handle huge page PFN maps. Luckily they don't exist. */
if (pmd_huge(*pmd))
goto out;
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!ptep)
goto out;
if (!pte_present(*ptep))
goto unlock;
*ptepp = ptep;
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
out:
return -EINVAL;
}
static inline int follow_pte(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
!(res = __follow_pte(mm, address, ptepp, ptlp)));
return res;
}
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
* @address: user virtual address
* @pfn: location to store found PFN
*
* Only IO mappings and raw PFN mappings are allowed.
*
* Returns zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
{
int ret = -EINVAL;
spinlock_t *ptl;
pte_t *ptep;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
pte_unmap_unlock(ptep, ptl);
return 0;
}
EXPORT_SYMBOL(follow_pfn);
#ifdef CONFIG_HAVE_IOREMAP_PROT
venkatesh.pallipadi@intel.com
committed
int follow_phys(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
unsigned long *prot, resource_size_t *phys)
pte_t *ptep, pte;
spinlock_t *ptl;
venkatesh.pallipadi@intel.com
committed
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
venkatesh.pallipadi@intel.com
committed
goto out;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
*prot = pgprot_val(pte_pgprot(pte));
*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
unlock:
pte_unmap_unlock(ptep, ptl);
out:
venkatesh.pallipadi@intel.com
committed
return ret;
}
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
int offset = addr & (PAGE_SIZE-1);
venkatesh.pallipadi@intel.com
committed
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
iounmap(maddr);
return len;
}
#endif
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct mm_struct *mm;
struct vm_area_struct *vma;
void *old_buf = buf;
mm = get_task_mm(tsk);
if (!mm)
return 0;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) {
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
#ifdef CONFIG_HAVE_IOREMAP_PROT
vma = find_vma(mm, addr);
if (!vma)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
#endif
break;
bytes = ret;
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
mmput(mm);
return buf - old_buf;
}
Andi Kleen
committed
/*
* Print the name of a VMA.
*/
void print_vma_addr(char *prefix, unsigned long ip)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
Ingo Molnar
committed
/*
* Do not print if we are in atomic
* contexts (in exception stacks, etc.):
*/
if (preempt_count())
return;
Andi Kleen
committed
down_read(&mm->mmap_sem);
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
char *p, *s;
p = d_path(&f->f_path, buf, PAGE_SIZE);
Andi Kleen
committed
if (IS_ERR(p))
p = "?";
s = strrchr(p, '/');
if (s)
p = s+1;
printk("%s%s[%lx+%lx]", prefix, p,
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
}
}
up_read(¤t->mm->mmap_sem);
}
#ifdef CONFIG_PROVE_LOCKING
void might_fault(void)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
* holding the mmap_sem, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
if (segment_eq(get_fs(), KERNEL_DS))
return;
might_sleep();
/*
* it would be nicer only to annotate paths which are not under
* pagefault_disable, however that requires a larger audit and
* providing helpers like get_user_atomic.
*/
if (!in_atomic() && current->mm)
might_lock_read(¤t->mm->mmap_sem);
}
EXPORT_SYMBOL(might_fault);
#endif
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
static void clear_gigantic_page(struct page *page,
unsigned long addr,
unsigned int pages_per_huge_page)
{
int i;
struct page *p = page;
might_sleep();
for (i = 0; i < pages_per_huge_page;
i++, p = mem_map_next(p, page, i)) {
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
}
void clear_huge_page(struct page *page,
unsigned long addr, unsigned int pages_per_huge_page)
{
int i;
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
clear_gigantic_page(page, addr, pages_per_huge_page);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
clear_user_highpage(page + i, addr + i * PAGE_SIZE);
}
}
static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr,
struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
struct page *dst_base = dst;
struct page *src_base = src;
for (i = 0; i < pages_per_huge_page; ) {
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
i++;
dst = mem_map_next(dst, dst_base, i);
src = mem_map_next(src, src_base, i);
}
}
void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma,
unsigned int pages_per_huge_page)
{
int i;
if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
copy_user_gigantic_page(dst, src, addr, vma,
pages_per_huge_page);
return;
}
might_sleep();
for (i = 0; i < pages_per_huge_page; i++) {
cond_resched();
copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */