Newer
Older
entry = pte_to_swp_entry(orig_pte);
if (is_migration_entry(entry)) {
migration_entry_wait(mm, pmd, address);
goto out;
}
Shailabh Nagar
committed
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
grab_swap_token(); /* Contend for token _before_ read-in */
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
* Back out if somebody else faulted in this pte
* while we released the pte lock.
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
Shailabh Nagar
committed
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
}
mark_page_accessed(page);
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
* Back out if somebody else already faulted in this pte.
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
/* The page isn't present yet, go ahead with the fault. */
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
swap_free(entry);
if (vm_swap_full())
remove_exclusive_swap_page(page);
unlock_page(page);
/* XXX: We could OR the do_wp_page code with this one? */
ret = VM_FAULT_OOM;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
pte_unmap_unlock(page_table, ptl);
out_nomap:
pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access)
struct page *page;
spinlock_t *ptl;
/* Allocate our own private page. */
pte_unmap(page_table);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
entry = mk_pte(page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
page_add_new_anon_rmap(page, vma, address);
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);
unlock:
pte_unmap_unlock(page_table, ptl);
release:
page_cache_release(page);
goto unlock;
* __do_fault() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
* the next page fault.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte neither mapped nor locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
struct page *dirty_page = NULL;
int page_mkwrite = 0;
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
BUG_ON(vma->vm_flags & VM_PFNMAP);
if (likely(vma->vm_ops->fault)) {
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
return ret;
} else {
/* Legacy ->nopage path */
vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/* no page was available -- either SIGBUS or OOM */
return VM_FAULT_SIGBUS;
return VM_FAULT_OOM;
}
* For consistency in subsequent calls, make the faulted page always
if (flags & FAULT_FLAG_WRITE) {
if (!(vma->vm_flags & VM_SHARED)) {
if (unlikely(anon_vma_prepare(vma))) {
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
vma, address);
/*
* If the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable
*/
if (vma->vm_ops->page_mkwrite) {
unlock_page(page);
if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
ret = VM_FAULT_SIGBUS;
anon = 1; /* no anon but release vmf.page */
goto out_unlocked;
}
lock_page(page);
/*
* XXX: this is not quite right (racy vs
* invalidate) to unlock and relock the page
* like this, however a better fix requires
* reworking page_mkwrite locking API, which
* is better done later.
*/
if (!page->mapping) {
anon = 1; /* no anon but release vmf.page */
goto out;
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (likely(pte_same(*page_table, orig_pte))) {
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
if (flags & FAULT_FLAG_WRITE)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
inc_mm_counter(mm, anon_rss);
lru_cache_add_active(page);
page_add_new_anon_rmap(page, vma, address);
page_add_file_rmap(page);
if (flags & FAULT_FLAG_WRITE) {
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
if (anon)
page_cache_release(page);
else
anon = 1; /* no anon but release faulted_page */
pte_unmap_unlock(page_table, ptl);
else if (dirty_page) {
if (vma->vm_file)
file_update_time(vma->vm_file);
set_page_dirty_balance(dirty_page, page_mkwrite);
static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access, pte_t orig_pte)
{
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
pte_unmap(page_table);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
/*
* do_no_pfn() tries to create a new page mapping for a page without
* a struct_page backing it
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*
* It is expected that the ->nopfn handler always returns the same pfn
* for a given virtual mapping.
*
* Mark this `noinline' to prevent it from bloating the main pagefault code.
*/
static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access)
{
spinlock_t *ptl;
pte_t entry;
unsigned long pfn;
pte_unmap(page_table);
BUG_ON(!(vma->vm_flags & VM_PFNMAP));
BUG_ON(is_cow_mapping(vma->vm_flags));
pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
if (unlikely(pfn == NOPFN_OOM))
else if (unlikely(pfn == NOPFN_SIGBUS))
else if (unlikely(pfn == NOPFN_REFAULT))
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
entry = pfn_pte(pfn, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
}
pte_unmap_unlock(page_table, ptl);
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access, pte_t orig_pte)
unsigned int flags = FAULT_FLAG_NONLINEAR |
(write_access ? FAULT_FLAG_WRITE : 0);
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
if (unlikely(!(vma->vm_flags & VM_NONLINEAR) ||
!(vma->vm_flags & VM_CAN_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
return VM_FAULT_OOM;
}
pgoff = pte_to_pgoff(orig_pte);
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, int write_access)
if (vma->vm_ops->fault || vma->vm_ops->nopage)
return do_linear_fault(mm, vma, address,
pte, pmd, write_access, entry);
if (unlikely(vma->vm_ops->nopfn))
return do_no_pfn(mm, vma, address, pte,
pmd, write_access);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, write_access);
pte, pmd, write_access, entry);
return do_swap_page(mm, vma, address,
pte, pmd, write_access, entry);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
update_mmu_cache(vma, address, entry);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (write_access)
flush_tlb_page(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
}
/*
* By the time we get here, we already hold the mm semaphore
*/
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, write_access);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
}
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
pud_t *new = pud_alloc_one(mm, address);
if (!new)
if (pgd_present(*pgd)) /* Another has populated it */
else
pgd_populate(mm, pgd, new);
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
if (pud_present(*pud)) /* Another has populated it */
else
pud_populate(mm, pud, new);
if (pgd_present(*pud)) /* Another has populated it */
else
pgd_populate(mm, pud, new);
#endif /* __PAGETABLE_PMD_FOLDED */
int make_pages_present(unsigned long addr, unsigned long end)
{
int ret, len, write;
struct vm_area_struct * vma;
vma = find_vma(current->mm, addr);
if (!vma)
return -1;
write = (vma->vm_flags & VM_WRITE) != 0;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
if (ret < 0)
return ret;
return ret == len ? 0 : -1;
}
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
static struct vm_area_struct gate_vma;
static int __init gate_vma_init(void)
{
gate_vma.vm_mm = NULL;
gate_vma.vm_start = FIXADDR_USER_START;
gate_vma.vm_end = FIXADDR_USER_END;
gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
gate_vma.vm_page_prot = __P101;
/*
* Make sure the vDSO gets into every core dump.
* Dumping its contents makes post-mortem fully interpretable later
* without matching up the same kernel and hardware config to see
* what PC values meant.
*/
gate_vma.vm_flags |= VM_ALWAYSDUMP;
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
return 0;
}
__initcall(gate_vma_init);
#endif
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
#else
return NULL;
#endif
}
int in_gate_area_no_task(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
return 1;
#endif
return 0;
}
#endif /* __HAVE_ARCH_GATE_AREA */
/*
* Access another process' address space.
* Source/target buffer must be kernel space,
* Do not walk the page table directly, use get_user_pages
*/
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
struct mm_struct *mm;
struct vm_area_struct *vma;
struct page *page;
void *old_buf = buf;
mm = get_task_mm(tsk);
if (!mm)
return 0;
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
while (len) {
int bytes, ret, offset;
void *maddr;
ret = get_user_pages(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0)
break;
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);
mmput(mm);
return buf - old_buf;
}
Andi Kleen
committed
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
/*
* Print the name of a VMA.
*/
void print_vma_addr(char *prefix, unsigned long ip)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
down_read(&mm->mmap_sem);
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
char *buf = (char *)__get_free_page(GFP_KERNEL);
if (buf) {
char *p, *s;
p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
s = strrchr(p, '/');
if (s)
p = s+1;
printk("%s%s[%lx+%lx]", prefix, p,
vma->vm_start,
vma->vm_end - vma->vm_start);
free_page((unsigned long)buf);
}
}
up_read(¤t->mm->mmap_sem);
}