Newer
Older
Shailabh Nagar
committed
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
mark_page_accessed(page);
lock_page(page);
/*
* Back out if somebody else already faulted in this pte.
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*page_table, orig_pte)))
goto out_nomap;
if (unlikely(!PageUptodate(page))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
/* The page isn't present yet, go ahead with the fault. */
pte = mk_pte(page, vma->vm_page_prot);
if (write_access && can_share_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
write_access = 0;
}
flush_icache_page(vma, page);
set_pte_at(mm, address, page_table, pte);
page_add_anon_rmap(page, vma, address);
swap_free(entry);
if (vm_swap_full())
remove_exclusive_swap_page(page);
unlock_page(page);
if (write_access) {
if (do_wp_page(mm, vma, address,
page_table, pmd, ptl, pte) == VM_FAULT_OOM)
ret = VM_FAULT_OOM;
goto out;
}
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, pte);
lazy_mmu_prot_update(pte);
pte_unmap_unlock(page_table, ptl);
out_nomap:
pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access)
struct page *page;
spinlock_t *ptl;
/* Allocate our own private page. */
pte_unmap(page_table);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage(vma, address);
entry = mk_pte(page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, anon_rss);
page_add_new_anon_rmap(page, vma, address);
/* Map the ZERO_PAGE - vm_page_prot is readonly */
page = ZERO_PAGE(address);
page_cache_get(page);
entry = mk_pte(page, vma->vm_page_prot);
spin_lock(ptl);
if (!pte_none(*page_table))
goto release;
inc_mm_counter(mm, file_rss);
page_add_file_rmap(page);
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, entry);
pte_unmap_unlock(page_table, ptl);
release:
page_cache_release(page);
goto unlock;
return VM_FAULT_OOM;
}
/*
* do_no_page() tries to create a new page mapping. It aggressively
* tries to share with existing pages, but makes a separate copy if
* the "write_access" parameter is true in order to avoid the next
* page fault.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access)
struct address_space *mapping = NULL;
pte_t entry;
unsigned int sequence = 0;
int ret = VM_FAULT_MINOR;
int anon = 0;
struct page *dirty_page = NULL;
BUG_ON(vma->vm_flags & VM_PFNMAP);
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
if (vma->vm_file) {
mapping = vma->vm_file->f_mapping;
sequence = mapping->truncate_count;
smp_rmb(); /* serializes i_size against truncate_count */
}
retry:
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
/*
* No smp_rmb is needed here as long as there's a full
* spin_lock/unlock sequence inside the ->nopage callback
* (for the pagecache lookup) that acts as an implicit
* smp_mb() and prevents the i_size read to happen
* after the next truncate_count read.
*/
/* no page was available -- either SIGBUS or OOM */
if (new_page == NOPAGE_SIGBUS)
return VM_FAULT_SIGBUS;
if (new_page == NOPAGE_OOM)
return VM_FAULT_OOM;
/*
* Should we do an early C-O-W break?
*/
if (write_access) {
if (!(vma->vm_flags & VM_SHARED)) {
struct page *page;
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
new_page = page;
anon = 1;
} else {
/* if the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable */
if (vma->vm_ops->page_mkwrite &&
vma->vm_ops->page_mkwrite(vma, new_page) < 0
) {
page_cache_release(new_page);
return VM_FAULT_SIGBUS;
}
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
if (mapping && unlikely(sequence != mapping->truncate_count)) {
pte_unmap_unlock(page_table, ptl);
cond_resched();
sequence = mapping->truncate_count;
smp_rmb();
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
goto retry;
}
/*
* This silly early PAGE_DIRTY setting removes a race
* due to the bad i386 page protection. But it's valid
* for other architectures too.
*
* Note that if write_access is true, we either now have
* an exclusive copy of the page, or this is a shared mapping,
* so we can make it writable and dirty to avoid having to
* handle that later.
*/
/* Only go through if we didn't race with anybody else... */
if (pte_none(*page_table)) {
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
set_pte_at(mm, address, page_table, entry);
if (anon) {
page_add_new_anon_rmap(new_page, vma, address);
if (write_access) {
dirty_page = new_page;
get_page(dirty_page);
}
} else {
/* One of our sibling threads was faster, back out. */
page_cache_release(new_page);
}
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
pte_unmap_unlock(page_table, ptl);
return ret;
oom:
page_cache_release(new_page);
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
/*
 * do_no_pfn() installs a pte for a faulting address that has no
 * struct page behind it.
 *
 * The vma's ->nopfn handler is asked for the page frame number backing
 * the page-aligned fault address.  NOPFN_OOM and NOPFN_SIGBUS from the
 * handler become VM_FAULT_OOM and VM_FAULT_SIGBUS.  Otherwise the pfn
 * is mapped with vma->vm_page_prot, but only if the pte is still empty
 * once the page table lock has been taken; on a write fault the new
 * pte is first made dirty and run through maybe_mkwrite().  Either way
 * the fault is then reported as VM_FAULT_MINOR.
 *
 * The vma must be VM_PFNMAP and must not be a COW mapping; both are
 * enforced with BUG_ON().
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 *
 * It is expected that the ->nopfn handler always returns the same pfn
 * for a given virtual mapping.
 *
 * Mark this `noinline' to prevent it from bloating the main pagefault code.
 */
static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		int write_access)
{
	unsigned long pfn;
	spinlock_t *lock;
	pte_t new_pte;

	/* Drop the unlocked mapping we were handed before calling out. */
	pte_unmap(page_table);
	BUG_ON(!(vma->vm_flags & VM_PFNMAP));
	BUG_ON(is_cow_mapping(vma->vm_flags));

	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
	if (pfn == NOPFN_OOM)
		return VM_FAULT_OOM;
	if (pfn == NOPFN_SIGBUS)
		return VM_FAULT_SIGBUS;

	page_table = pte_offset_map_lock(mm, pmd, address, &lock);

	/* Only go through if we didn't race with anybody else... */
	if (!pte_none(*page_table))
		goto unlock;

	new_pte = pfn_pte(pfn, vma->vm_page_prot);
	if (write_access)
		new_pte = maybe_mkwrite(pte_mkdirty(new_pte), vma);
	set_pte_at(mm, address, page_table, new_pte);

unlock:
	pte_unmap_unlock(page_table, lock);
	return VM_FAULT_MINOR;
}
/*
* Fault of a previously existing named mapping. Repopulate the pte
* from the encoded file_pte if possible. This enables swappable
* nonlinear vmas.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
int write_access, pte_t orig_pte)
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
/*
* Page table corrupted: show pte and kill process.
*/
return VM_FAULT_OOM;
}
/* We can then assume vm->vm_ops && vma->vm_ops->populate */
pgoff = pte_to_pgoff(orig_pte);
err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
vma->vm_page_prot, pgoff, 0);
if (err == -ENOMEM)
return VM_FAULT_OOM;
if (err)
return VM_FAULT_SIGBUS;
return VM_FAULT_MAJOR;
}
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
*
* There is also a hook called "update_mmu_cache()" that architectures
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
static inline int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, int write_access)
pte_t old_entry;
old_entry = entry = *pte;
if (vma->vm_ops) {
if (vma->vm_ops->nopage)
return do_no_page(mm, vma, address,
pte, pmd,
write_access);
if (unlikely(vma->vm_ops->nopfn))
return do_no_pfn(mm, vma, address, pte,
pmd, write_access);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, write_access);
return do_file_page(mm, vma, address,
pte, pmd, write_access, entry);
return do_swap_page(mm, vma, address,
pte, pmd, write_access, entry);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (!pte_same(old_entry, entry)) {
ptep_set_access_flags(vma, address, pte, entry, write_access);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (write_access)
flush_tlb_page(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return VM_FAULT_MINOR;
}
/*
* By the time we get here, we already hold the mm semaphore
*/
int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
__set_current_state(TASK_RUNNING);
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, write_access);
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
pte = pte_alloc_map(mm, pmd, address);
if (!pte)
return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
EXPORT_SYMBOL_GPL(__handle_mm_fault);
#ifndef __PAGETABLE_PUD_FOLDED
/*
* Allocate page upper directory.
* We've already handled the fast-path in-line.
*/
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
pud_t *new = pud_alloc_one(mm, address);
if (!new)
if (pgd_present(*pgd)) /* Another has populated it */
else
pgd_populate(mm, pgd, new);
#else
/*
 * Variant compiled when __PAGETABLE_PUD_FOLDED is defined: it does no
 * work and always returns 0.
 *
 * Workaround for gcc 2.96
 */
int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */
#ifndef __PAGETABLE_PMD_FOLDED
/*
* Allocate page middle directory.
* We've already handled the fast-path in-line.
*/
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
pmd_t *new = pmd_alloc_one(mm, address);
if (!new)
if (pud_present(*pud)) /* Another has populated it */
else
pud_populate(mm, pud, new);
if (pgd_present(*pud)) /* Another has populated it */
else
pgd_populate(mm, pud, new);
}
#else
/*
 * Variant compiled when __PAGETABLE_PMD_FOLDED is defined: it does no
 * work and always returns 0.
 *
 * Workaround for gcc 2.96
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */
int make_pages_present(unsigned long addr, unsigned long end)
{
int ret, len, write;
struct vm_area_struct * vma;
vma = find_vma(current->mm, addr);
if (!vma)
return -1;
write = (vma->vm_flags & VM_WRITE) != 0;
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
ret = get_user_pages(current, current->mm, addr,
len, write, 0, NULL, NULL);
if (ret < 0)
return ret;
return ret == len ? 0 : -1;
}
/*
 * Map a vmalloc()-space virtual address to the physical page.
 *
 * Walks the kernel page tables for @vmalloc_addr one level at a time
 * (pgd, pud, pmd, pte).  Returns the struct page taken from the pte,
 * or NULL if any level of the walk is empty or the pte is not present.
 */
struct page * vmalloc_to_page(void * vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *found = NULL;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	/* Copy the pte while it is mapped, then drop the mapping again. */
	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		found = pte_page(pte);
	pte_unmap(ptep);

	return found;
}
EXPORT_SYMBOL(vmalloc_to_page);
/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 *
 * Looks the address up with vmalloc_to_page() and converts the result
 * with page_to_pfn().
 *
 * NOTE(review): vmalloc_to_page() returns NULL when the address is not
 * mapped, and that result is handed to page_to_pfn() unchecked here --
 * confirm callers only pass mapped addresses.
 */
unsigned long vmalloc_to_pfn(void * vmalloc_addr)
{
	struct page *page = vmalloc_to_page(vmalloc_addr);

	return page_to_pfn(page);
}
EXPORT_SYMBOL(vmalloc_to_pfn);
#if !defined(__HAVE_ARCH_GATE_AREA)
#if defined(AT_SYSINFO_EHDR)
/* The vma handed out by get_gate_vma(); filled in by gate_vma_init(). */
static struct vm_area_struct gate_vma;

/*
 * Initcall that sets up gate_vma: it spans FIXADDR_USER_START up to
 * FIXADDR_USER_END, is mapped PAGE_READONLY and has no owning mm.
 * Always returns 0.
 *
 * NOTE(review): the listing this was taken from has a gap between the
 * vm_page_prot assignment and the return -- confirm no further field
 * initialisation belongs here.
 */
static int __init gate_vma_init(void)
{
	gate_vma.vm_start = FIXADDR_USER_START;
	gate_vma.vm_end = FIXADDR_USER_END;
	gate_vma.vm_page_prot = PAGE_READONLY;
	gate_vma.vm_mm = NULL;

	return 0;
}
__initcall(gate_vma_init);
#endif
/*
 * Return the address of gate_vma when AT_SYSINFO_EHDR is defined, and
 * NULL otherwise.  @tsk is not looked at by this version.
 */
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifndef AT_SYSINFO_EHDR
	return NULL;
#else
	return &gate_vma;
#endif
}
/*
 * Report whether @addr lies in the gate area without reference to any
 * task.
 *
 * Returns 1 when AT_SYSINFO_EHDR is defined and @addr falls inside
 * [FIXADDR_USER_START, FIXADDR_USER_END); returns 0 in every other
 * case, including all addresses when AT_SYSINFO_EHDR is not defined.
 */
int in_gate_area_no_task(unsigned long addr)
{
	int inside = 0;

#ifdef AT_SYSINFO_EHDR
	inside = (addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END);
#endif
	return inside;
}
#endif /* __HAVE_ARCH_GATE_AREA */
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 *
 * Moves up to @len bytes between @buf and @tsk's memory at @addr, one
 * page at a time, with the task's mmap_sem held for reading.  @write
 * selects the direction: non-zero uses copy_to_user_page() and then
 * marks the page dirty, zero uses copy_from_user_page().
 *
 * Returns the number of bytes transferred.  That is 0 when the task
 * has no mm, and less than @len when get_user_pages() stops supplying
 * pages part-way through.
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
{
	struct mm_struct *mm = get_task_mm(tsk);
	struct vm_area_struct *vma;
	struct page *page;
	void *start = buf;

	if (!mm)
		return 0;

	down_read(&mm->mmap_sem);
	/* ignore errors, just check how much was successfully transferred */
	while (len) {
		int offset, chunk;
		void *kaddr;

		if (get_user_pages(tsk, mm, addr, 1,
				   write, 1, &page, &vma) <= 0)
			break;

		/* Clamp this round so it stops at the end of the page. */
		offset = addr & (PAGE_SIZE-1);
		chunk = len;
		if (chunk > PAGE_SIZE-offset)
			chunk = PAGE_SIZE-offset;

		kaddr = kmap(page);
		if (!write) {
			copy_from_user_page(vma, page, addr,
					    buf, kaddr + offset, chunk);
		} else {
			copy_to_user_page(vma, page, addr,
					  kaddr + offset, buf, chunk);
			set_page_dirty_lock(page);
		}
		kunmap(page);
		page_cache_release(page);

		addr += chunk;
		buf += chunk;
		len -= chunk;
	}
	up_read(&mm->mmap_sem);
	mmput(mm);

	return buf - start;
}