Newer
Older
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by page_cache_release() or put_page().
*
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
* allowing a hole to be left in the corefile to save diskspace.
*
* Called without mmap_sem, but after all other threads have been killed.
*/
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
struct vm_area_struct *vma;
struct page *page;
if (__get_user_pages(current, current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL;
flush_cache_page(vma, addr, page_to_pfn(page));
return page;
}
#endif /* CONFIG_ELF_CORE */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
{
pgd_t * pgd = pgd_offset(mm, addr);
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
if (pmd) {
VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
}
return NULL;
}
/*
* This is the old fallback for page remapping.
*
* For historical reasons, it only allows reserved pages. Only
* old drivers should use this, and they needed to mark their
* pages reserved for the old functions anyway.
*/
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
retval = -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
retval = -EBUSY;
if (!pte_none(*pte))
goto out_unlock;
/* Ok, finally just insert the thing.. */
get_page(page);
inc_mm_counter_fast(mm, MM_FILEPAGES);
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
pte_unmap_unlock(pte, ptl);
return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
/**
* vm_insert_page - insert single page into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @page: source kernel page
*
* This allows drivers to insert individual pages they've allocated
* into a user vma.
*
* The page has to be a nice clean _individual_ kernel allocation.
* If you allocate a compound page, you need to have marked it as
* such (__GFP_COMP), or manually just split the page up yourself
*
* NOTE! Traditionally this was done with "remap_pfn_range()" which
* took an arbitrary page protection parameter. This doesn't allow
* that. Your vma protection will have to be set up correctly, which
* means that if you want a shared writable mapping, you'd better
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
*
* Usually this function is called from f_op->mmap() handler
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
return insert_page(vma, addr, page, vma->vm_page_prot);
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte, entry;
spinlock_t *ptl;
retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
goto out;
retval = -EBUSY;
if (!pte_none(*pte))
goto out_unlock;
/* Ok, finally just insert the thing.. */
entry = pte_mkspecial(pfn_pte(pfn, prot));
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
/**
* vm_insert_pfn - insert single pfn into user vma
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
*
* Similar to vm_insert_page, this allows drivers to insert individual pages
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
* in that case the handler should return NULL.
*
* vma cannot be a COW mapping.
*
* As this is called only for pages that do not currently exist, we
* do not need to flush old virtual caches or the TLB.
*/
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
venkatesh.pallipadi@intel.com
committed
int ret;
venkatesh.pallipadi@intel.com
committed
pgprot_t pgprot = vma->vm_page_prot;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
* consistency in testing and feature parity among all, so we should
* try to keep these invariants in place for everybody.
*/
BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
Suresh Siddha
committed
if (track_pfn_insert(vma, &pgprot, pfn))
venkatesh.pallipadi@intel.com
committed
return -EINVAL;
venkatesh.pallipadi@intel.com
committed
ret = insert_pfn(vma, addr, pfn, pgprot);
venkatesh.pallipadi@intel.com
committed
return ret;
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
/*
* If we don't have pte special, then we have to use the pfn_valid()
* based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
* refcount the page if pfn_valid is true (hence insert_page rather
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
struct page *page;
page = pfn_to_page(pfn);
return insert_page(vma, addr, page, vma->vm_page_prot);
}
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
* in null mappings (currently treated as "copy-on-access")
*/
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pte_t *pte;
pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
return 0;
}
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pmd_t *pmd;
unsigned long next;
pfn -= addr >> PAGE_SHIFT;
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
VM_BUG_ON(pmd_trans_huge(*pmd));
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
pud_t *pud;
unsigned long next;
pfn -= addr >> PAGE_SHIFT;
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
if (remap_pmd_range(mm, pud, addr, next,
pfn + (addr >> PAGE_SHIFT), prot))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
* @addr: target user address to start at
* @pfn: physical address of kernel memory
* @size: size of map area
* @prot: page protection flags for this mapping
*
* Note: this is only safe if the mm semaphore is held when called.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + PAGE_ALIGN(size);
struct mm_struct *mm = vma->vm_mm;
int err;
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
* VM_DONTEXPAND
* Disable vma merging and expanding with mremap().
* VM_DONTDUMP
* Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
* See vm_normal_page() for details.
if (is_cow_mapping(vma->vm_flags)) {
if (addr != vma->vm_start || end != vma->vm_end)
return -EINVAL;
}
err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
if (err)
venkatesh.pallipadi@intel.com
committed
return -EINVAL;
vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
err = remap_pud_range(mm, pgd, addr, next,
pfn + (addr >> PAGE_SHIFT), prot);
if (err)
break;
} while (pgd++, addr = next, addr != end);
venkatesh.pallipadi@intel.com
committed
if (err)
Suresh Siddha
committed
untrack_pfn(vma, pfn, PAGE_ALIGN(size));
venkatesh.pallipadi@intel.com
committed
return err;
}
EXPORT_SYMBOL(remap_pfn_range);
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
{
pte_t *pte;
int err;
spinlock_t *uninitialized_var(ptl);
pte = (mm == &init_mm) ?
pte_alloc_kernel(pmd, addr) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
BUG_ON(pmd_huge(*pmd));
arch_enter_lazy_mmu_mode();
token = pmd_pgtable(*pmd);
do {
err = fn(pte++, token, addr, data);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
pte_unmap_unlock(pte-1, ptl);
return err;
}
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
{
pmd_t *pmd;
unsigned long next;
int err;
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
}
static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
pte_fn_t fn, void *data)
{
pud_t *pud;
unsigned long next;
int err;
pud = pud_alloc(mm, pgd, addr);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
if (err)
break;
} while (pud++, addr = next, addr != end);
return err;
}
/*
* Scan a region of virtual memory, filling in page tables as necessary
* and calling a provided function on each leaf page table.
*/
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long size, pte_fn_t fn, void *data)
{
pgd_t *pgd;
unsigned long next;
unsigned long end = addr + size;
int err;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
if (err)
break;
} while (pgd++, addr = next, addr != end);
return err;
}
EXPORT_SYMBOL_GPL(apply_to_page_range);
/*
* handle_pte_fault chooses page fault handler according to an entry
* which was read non-atomically. Before making any commitment, on
* those architectures or configurations (e.g. i386 with PAE) which
* might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
* must check under lock before unmapping the pte and proceeding
* (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spinlock_t *ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
}
#endif
pte_unmap(page_table);
return same;
}
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
if (unlikely(!src)) {
void *kaddr = kmap_atomic(dst);
void __user *uaddr = (void __user *)(va & PAGE_MASK);
/*
* This really shouldn't fail, because the page is there
* in the page tables. But it might just be unreadable,
* in which case we just give up and fill the result with
* zeroes.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
clear_page(kaddr);
flush_dcache_page(dst);
} else
copy_user_highpage(dst, src, va, vma);
/*
* This routine handles present pages, when users try to write
* to a shared page. It is done by copying the page to a new address
* and decrementing the shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
* Thus we can safely just mark it writable once we've done any necessary
* COW.
*
* We also mark the page dirty at this point even though the page will
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
struct page *old_page, *new_page = NULL;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
unsigned long mmun_start = 0; /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable as we can't do any dirty
* accounting on raw pfn maps.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
goto reuse;
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
if (PageAnon(old_page) && !PageKsm(old_page)) {
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
goto unlock;
}
page_cache_release(old_page);
if (reuse_swap_page(old_page)) {
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
goto reuse;
}
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
* get_user_pages(.write=1, .force=1).
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
struct vm_fault vmf;
int tmp;
vmf.virtual_address = (void __user *)(address &
PAGE_MASK);
vmf.pgoff = old_page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = old_page;
/*
* Notify the address space that the page is about to
* become writable so that it can prohibit this or wait
* for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can
* sleep if it needs to.
*/
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
page_mkwrite = 1;
dirty_page = old_page;
get_page(dirty_page);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, page_table);
pte_unmap_unlock(page_table, ptl);
if (!dirty_page)
return ret;
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
* __do_fault is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
}
return ret;
}
/*
* Ok, we need to copy. Oh, well..
*/
pte_unmap_unlock(page_table, ptl);
new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
__SetPageUptodate(new_page);
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
Haggai Eran
committed
mmun_start = address & PAGE_MASK;
mmun_end = mmun_start + PAGE_SIZE;
Haggai Eran
committed
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* Re-check the pte - we dropped the lock
*/
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
* Clear the pte entry and flush it first, before updating the
* pte with the new entry. This will avoid a race condition
* seen in the presence of one thread doing SMC and another
* thread doing COW.
*/
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, page_table);
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
if (old_page) {
/*
* Only after switching the pte to the new page may
* we remove the mapcount here. Otherwise another
* process may come and find the rmap count decremented
* before the pte is switched to the new page, and
* "reuse" the old page writing into it while our pte
* here still points into it and can be read by other
* threads.
*
* The critical issue is to order this
* page_remove_rmap with the ptp_clear_flush above.
* Those stores are ordered by (if nothing else,)
* the barrier present in the atomic_add_negative
* in page_remove_rmap.
*
* Then the TLB flush in ptep_clear_flush ensures that
* no process can access the old page before the
* decremented mapcount is visible. And the old page
* cannot be reused until after the decremented
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
} else
mem_cgroup_uncharge_page(new_page);
Haggai Eran
committed
if (new_page)
page_cache_release(new_page);
pte_unmap_unlock(page_table, ptl);
if (mmun_end > mmun_start)
Haggai Eran
committed
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
if (old_page) {
if (page_mkwrite) {
unlock_page(old_page);
page_cache_release(old_page);
}
unwritable_page:
page_cache_release(old_page);
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
{
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
static inline void unmap_mapping_range_tree(struct rb_root *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
zba = vba;
zea = details->last_index;
if (zea > vea)
zea = vea;
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
}
}
static inline void unmap_mapping_range_list(struct list_head *head,
struct zap_details *details)
{
struct vm_area_struct *vma;
/*
* In nonlinear VMAs there is no correspondence between virtual address
* offset and file offset. So we must perform an exhaustive search
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
list_for_each_entry(vma, head, shared.nonlinear) {
unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
* unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
* boundary. Note that this is different from truncate_pagecache(), which
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
* must keep the partial page. In contrast, we must get rid of
* partial pages.
* @holelen: size of prospective hole in bytes. This will be rounded
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
* end of the file.
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
* but 0 when invalidating pagecache, don't throw away private data.
*/
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
struct zap_details details;
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Check for overflow. */
if (sizeof(holelen) > sizeof(hlen)) {
long long holeend =
(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
details.check_mapping = even_cows? NULL: mapping;
details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
mutex_lock(&mapping->i_mmap_mutex);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
mutex_unlock(&mapping->i_mmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
struct mem_cgroup *ptr;
int exclusive = 0;
if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
entry = pte_to_swp_entry(orig_pte);
if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) {
migration_entry_wait(mm, pmd, address);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
print_bad_pte(vma, address, orig_pte, NULL);
goto out;
}
Shailabh Nagar
committed
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
* Back out if somebody else faulted in this pte
* while we released the pte lock.
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte)))
ret = VM_FAULT_OOM;
Shailabh Nagar
committed
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
}
/* Had to read the page from swap area: Major fault */
ret = VM_FAULT_MAJOR;
mem_cgroup_count_vm_event(mm, PGMAJFAULT);
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
goto out_release;
locked = lock_page_or_retry(page, mm, flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
}
* Make sure try_to_free_swap or reuse_swap_page or swapoff did not
* release the swapcache from under us. The page pin, and pte_same
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.