Newer
Older
kfree(pages);
ret |= VM_FAULT_OOM;
goto out;
}
set_page_private(pages[i], (unsigned long)memcg);
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
__SetPageUptodate(pages[i]);
cond_resched();
}
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vma, haddr);
mem_cgroup_commit_charge(pages[i], memcg, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
pte_unmap(pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
out:
return ret;
out_free_pages:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
mem_cgroup_cancel_charge(pages[i], memcg);
kfree(pages);
goto out;
}
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
struct page *page = NULL, *new_page;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
update_mmu_cache_pmd(vma, address, pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_user_huge_page(page);
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
gfp_t gfp;
gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
if (unlikely(!new_page)) {
if (!page) {
split_huge_page_pmd(vma, address, pmd);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
split_huge_page(page);
ret |= VM_FAULT_FALLBACK;
}
put_user_huge_page(page);
count_vm_event(THP_FAULT_FALLBACK);
if (unlikely(mem_cgroup_try_charge(new_page, mm,
GFP_TRANSHUGE, &memcg))) {
if (page) {
split_huge_page(page);
put_user_huge_page(page);
} else
split_huge_page_pmd(vma, address, pmd);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_ALLOC);
if (!page)
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
put_user_huge_page(page);
mem_cgroup_cancel_charge(new_page, memcg);
goto out_mn;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
update_mmu_cache_pmd(vma, address, pmd);
if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page);
put_page(page);
}
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_unlock:
return ret;
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
* We should set the dirty bit only for FOLL_WRITE but
* for now the dirty bit in the pmd is meaningless.
* And if the dirty bit will become meaningful and
* we'll only set it with FOLL_WRITE, an atomic
* set_bit will be required on the pmd to set the
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
unlock_page(page);
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
out:
return page;
}
/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
/*
* If there are potential migrations, wait for completion and retry
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
page = pmd_page(*pmdp);
spin_unlock(ptl);
wait_on_page_locked(page);
goto out;
}
page = pmd_page(pmd);
BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
flags |= TNF_FAULT_LOCAL;
}
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
* in general, RO pages shouldn't hurt as much anyway since
* they can be in shared cache state.
*/
if (!pmd_write(pmd))
flags |= TNF_NO_GROUP;
/*
* Acquire the page lock to serialise THP migrations but avoid dropping
* page_table_lock if at all possible
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
wait_on_page_locked(page);
page_nid = -1;
/*
* Page is misplaced. Page lock serialises migrations. Acquire anon_vma
* to serialises splits
*/
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
page_nid = -1;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
page_nid = -1;
goto clear_pmdnuma;
}
/*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
pmd = pmd_mknonnuma(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_protnone(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
return 0;
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
/*
* For architectures like ppc64 we look at deposited pgtable
* when calling pmdp_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
page_remove_rmap(page);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
ret = 1;
}
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
goto out;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
goto out;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
out:
return ret;
}
/*
* Returns
* - 0 if PMD could not be locked
* - 1 if PMD was locked but protections unchange and TLB flush unnecessary
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
ret = 1;
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
if (pmd_protnone(entry))
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
/*
* Do not trap faults against the zero page. The
* read-only data is likely to be read-cached on the
* local CPU cache and it is less useful to know about
* local vs remote hits on the zero page.
*/
if (!is_huge_zero_page(page) &&
!pmd_protnone(*pmd)) {
pmdp_set_numa(mm, addr, pmd);
ret = HPAGE_PMD_NR;
}
}
spin_unlock(ptl);
}
return ret;
}
/*
* Returns 1 if a given pmd maps a stable (not under splitting) thp.
* Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
*
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
*ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
/* Thp mapped by 'pmd' is stable, so we can
* handle it as it is. */
return 1;
spin_unlock(*ptl);
/*
* This function returns whether a given @page is mapped onto the @address
* in the virtual space of @mm.
*
* When it's true, this function returns *pmd with holding the page table lock
* and passing it back to the caller via @ptl.
* If it's false, returns NULL without holding the page table lock.
*/
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl)
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
return NULL;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return NULL;
pmd = pmd_offset(pud, address);
*ptl = pmd_lock(mm, pmd);
if (!pmd_present(*pmd))
goto unlock;
goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
* splitting bit set before __split_huge_page_refcount
* runs. Finding the same huge pmd more than once during the
* same rmap walk is not a problem.
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
return pmd;
unlock:
spin_unlock(*ptl);
return NULL;
}
static int __split_huge_page_splitting(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
/* For mmu_notifiers */
const unsigned long mmun_start = address;
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
* and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
static void __split_huge_page_refcount(struct page *page,
struct list_head *list)
{
int i;
struct zone *zone = page_zone(page);
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
lruvec = mem_cgroup_page_lruvec(page, zone);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(page);
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
struct page *page_tail = page + i;
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
/* tail_page->_mapcount cannot change */
BUG_ON(page_mapcount(page_tail) < 0);
tail_count += page_mapcount(page_tail);
/* check for overflow */
BUG_ON(tail_count < 0);
BUG_ON(atomic_read(&page_tail->_count) != 0);
/*
* tail_page->_count is zero and not changing from
* under us. But get_page_unless_zero() may be running
* from under us on the tail_page. If we used
* atomic_set() below instead of atomic_add(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is
* implemented in C not using locked ops. spin_unlock
* on x86 sometime uses locked ops because of PPro
* errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and
* not only on x86), it's safer to use atomic_add().
*/
atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
&page_tail->_count);
/* after clearing PageTail the gup refcount can be released */
smp_mb__after_atomic();
/*
* retain hwpoison flag of the poisoned tail page:
* fix for the unsuitable process killed on Guest Machine(KVM)
* by the memory-failure.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
(1L << PG_unevictable)));
page_tail->flags |= (1L << PG_dirty);
/* clear PageTail before overwriting first_page */
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
smp_wmb();
/*
* __split_huge_page_splitting() already set the
* splitting bit in all pmd that could map this
* hugepage, that will ensure no CPU can alter the
* mapcount on the head page. The mapcount is only
* accounted in the head page and it has to be
* transferred to all tail pages in the below code. So
* for this code to be safe, the split the mapcount
* can't change. But that doesn't mean userland can't
* keep changing and reading the page contents while
* we transfer the mapcount, so the pmd splitting
* status is achieved setting a reserved bit in the
* pmd, not by clearing the present bit.
*/
page_tail->_mapcount = page->_mapcount;
BUG_ON(page_tail->mapping);
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
BUG_ON(!PageDirty(page_tail));
BUG_ON(!PageSwapBacked(page_tail));
lru_add_page_tail(page, page_tail, lruvec, list);
atomic_sub(tail_count, &page->_count);
BUG_ON(atomic_read(&page->_count) <= 0);
__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
ClearPageCompound(page);
compound_unlock(page);
spin_unlock_irq(&zone->lru_lock);
for (i = 1; i < HPAGE_PMD_NR; i++) {
struct page *page_tail = page + i;
BUG_ON(page_count(page_tail) <= 0);
/*
* Tail pages may be freed if there wasn't any mapping
* like if add_to_swap() is running on a lru page that
* had its mapping zapped. And freeing these pages
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
put_page(page_tail);
}
/*
* Only the head page (now become a regular page) is required
* to be pinned by the caller.
*/
BUG_ON(page_count(page) <= 0);
}
static int __split_huge_page_map(struct page *page,
struct vm_area_struct *vma,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t *pmd, _pmd;
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
if (pmd_write(*pmd))
BUG_ON(page_mapcount(page) != 1);
haddr = address;
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
* Note that NUMA hinting access restrictions are not
* transferred to avoid any possibility of altering
* permissions across VMAs.
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
pte_unmap(pte);
}
smp_wmb(); /* make pte visible before pmd */
/*
* Up to this point the pmd is present and huge and
* userland has the whole access to the hugepage
* during the split (which happens in place). If we
* overwrite the pmd with the not-huge version
* pointing to the pte here (which of course we could
* if all CPUs were bug free), userland could trigger
* a small page size TLB miss on the small sized TLB
* while the hugepage TLB entry is still established
* in the huge TLB. Some CPU doesn't like that. See
* http://support.amd.com/us/Processor_TechDocs/41322.pdf,
* Erratum 383 on page 93. Intel should be safe but is
* also warns that it's only safe if the permission
* and cache attributes of the two entries loaded in
* the two TLB is identical (which should be the case
* here). But it is generally safer to never allow
* small and huge TLB entries for the same virtual
* address to be loaded simultaneously. So instead of
* doing "pmd_populate(); flush_tlb_range();" we first
* mark the current pmd notpresent (atomically because
* here the pmd_trans_huge and pmd_trans_splitting
* must remain set at all times on the pmd until the
* split is complete for this pmd), then we flush the
* SMP TLB and finally we write the non-huge version
* of the pmd entry with pmd_populate.
*/
pmd_populate(mm, pmd, pgtable);
ret = 1;
spin_unlock(ptl);
/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma,
struct list_head *list)
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
BUG_ON(!PageHead(page));
BUG_ON(PageTail(page));
mapcount = 0;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
mapcount += __split_huge_page_splitting(page, vma, addr);
}
/*
* It is critical that new vmas are added to the tail of the
* anon_vma list. This guarantes that if copy_huge_pmd() runs
* and establishes a child pmd before
* __split_huge_page_splitting() freezes the parent pmd (so if
* we fail to prevent copy_huge_pmd() from running until the
* whole __split_huge_page() is complete), we will still see
* the newly established pmd of the child later during the
* walk, to be able to set it as pmd_trans_splitting too.
*/
if (mapcount != page_mapcount(page)) {
pr_err("mapcount %d page_mapcount %d\n",
mapcount, page_mapcount(page));
BUG();
}
__split_huge_page_refcount(page, list);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
mapcount2 += __split_huge_page_map(page, vma, addr);
}
if (mapcount != mapcount2) {
pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
mapcount, mapcount2, page_mapcount(page));
BUG();
}
/*
* Split a hugepage into normal pages. This doesn't change the position of head
* page. If @list is null, tail pages will be added to LRU list, otherwise, to
* @list. Both head page and tail pages will inherit mapping, flags, and so on
* from the hugepage.
* Return 0 if the hugepage is split successfully otherwise return 1.
*/
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct anon_vma *anon_vma;
int ret = 1;
BUG_ON(is_huge_zero_page(page));
/*
* The caller does not necessarily hold an mmap_sem that would prevent
* the anon_vma disappearing so we first we take a reference to it
* and then lock the anon_vma for write. This is similar to
* page_lock_anon_vma_read except the write lock is taken to serialise
* against parallel split or collapse operations.
*/
anon_vma = page_get_anon_vma(page);
anon_vma_lock_write(anon_vma);
ret = 0;
if (!PageCompound(page))
goto out_unlock;
BUG_ON(!PageSwapBacked(page));
__split_huge_page(page, anon_vma, list);
BUG_ON(PageCompound(page));
out_unlock:
anon_vma_unlock_write(anon_vma);
put_anon_vma(anon_vma);
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
/*
* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
* can't handle this properly after s390_enable_sie, so we simply
* ignore the madvise to prevent qemu from causing a SIGSEGV.
*/
if (mm_has_pgste(vma->vm_mm))
return 0;
#endif
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
* If the vma become good for khugepaged to scan,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
return -ENOMEM;
break;
case MADV_NOHUGEPAGE:
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
/*
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
* this vma even if we leave the mm registered in khugepaged if
* it got registered before VM_NOHUGEPAGE was set.
*/
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
static int __init khugepaged_slab_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
sizeof(struct mm_slot),
__alignof__(struct mm_slot), 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
return 0;
}
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
return NULL;
return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}
static inline void free_mm_slot(struct mm_slot *mm_slot)
{
kmem_cache_free(mm_slot_cache, mm_slot);
}
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,