Newer
Older
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
* the dirty bit in the pud is meaningless. And if the dirty
* bit will become meaningful and we'll only set it with
* FOLL_WRITE, an atomic set_bit will be required on the pud to
* set the young bit, instead of the current set_pud_at.
*/
_pud = pud_mkyoung(pud_mkdirty(*pud));
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
pud, _pud, 1))
update_mmu_cache_pud(vma, addr, pud);
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, int flags)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
if (flags & FOLL_WRITE && !pud_write(*pud))
return NULL;
if (pud_present(*pud) && pud_devmap(*pud))
/* pass */;
else
return NULL;
if (flags & FOLL_TOUCH)
touch_pud(vma, addr, pud);
/*
* device mapped pages can only be returned if the
* caller will manage the page reference count.
*/
if (!(flags & FOLL_GET))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
pgmap = get_dev_pagemap(pfn, NULL);
if (!pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
put_dev_pagemap(pgmap);
return page;
}
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
struct vm_area_struct *vma)
{
spinlock_t *dst_ptl, *src_ptl;
pud_t pud;
int ret;
dst_ptl = pud_lock(dst_mm, dst_pud);
src_ptl = pud_lockptr(src_mm, src_pud);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pud = *src_pud;
if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
goto out_unlock;
/*
* When page table lock is held, the huge zero pud should not be
* under splitting since we don't split the page itself, only pud to
* a page table.
*/
if (is_huge_zero_pud(pud)) {
/* No huge zero pud yet */
}
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
ret = 0;
out_unlock:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
return ret;
}
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
pud_t entry;
unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
if (unlikely(!pud_same(*vmf->pud, orig_pud)))
goto unlock;
entry = pud_mkyoung(orig_pud);
if (write)
entry = pud_mkdirty(entry);
haddr = vmf->address & HPAGE_PUD_MASK;
if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
unlock:
spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
{
pmd_t entry;
unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
if (write)
entry = pmd_mkdirty(entry);
haddr = vmf->address & HPAGE_PMD_MASK;
if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
unlock:
static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
struct vm_area_struct *vma = vmf->vma;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
if (unlikely(!pages)) {
ret |= VM_FAULT_OOM;
goto out;
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
mem_cgroup_try_charge(pages[i], vma->vm_mm,
GFP_KERNEL, &memcg, false))) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
mem_cgroup_cancel_charge(pages[i], memcg,
false);
kfree(pages);
ret |= VM_FAULT_OOM;
goto out;
}
set_page_private(pages[i], (unsigned long)memcg);
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
__SetPageUptodate(pages[i]);
cond_resched();
}
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
vmf->pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*vmf->pte));
set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
pte_unmap(vmf->pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
page_remove_rmap(page, true);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
out:
return ret;
out_free_pages:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
mem_cgroup_cancel_charge(pages[i], memcg, false);
kfree(pages);
goto out;
}
int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
unlock_page(page);
put_page(page);
goto out_unlock;
}
put_page(page);
}
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
unlock_page(page);
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
Mel Gorman
committed
huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
if (likely(new_page)) {
prep_transhuge_page(new_page);
} else {
if (!page) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
split_huge_pmd(vma, vmf->pmd, vmf->address);
ret |= VM_FAULT_FALLBACK;
}
count_vm_event(THP_FAULT_FALLBACK);
if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
huge_gfp, &memcg, true))) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_ALLOC);
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_mn;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
if (!page) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page, true);
put_page(page);
}
out_mn:
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
out_unlock:
return ret;
/*
* FOLL_FORCE can write to even unwritable pmd's, but only
* after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
return pmd_write(pmd) ||
((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd);
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
* leaking mlocked pages into non-VM_LOCKED VMAs.
*
* For anon THP:
*
* In most cases the pmd is the only mapping of the page as we
* break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
* writable private mappings in populate_vma_page_range().
*
* The only scenario when we have the page shared here is if we
* mlocking read-only mapping shared over fork(). We skip
* mlocking such pages.
*
* For file THP:
*
* We can expect PageDoubleMap() to be stable under page lock:
* for file pages we set it in page_add_file_rmap(), which
* requires page to be locked.
if (PageAnon(page) && compound_mapcount(page) != 1)
goto skip_mlock;
if (PageDoubleMap(page) || !page->mapping)
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
out:
return page;
}
/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
bool was_writable;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(pmd, *vmf->pmd)))
goto out_unlock;
/*
* If there are potential migrations, wait for completion and retry
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
page = pmd_page(*vmf->pmd);
if (!get_page_unless_zero(page))
goto out_unlock;
wait_on_page_locked(page);
goto out;
}
page = pmd_page(pmd);
BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
flags |= TNF_FAULT_LOCAL;
}
Mel Gorman
committed
/* See similar comment in do_numa_page for explanation */
Aneesh Kumar K.V
committed
if (!pmd_savedwrite(pmd))
/*
* Acquire the page lock to serialise THP migrations but avoid dropping
* page_table_lock if at all possible
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
page_nid = -1;
if (!get_page_unless_zero(page))
goto out_unlock;
wait_on_page_locked(page);
/*
* Page is misplaced. Page lock serialises migrations. Acquire anon_vma
* to serialises splits
*/
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
page_nid = -1;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
page_nid = -1;
goto clear_pmdnuma;
}
/*
* Since we took the NUMA fault, we must have observed the !accessible
* bit. Make sure all other CPUs agree with that, to avoid them
* modifying the page we're about to migrate.
*
* Must be done under PTL such that we'll observe the relevant
Peter Zijlstra
committed
* inc_tlb_flush_pending().
*
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
if (mm_tlb_flush_pending(vma->vm_mm))
Peter Zijlstra
committed
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
/*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
vmf->pmd, pmd, vmf->address, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
} else
flags |= TNF_MIGRATE_FAIL;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
Aneesh Kumar K.V
committed
was_writable = pmd_savedwrite(pmd);
pmd = pmd_modify(pmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
return 0;
}
/*
* Return true if we do MADV_FREE successfully on entire pmd page.
* Otherwise, return false.
*/
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next)
{
spinlock_t *ptl;
pmd_t orig_pmd;
struct page *page;
struct mm_struct *mm = tlb->mm;
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
goto out_unlocked;
orig_pmd = *pmd;
if (is_huge_zero_pmd(orig_pmd))
goto out;
if (unlikely(!pmd_present(orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(orig_pmd));
goto out;
}
page = pmd_page(orig_pmd);
/*
* If other processes are mapping this page, we couldn't discard
* the page unless they all do MADV_FREE so let's skip the page.
*/
if (page_mapcount(page) != 1)
goto out;
if (!trylock_page(page))
goto out;
/*
* If user want to discard part-pages of THP, split it so MADV_FREE
* will deactivate only them.
*/
if (next - addr != HPAGE_PMD_SIZE) {
get_page(page);
spin_unlock(ptl);
split_huge_page(page);
unlock_page(page);
goto out_unlocked;
}
if (PageDirty(page))
ClearPageDirty(page);
unlock_page(page);
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
out:
spin_unlock(ptl);
out_unlocked:
return ret;
}
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pte_free(mm, pgtable);
atomic_long_dec(&mm->nr_ptes);
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
* when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_dax(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
struct page *page = NULL;
int flush_needed = 1;
if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
swp_entry_t entry;
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
page = pfn_to_page(swp_offset(entry));
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
if (flush_needed)
tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
spinlock_t *old_pmd_ptl,
struct vm_area_struct *vma)
{
/*
* With split pmd lock we also need to move preallocated
* PTE page table if new_pmd is on different PMD page table.
*
* We also don't deposit and withdraw tables for file pages.
*/
return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
if (unlikely(is_pmd_migration_entry(pmd)))
pmd = pmd_swp_mksoft_dirty(pmd);
else if (pmd_present(pmd))
pmd = pmd_mksoft_dirty(pmd);
#endif
return pmd;
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE)
return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
return false;
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
if (old_ptl) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
if (pmd_present(pmd) && pmd_dirty(pmd))
force_flush = true;
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
pmd = move_soft_dirty_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
if (force_flush)
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
else
*need_flush = true;
spin_unlock(old_ptl);
return true;
return false;
/*
* Returns
* - 0 if PMD could not be locked
* - 1 if PMD was locked but protections unchange and TLB flush unnecessary
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t entry;
bool preserve_write;
int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_write_migration_entry(entry)) {
pmd_t newpmd;
/*
* A protection check is difficult so
* just be safe and disable write
*/
make_migration_entry_read(&entry);
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
}
goto unlock;
}
#endif
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
* local/remote hits to the zero page are not interesting.
*/
if (prot_numa && is_huge_zero_pmd(*pmd))
goto unlock;
if (prot_numa && pmd_protnone(*pmd))
goto unlock;
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
/*
* In case prot_numa, we are under down_read(mmap_sem). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
* which is also under down_read(mmap_sem):
*
* CPU0: CPU1:
* change_huge_pmd(prot_numa=1)
* pmdp_huge_get_and_clear_notify()
* madvise_dontneed()
* zap_pmd_range()
* pmd_trans_huge(*pmd) == 0 (without ptl)
* // skip the pmd
* set_pmd_at();
* // pmd is re-established
*
* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
* which may break userspace.
*
* pmdp_invalidate() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
entry = *pmd;
pmdp_invalidate(vma, addr, pmd);
/*
* Recover dirty/young flags. It relies on pmdp_invalidate to not
* corrupt them.
*/
if (pmd_dirty(*pmd))
entry = pmd_mkdirty(entry);
if (pmd_young(*pmd))
entry = pmd_mkyoung(entry);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
spin_unlock(ptl);
return ret;
}
/*
* Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
* Note that if it returns page table lock pointer, this routine returns without
* unlocking page table lock. So callers must unlock it.
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
spinlock_t *ptl;
ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
pmd_devmap(*pmd)))
return ptl;
spin_unlock(ptl);
return NULL;
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
/*
* Returns true if a given pud maps a thp, false otherwise.
*
* Note that if it returns true, this routine returns without unlocking page
* table lock. So callers must unlock it.
*/
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
spinlock_t *ptl;
ptl = pud_lock(vma->vm_mm, pud);
if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
return ptl;
spin_unlock(ptl);
return NULL;
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
pud_t orig_pud;
spinlock_t *ptl;
ptl = __pud_trans_huge_lock(pud, vma);
if (!ptl)
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
* when calling pudp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pudp related
* operations.
*/
orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_dax(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
/* No support for anonymous PUD pages yet */
BUG();
}
return 1;
}
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
unsigned long haddr)
{
VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
count_vm_event(THP_SPLIT_PUD);
pudp_huge_clear_flush_notify(vma, haddr, pud);
}