page_dup_rmap(src_page);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
atomic_long_inc(&dst_mm->nr_ptes);
ret = 0;
out_unlock:
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
out:
return ret;
}
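/*
* Access fault on an already-present huge pmd: recheck the pmd under its
* lock and mark it young (and dirty for a write access).
*/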
void huge_pmd_set_accessed(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{
spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
entry = pmd_mkyoung(orig_pmd);
haddr = address & HPAGE_PMD_MASK;
if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
update_mmu_cache_pmd(vma, address, pmd);
unlock:
spin_unlock(ptl);
}
/*
* Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
* during copy_user_huge_page()'s copy_page_rep(): in the case when
* the source page gets split and a tail freed before copy completes.
* Called under pmd_lock of checked pmd, so safe from splitting itself.
*/
static void get_user_huge_page(struct page *page)
{
if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
struct page *endpage = page + HPAGE_PMD_NR;
atomic_add(HPAGE_PMD_NR, &page->_count);
while (++page < endpage)
get_huge_page_tail(page);
} else {
get_page(page);
}
}
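/*
* Drop the references taken by get_user_huge_page(): one per subpage under
* CONFIG_DEBUG_PAGEALLOC, otherwise a single reference on the compound page.
*/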
static void put_user_huge_page(struct page *page)
{
if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
struct page *endpage = page + HPAGE_PMD_NR;
while (page < endpage)
put_page(page++);
} else {
put_page(page);
}
}
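/*
* COW fallback when a huge page cannot be allocated: copy the THP into
* HPAGE_PMD_NR order-0 pages, then replace the huge pmd with a page table
* mapping the new small pages.
*/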
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmd, pmd_t orig_pmd,
struct page *page,
unsigned long haddr)
{
spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
struct mem_cgroup *memcg;
int ret = 0, i;
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
if (unlikely(!pages)) {
ret |= VM_FAULT_OOM;
goto out;
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
&memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
mem_cgroup_cancel_charge(pages[i], memcg,
false);
put_page(pages[i]);
}
kfree(pages);
ret |= VM_FAULT_OOM;
goto out;
}
set_page_private(pages[i], (unsigned long)memcg);
}
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
haddr + PAGE_SIZE * i, vma);
__SetPageUptodate(pages[i]);
cond_resched();
}
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
entry = mk_pte(pages[i], vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
page_add_new_anon_rmap(pages[i], vma, haddr, false);
mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
pte_unmap(pte);
}
kfree(pages);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
out:
return ret;
out_free_pages:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
goto out;
}
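/*
* Write-protection fault on a huge pmd. Reuse the page in place when this
* mapping holds the only reference; otherwise allocate and charge a fresh
* huge page, copy (or clear, for the huge zero page) into it and install it.
*/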
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or its
* part. We can do it by checking page_mapcount() on each sub-page, but
* it's expensive.
* The cheaper way is to check page_count() to be equal 1: every
* mapcount takes a page reference, so this way we can
* guarantee that the PMD is the only mapping.
* This can give false negative if somebody pinned the page, but that's
* fine.
*/
if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
update_mmu_cache_pmd(vma, address, pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
get_user_huge_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
if (unlikely(!new_page)) {
if (!page) {
split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
put_user_huge_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
true))) {
put_page(new_page);
if (page) {
split_huge_pmd(vma, pmd, address);
put_user_huge_page(page);
} else
split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
count_vm_event(THP_FAULT_ALLOC);
if (!page)
clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
else
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
mmun_start = haddr;
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(ptl);
if (page)
put_user_huge_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
if (!page) {
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
}
spin_unlock(ptl);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
spin_unlock(ptl);
return ret;
}
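/*
* follow_page() helper for a huge pmd: the caller must already hold the pmd
* lock; returns the subpage for @addr, honouring FOLL_WRITE, FOLL_DUMP,
* FOLL_NUMA, FOLL_TOUCH and FOLL_MLOCK.
*/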
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
* We should set the dirty bit only for FOLL_WRITE but
* for now the dirty bit in the pmd is meaningless.
* And if the dirty bit will become meaningful and
* we'll only set it with FOLL_WRITE, an atomic
* set_bit will be required on the pmd to set the
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
unlock_page(page);
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
out:
return page;
}
/* NUMA hinting page fault entry point for trans huge pmds */
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
spinlock_t *ptl;
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
bool was_writable;
int flags = 0;
/* A PROT_NONE fault should not end up here */
BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
/*
* If there are potential migrations, wait for completion and retry
* without disrupting NUMA hinting information. Do not relock and
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
page = pmd_page(*pmdp);
spin_unlock(ptl);
wait_on_page_locked(page);
goto out;
}
page = pmd_page(pmd);
BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
flags |= TNF_FAULT_LOCAL;
}
/* See similar comment in do_numa_page for explanation */
if (!(vma->vm_flags & VM_WRITE))
flags |= TNF_NO_GROUP;
/*
* Acquire the page lock to serialise THP migrations but avoid dropping
* page_table_lock if at all possible
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
if (target_nid == -1) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
}
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
spin_unlock(ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
}
/*
* Page is misplaced. Page lock serialises migrations. Acquire anon_vma
* to serialise splits.
*/
get_page(page);
spin_unlock(ptl);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
spin_lock(ptl);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
page_nid = -1;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
page_nid = -1;
goto clear_pmdnuma;
}
/*
* Migrate the THP to the requested node, returns with page unlocked
* and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
} else
flags |= TNF_MIGRATE_FAIL;
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
was_writable = pmd_write(pmd);
pmd = pmd_modify(pmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (was_writable)
pmd = pmd_mkwrite(pmd);
set_pmd_at(mm, haddr, pmdp, pmd);
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
spin_unlock(ptl);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
return 0;
}
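/*
* Unmap a huge pmd during zap/unmap: clear the pmd under its lock, drop the
* deposited page table, adjust counters and free the page (DAX mappings and
* the huge zero page only need the pmd cleared).
*/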
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
* when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
put_huge_zero_page();
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, page);
}
return 1;
}
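/*
* mremap() support: move a huge pmd from old_addr to new_addr without
* splitting it, taking both pmd locks and moving the deposited page table
* along when the architecture requires it.
*/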
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
goto out;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
goto out;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
if (ret == 1) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
}
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
out:
return ret;
}
/*
* Returns
* - 0 if PMD could not be locked
* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
int ret = 0;
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
* local/remote hits to the zero page are not interesting.
*/
if (prot_numa && is_huge_zero_pmd(*pmd)) {
spin_unlock(ptl);
return ret;
}
if (!prot_numa || !pmd_protnone(*pmd)) {
entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(!preserve_write && pmd_write(entry));
}
spin_unlock(ptl);
}
return ret;
}
/*
* Returns 1 if a given pmd maps a stable (not under splitting) thp.
* Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
*
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
spinlock_t **ptl)
{
*ptl = pmd_lock(vma->vm_mm, pmd);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
spin_unlock(*ptl);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
}
/* Thp mapped by 'pmd' is stable, so we can
* handle it as it is. */
return 1;
}
spin_unlock(*ptl);
return 0;
}
/*
* This function returns whether a given @page is mapped onto the @address
* in the virtual space of @mm.
*
* When it's true, this function returns *pmd with the page table lock held
* and passes it back to the caller via @ptl.
* If it's false, returns NULL without holding the page table lock.
*/
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
enum page_check_address_pmd_flag flag,
spinlock_t **ptl)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (address & ~HPAGE_PMD_MASK)
return NULL;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
pud = pud_offset(pgd, address);
if (!pud_present(*pud))
return NULL;
pmd = pmd_offset(pud, address);
*ptl = pmd_lock(mm, pmd);
if (!pmd_present(*pmd))
goto unlock;
if (pmd_page(*pmd) != page)
goto unlock;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
* splitting bit set before __split_huge_page_refcount
* runs. Finding the same huge pmd more than once during the
* same rmap walk is not a problem.
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
goto unlock;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
return pmd;
}
unlock:
spin_unlock(*ptl);
return NULL;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
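/*
* madvise(MADV_HUGEPAGE/MADV_NOHUGEPAGE) handler: flip VM_HUGEPAGE and
* VM_NOHUGEPAGE on the vma and register it with khugepaged when it becomes
* eligible for scanning.
*/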
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
#ifdef CONFIG_S390
/*
* qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
* can't handle this properly after s390_enable_sie, so we simply
* ignore the madvise to prevent qemu from causing a SIGSEGV.
*/
if (mm_has_pgste(vma->vm_mm))
return 0;
#endif
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & VM_NO_THP)
return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
* If the vma becomes good for khugepaged to scan,
* register it here without waiting for a page fault that
* may not happen any time soon.
*/
if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
return -ENOMEM;
break;
case MADV_NOHUGEPAGE:
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & VM_NO_THP)
return -EINVAL;
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
/*
* Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
* this vma even if we leave the mm registered in khugepaged if
* it got registered before VM_NOHUGEPAGE was set.
*/
break;
}
return 0;
}
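/* Slab cache for the mm_slot structures khugepaged uses to track mms. */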
static int __init khugepaged_slab_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
sizeof(struct mm_slot),
__alignof__(struct mm_slot), 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
return 0;
}
static void __init khugepaged_slab_exit(void)
{
kmem_cache_destroy(mm_slot_cache);
}
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
return NULL;
return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}
static inline void free_mm_slot(struct mm_slot *mm_slot)
{
kmem_cache_free(mm_slot_cache, mm_slot);
}
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
if (mm == mm_slot->mm)
return mm_slot;
return NULL;
}
static void insert_to_mm_slots_hash(struct mm_struct *mm,
struct mm_slot *mm_slot)
{
mm_slot->mm = mm;
hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
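/*
* Register an mm with khugepaged: allocate an mm_slot, hash it, queue it
* behind the scan cursor and wake the daemon if the list was empty.
*/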
int __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int wakeup;
mm_slot = alloc_mm_slot();
if (!mm_slot)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
}
spin_lock(&khugepaged_mm_lock);
insert_to_mm_slots_hash(mm, mm_slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
atomic_inc(&mm->mm_count);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
return 0;
}
int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
unsigned long vm_flags)
{
unsigned long hstart, hend;
if (!vma->anon_vma)
/*
* Not yet faulted in so we will register later in the
* page fault if needed.
*/
return 0;
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
return khugepaged_enter(vma, vm_flags);
return 0;
}
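/*
* Unregister an mm on exit. If khugepaged is currently scanning this mm,
* synchronise with it via mmap_sem instead of freeing the slot here.
*/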
void __khugepaged_exit(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
mm_slot = get_mm_slot(mm);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
hash_del(&mm_slot->hash);
list_del(&mm_slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
free_mm_slot(mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
* khugepaged_test_exit() (which is guaranteed to run
* under mmap sem read mode). Stop here (after we
* return all pagetables will be destroyed) until
* khugepaged has finished working on the pagetables
* under the mmap_sem.
*/
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
}
}
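/* Undo page isolation for one page: unaccount it, unlock it and put it back on the LRU. */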
static void release_pte_page(struct page *page)
{
/* 0 stands for page_is_file_cache(page) == false */
dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
unlock_page(page);
putback_lru_page(page);
}
static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
release_pte_page(pte_page(pteval));
}
}
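/*
* Lock and isolate the HPAGE_PMD_NR small pages behind @pte so they can be
* collapsed into a huge page: each present pte must point to an unshared,
* swap-backed anonymous page that can be pulled off the LRU.
*/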
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
struct page *page = NULL;
pte_t *_pte;
int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
++none_or_zero <= khugepaged_max_ptes_none) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
goto out;
}
}
if (!pte_present(pteval)) {
result = SCAN_PTE_NON_PRESENT;
goto out;
}
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page)) {
result = SCAN_PAGE_NULL;
goto out;
}
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
if (!trylock_page(page)) {
result = SCAN_PAGE_LOCK;
goto out;
}
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
goto out;
}
if (pte_write(pteval)) {
writable = true;
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
goto out;
}
/*
* Page is not in the swap cache. It can be collapsed
* into a THP.
*/
}
/*
* Isolate the page to avoid collapsing a hugepage
* currently in use by the VM.
*/
if (isolate_lru_page(page)) {
unlock_page(page);
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);