hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
}
/*
 * Tell whether @mm has run out of users.
 *
 * Returns nonzero when mm->mm_users reads as zero, 0 otherwise.
 */
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
	return !atomic_read(&mm->mm_users);
}
/*
 * Register @mm with khugepaged.
 *
 * A fresh mm_slot is allocated, MMF_VM_HUGEPAGE is set on the mm, and the
 * slot is hashed and appended to khugepaged_scan.mm_head under
 * khugepaged_mm_lock. A reference on mm->mm_count is then taken, and
 * khugepaged_wait is woken only when the list was empty before the insert.
 *
 * Returns 0 on success, and also when MMF_VM_HUGEPAGE was already set (the
 * new slot is freed again in that case); -ENOMEM when no slot could be
 * allocated.
 */
int __khugepaged_enter(struct mm_struct *mm)
{
	struct mm_slot *slot = alloc_mm_slot();
	int list_was_empty;

	if (!slot)
		return -ENOMEM;

	/* __khugepaged_exit() must not run from under us */
	VM_BUG_ON(khugepaged_test_exit(mm));

	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
		/* The flag was already set: drop the slot we just made. */
		free_mm_slot(slot);
		return 0;
	}

	spin_lock(&khugepaged_mm_lock);
	insert_to_mm_slots_hash(mm, slot);
	list_was_empty = list_empty(&khugepaged_scan.mm_head);
	/*
	 * Queue at the tail, i.e. just behind the scanning cursor, to let
	 * the area settle down a little.
	 */
	list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
	spin_unlock(&khugepaged_mm_lock);

	atomic_inc(&mm->mm_count);
	if (list_was_empty)
		wake_up_interruptible(&khugepaged_wait);

	return 0;
}
/*
 * Register @vma with khugepaged if it can hold at least one aligned huge
 * page.
 *
 * Returns 0 without registering when the vma has no anon_vma, has vm_ops,
 * or does not span a full HPAGE_PMD-aligned range; otherwise returns the
 * result of khugepaged_enter().
 */
int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
{
	unsigned long first_hpage, last_hpage;

	/*
	 * No anon_vma: not yet faulted in, so we will register later in
	 * the page fault if needed.
	 * vm_ops set: khugepaged not yet working on file or special
	 * mappings.
	 */
	if (!vma->anon_vma || vma->vm_ops)
		return 0;

	VM_BUG_ON(vma->vm_flags & VM_NO_THP);

	/* Round the start up and the end down to huge page boundaries. */
	first_hpage = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	last_hpage = vma->vm_end & HPAGE_PMD_MASK;

	return (first_hpage < last_hpage) ? khugepaged_enter(vma) : 0;
}
/*
 * Detach @mm from khugepaged.
 *
 * If the mm has a slot and khugepaged is not currently scanning it, the
 * slot is unhashed and unlinked under khugepaged_mm_lock and then freed,
 * together with the MMF_VM_HUGEPAGE flag and the mm reference dropped via
 * mmdrop(). If khugepaged is scanning this very slot, the slot is left in
 * place and we only wait for mmap_sem.
 */
void __khugepaged_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int free = 0;

	spin_lock(&khugepaged_mm_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
		/* Same teardown pair as collect_mm_slot(). */
		hash_del(&mm_slot->hash);
		list_del(&mm_slot->mm_node);
		free = 1;
	}
	spin_unlock(&khugepaged_mm_lock);

	if (free) {
		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
		free_mm_slot(mm_slot);
		mmdrop(mm);
	} else if (mm_slot) {
		/*
		 * This is required to serialize against
		 * khugepaged_test_exit() (which is guaranteed to run
		 * under mmap sem read mode). Stop here (after we
		 * return all pagetables will be destroyed) until
		 * khugepaged has finished working on the pagetables
		 * under the mmap_sem.
		 */
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}
/*
 * Undo the isolation of one page: decrement the NR_ISOLATED_ANON zone
 * counter, unlock the page and put it back on the LRU.
 */
static void release_pte_page(struct page *page)
{
	/* 0 stands for page_is_file_cache(page) == false */
	const int is_file_cache = 0;

	dec_zone_page_state(page, NR_ISOLATED_ANON + is_file_cache);
	unlock_page(page);
	putback_lru_page(page);
}
/*
 * Release every page mapped by the ptes in [@pte, @_pte), walking from the
 * entry just below @_pte down to @pte. Empty ptes are skipped.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
	pte_t *cur;

	for (cur = _pte - 1; cur >= pte; cur--) {
		pte_t entry = *cur;

		if (pte_none(entry))
			continue;
		release_pte_page(pte_page(entry));
	}
}
/*
 * Lock and isolate from the LRU every page mapped by the HPAGE_PMD_NR ptes
 * starting at @pte (virtual address @address in @vma).
 *
 * Up to khugepaged_max_ptes_none empty ptes are tolerated. Every populated
 * pte must be present and writable, and its page must have no extra
 * reference.
 *
 * Returns 1 when all pages were isolated and at least one pte was found
 * young/referenced. Returns 0 otherwise, after releasing the pages that had
 * already been isolated.
 */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
					unsigned long address,
					pte_t *pte)
{
	struct page *page;
	pte_t *_pte;
	int referenced = 0, none = 0;

	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
	     _pte++, address += PAGE_SIZE) {
		pte_t pteval = *_pte;

		if (pte_none(pteval)) {
			if (++none <= khugepaged_max_ptes_none)
				continue;
			else
				goto out;
		}
		if (!pte_present(pteval) || !pte_write(pteval))
			goto out;
		page = vm_normal_page(vma, address, pteval);
		if (unlikely(!page))
			goto out;

		VM_BUG_ON(PageCompound(page));
		BUG_ON(!PageAnon(page));
		VM_BUG_ON(!PageSwapBacked(page));

		/* cannot use mapcount: can't collapse if there's a gup pin */
		if (page_count(page) != 1)
			goto out;
		/*
		 * We can do it before isolate_lru_page because the
		 * page can't be freed from under us. NOTE: PG_lock
		 * is needed to serialize against split_huge_page
		 * when invoked from the VM.
		 */
		if (!trylock_page(page))
			goto out;
		/*
		 * Isolate the page to avoid collapsing an hugepage
		 * currently in use by the VM.
		 */
		if (isolate_lru_page(page)) {
			unlock_page(page);
			goto out;
		}
		/* 0 stands for page_is_file_cache(page) == false */
		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
		VM_BUG_ON(!PageLocked(page));
		VM_BUG_ON(PageLRU(page));

		/* If there is no mapped pte young don't collapse the page */
		if (pte_young(pteval) || PageReferenced(page) ||
		    mmu_notifier_test_young(vma->vm_mm, address))
			referenced = 1;
	}
	if (likely(referenced))
		return 1;
out:
	/* _pte points at the first pte that was NOT isolated. */
	release_pte_pages(pte, _pte);
	return 0;
}
/*
 * Fill the subpages starting at @page from the HPAGE_PMD_NR ptes starting
 * at @pte.
 *
 * For an empty pte the destination subpage is cleared and MM_ANONPAGES is
 * bumped by one. For a populated pte the source page is copied, released
 * from isolation, unmapped (pte cleared and rmap removed under @ptl) and
 * freed.
 */
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
				      struct vm_area_struct *vma,
				      unsigned long address,
				      spinlock_t *ptl)
{
	pte_t *cur;

	for (cur = pte; cur < pte + HPAGE_PMD_NR;
	     cur++, page++, address += PAGE_SIZE) {
		pte_t entry = *cur;
		struct page *old_page;

		if (pte_none(entry)) {
			clear_user_highpage(page, address);
			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
			continue;
		}

		old_page = pte_page(entry);
		copy_user_highpage(page, old_page, address, vma);
		VM_BUG_ON(page_mapcount(old_page) != 1);
		release_pte_page(old_page);
		/*
		 * ptl mostly unnecessary, but preempt has to
		 * be disabled to update the per-cpu stats
		 * inside page_remove_rmap().
		 */
		spin_lock(ptl);
		/*
		 * paravirt calls inside pte_clear here are
		 * superfluous.
		 */
		pte_clear(vma->vm_mm, address, cur);
		page_remove_rmap(old_page);
		spin_unlock(ptl);
		free_page_and_swap_cache(old_page);
	}
}
static void khugepaged_alloc_sleep(void)
wait_event_freezable_timeout(khugepaged_wait, false,
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
/*
 * Per-node hit counters: zeroed and then incremented once per scanned page
 * by khugepaged_scan_pmd(), and read by the CONFIG_NUMA version of
 * khugepaged_find_target_node() to pick the node with the most hits.
 */
static int khugepaged_node_load[MAX_NUMNODES];
#ifdef CONFIG_NUMA
/*
 * Choose the node to allocate the huge page from, based on
 * khugepaged_node_load[].
 *
 * The first node with the highest hit count wins. If that node is not
 * beyond the one returned by the previous call, the first later node with
 * the same hit count is used instead, so that ties are spread out.
 * The choice is remembered in a function-local static for the next call.
 */
static int khugepaged_find_target_node(void)
{
	static int last_khugepaged_target_node = NUMA_NO_NODE;
	int node;
	int best_node = 0;
	int best_hits = 0;

	/* find first node with max normal pages hit */
	for (node = 0; node < MAX_NUMNODES; node++) {
		if (khugepaged_node_load[node] <= best_hits)
			continue;
		best_hits = khugepaged_node_load[node];
		best_node = node;
	}

	/* do some balance if several nodes have the same hit record */
	if (best_node <= last_khugepaged_target_node) {
		for (node = last_khugepaged_target_node + 1;
		     node < MAX_NUMNODES; node++) {
			if (khugepaged_node_load[node] == best_hits) {
				best_node = node;
				break;
			}
		}
	}

	last_khugepaged_target_node = best_node;
	return best_node;
}
/*
 * CONFIG_NUMA variant: nothing is preallocated here; this only resets
 * *hpage before the next scan round.
 *
 * - *hpage is an error pointer: if *wait is false return false; otherwise
 *   clear *wait and *hpage, sleep via khugepaged_alloc_sleep() and return
 *   true.
 * - *hpage is a real page: drop it with put_page() and clear *hpage.
 * - *hpage is NULL: nothing to do.
 */
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (!IS_ERR(*hpage)) {
		if (*hpage) {
			put_page(*hpage);
			*hpage = NULL;
		}
		return true;
	}

	if (!*wait)
		return false;

	*wait = false;
	*hpage = NULL;
	khugepaged_alloc_sleep();
	return true;
}
/*
 * CONFIG_NUMA variant: allocate the huge page on @node and release the
 * mmap_sem read lock of @mm.
 *
 * On success the page is stored in *hpage and returned. On failure *hpage
 * is set to ERR_PTR(-ENOMEM) and NULL is returned. mmap_sem is released in
 * both cases. @vma and @address are not used by this variant.
 */
static struct page
*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
		       struct vm_area_struct *vma, unsigned long address,
		       int node)
{
	/*
	 * Allocate the page while the vma is still valid and under
	 * the mmap_sem read mode so there is no memory allocation
	 * later when we take the mmap_sem in write mode. This is more
	 * friendly behavior (OTOH it may actually hide bugs) to
	 * filesystems in userland with daemons allocating memory in
	 * the userland I/O paths.  Allocating memory with the
	 * mmap_sem in read mode is good idea also to allow greater
	 * scalability.
	 */
	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
	/*
	 * After allocating the hugepage, release the mmap_sem read lock in
	 * preparation for taking it in write mode.
	 */
	up_read(&mm->mmap_sem);
	if (unlikely(!*hpage)) {
		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		*hpage = ERR_PTR(-ENOMEM);
		return NULL;
	}

	count_vm_event(THP_COLLAPSE_ALLOC);
	return *hpage;
}
#else
/* !CONFIG_NUMA variant: the target node is always node 0. */
static int khugepaged_find_target_node(void)
{
	const int only_node = 0;

	return only_node;
}
/*
 * Allocate one page of order HPAGE_PMD_ORDER using the gfp mask that
 * alloc_hugepage_gfpmask() builds from @defrag and no extra flags.
 * Returns whatever alloc_pages() returns.
 */
static inline struct page *alloc_hugepage(int defrag)
{
	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
}
/*
 * Try to allocate a huge page, retrying after a sleep.
 *
 * Each failed attempt is counted as THP_COLLAPSE_ALLOC_FAILED. After the
 * first failure *wait is cleared and we sleep once; a later failure with
 * *wait already false gives up. Retrying also stops once khugepaged is no
 * longer enabled.
 *
 * Returns the page (counted as THP_COLLAPSE_ALLOC) or NULL.
 */
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
	struct page *hpage;

	for (;;) {
		hpage = alloc_hugepage(khugepaged_defrag());
		if (hpage) {
			count_vm_event(THP_COLLAPSE_ALLOC);
			return hpage;
		}

		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
		if (!*wait)
			return NULL;

		*wait = false;
		khugepaged_alloc_sleep();

		if (unlikely(!khugepaged_enabled()))
			return hpage;
	}
}
/*
 * !CONFIG_NUMA variant: make sure *hpage holds a huge page, allocating one
 * through khugepaged_alloc_hugepage() when it is still NULL.
 *
 * Returns true when *hpage is non-NULL afterwards, false otherwise.
 */
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
	if (!*hpage)
		*hpage = khugepaged_alloc_hugepage(wait);

	return unlikely(!*hpage) ? false : true;
}
/*
 * !CONFIG_NUMA variant: the page was already preallocated, so this only
 * releases the mmap_sem read lock of @mm and hands back *hpage.
 * @vma, @address and @node are not used by this variant.
 *
 * NOTE(review): no #endif is visible after this function for the
 * "#ifdef CONFIG_NUMA ... #else" region opened above - confirm it was not
 * lost.
 */
static struct page
*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
		       struct vm_area_struct *vma, unsigned long address,
		       int node)
{
	struct page *prealloc;

	up_read(&mm->mmap_sem);

	prealloc = *hpage;
	VM_BUG_ON(!prealloc);
	return prealloc;
}
/*
 * Decide whether khugepaged may work on @vma.
 *
 * Returns false when huge pages are neither requested for the vma
 * (VM_HUGEPAGE) nor enabled globally (khugepaged_always()), when
 * VM_NOHUGEPAGE is set, when the vma has no anon_vma or has vm_ops, or
 * when it is a temporary stack. Returns true otherwise.
 */
static bool hugepage_vma_check(struct vm_area_struct *vma)
{
	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
		return false;
	if (vma->vm_flags & VM_NOHUGEPAGE)
		return false;

	if (!vma->anon_vma)
		return false;
	if (vma->vm_ops)
		return false;

	if (is_vma_temporary_stack(vma))
		return false;

	VM_BUG_ON(vma->vm_flags & VM_NO_THP);
	return true;
}
/*
 * Collapse the HPAGE_PMD_NR small pages mapped at @address in @mm into one
 * huge page.
 *
 * Entered with mmap_sem held for read. khugepaged_alloc_page() releases
 * that lock, and every path out of this function leaves mmap_sem released.
 * On success the huge page is mapped and *hpage is set to NULL. On failure
 * after the memcg charge, the charge is undone and *hpage is left alone so
 * the caller can dispose of the page.
 *
 * @address must be HPAGE_PMD aligned. @vma is only passed through to the
 * allocator; it is looked up again once mmap_sem is held for write.
 */
static void collapse_huge_page(struct mm_struct *mm,
				   unsigned long address,
				   struct page **hpage,
				   struct vm_area_struct *vma,
				   int node)
{
	pmd_t *pmd, _pmd;
	pte_t *pte;
	pgtable_t pgtable;
	struct page *new_page;
	spinlock_t *ptl;
	int isolated;
	unsigned long hstart, hend;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	/* release the mmap_sem read lock. */
	new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
	if (!new_page)
		return;

	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
		return;

	/*
	 * Prevent all access to pagetables with the exception of
	 * gup_fast later handled by the ptep_clear_flush and the VM
	 * handled by the anon_vma lock + PG_lock.
	 */
	down_write(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))
		goto out;

	/* mmap_sem was dropped in between: revalidate everything. */
	vma = find_vma(mm, address);
	if (!vma)
		goto out;
	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	hend = vma->vm_end & HPAGE_PMD_MASK;
	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
		goto out;
	if (!hugepage_vma_check(vma))
		goto out;
	pmd = mm_find_pmd(mm, address);
	if (!pmd)
		goto out;
	if (pmd_trans_huge(*pmd))
		goto out;

	anon_vma_lock_write(vma->anon_vma);

	pte = pte_offset_map(pmd, address);
	ptl = pte_lockptr(mm, pmd);

	mmun_start = address;
	mmun_end   = address + HPAGE_PMD_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	spin_lock(&mm->page_table_lock); /* probably unnecessary */
	/*
	 * After this gup_fast can't run anymore. This also removes
	 * any huge TLB entry from the CPU so we won't allow
	 * huge and small TLB entries for the same virtual address
	 * to avoid the risk of CPU bugs in that area.
	 */
	_pmd = pmdp_clear_flush(vma, address, pmd);
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	spin_lock(ptl);
	isolated = __collapse_huge_page_isolate(vma, address, pte);
	spin_unlock(ptl);

	if (unlikely(!isolated)) {
		pte_unmap(pte);
		spin_lock(&mm->page_table_lock);
		BUG_ON(!pmd_none(*pmd));
		/*
		 * We can only use set_pmd_at when establishing
		 * hugepmds and never for establishing regular pmds that
		 * points to regular pagetables. Use pmd_populate for that
		 */
		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
		spin_unlock(&mm->page_table_lock);
		anon_vma_unlock_write(vma->anon_vma);
		goto out;
	}

	/*
	 * All pages are isolated and locked so anon_vma rmap
	 * can't run anymore.
	 */
	anon_vma_unlock_write(vma->anon_vma);

	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
	pte_unmap(pte);
	__SetPageUptodate(new_page);
	pgtable = pmd_pgtable(_pmd);

	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

	/*
	 * spin_lock() below is not the equivalent of smp_wmb(), so
	 * this is needed to avoid the copy_huge_page writes to become
	 * visible after the set_pmd_at() write.
	 */
	smp_wmb();

	spin_lock(&mm->page_table_lock);
	BUG_ON(!pmd_none(*pmd));
	page_add_new_anon_rmap(new_page, vma, address);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, address, pmd, _pmd);
	update_mmu_cache_pmd(vma, address, pmd);
	spin_unlock(&mm->page_table_lock);

	/* The page now belongs to the mapping, not to the caller. */
	*hpage = NULL;

	khugepaged_pages_collapsed++;
out_up_write:
	up_write(&mm->mmap_sem);
	return;

out:
	mem_cgroup_uncharge_page(new_page);
	goto out_up_write;
}
static int khugepaged_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
struct page **hpage)
{
pmd_t *pmd;
pte_t *pte, *_pte;
int ret = 0, referenced = 0, none = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval)) {
if (++none <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
}
if (!pte_present(pteval) || !pte_write(pteval))
goto out_unmap;
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
* Khupaged will allocate hugepage from the node has the max
* hit record.
node = page_to_nid(page);
khugepaged_node_load[node]++;
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
/* cannot use mapcount: can't collapse if there's a gup pin */
if (page_count(page) != 1)
goto out_unmap;
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = 1;
}
if (referenced)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
node = khugepaged_find_target_node();
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
out:
return ret;
}
/*
 * Free @mm_slot if its mm has no users left; otherwise do nothing.
 *
 * The slot is removed from the hash and from the mm list, freed, and the
 * mm reference is dropped with mmdrop(). The VM_BUG_ON below expects
 * khugepaged_mm_lock to be held (on configurations where NR_CPUS != 1).
 */
static void collect_mm_slot(struct mm_slot *mm_slot)
{
	struct mm_struct *mm = mm_slot->mm;

	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));

	if (!khugepaged_test_exit(mm))
		return;

	/* free mm_slot */
	hash_del(&mm_slot->hash);
	list_del(&mm_slot->mm_node);

	/*
	 * Not strictly needed because the mm exited already.
	 *
	 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
	 */

	/* khugepaged_mm_lock actually not necessary for the below */
	free_mm_slot(mm_slot);
	mmdrop(mm);
}
/*
 * Scan up to @pages ptes of the mm that the khugepaged cursor points at.
 *
 * Called with khugepaged_mm_lock held; the lock is dropped while the mm is
 * scanned under mmap_sem (read) and retaken before returning. The cursor
 * (khugepaged_scan.mm_slot / .address) is advanced as the scan proceeds.
 * When the mm is exiting or all of its vmas have been scanned, the cursor
 * moves to the next slot (or to NULL at the end of the list, bumping
 * khugepaged_full_scans) and the old slot is offered to collect_mm_slot().
 *
 * Returns the amount of progress made, in the same unit as @pages.
 */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
					    struct page **hpage)
	__releases(&khugepaged_mm_lock)
	__acquires(&khugepaged_mm_lock)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int progress = 0;

	VM_BUG_ON(!pages);
	VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));

	if (khugepaged_scan.mm_slot)
		mm_slot = khugepaged_scan.mm_slot;
	else {
		/* No cursor yet: start from the head of the mm list. */
		mm_slot = list_entry(khugepaged_scan.mm_head.next,
				     struct mm_slot, mm_node);
		khugepaged_scan.address = 0;
		khugepaged_scan.mm_slot = mm_slot;
	}
	spin_unlock(&khugepaged_mm_lock);

	mm = mm_slot->mm;
	down_read(&mm->mmap_sem);
	if (unlikely(khugepaged_test_exit(mm)))
		vma = NULL;
	else
		vma = find_vma(mm, khugepaged_scan.address);

	progress++;
	for (; vma; vma = vma->vm_next) {
		unsigned long hstart, hend;

		cond_resched();
		if (unlikely(khugepaged_test_exit(mm))) {
			progress++;
			break;
		}
		if (!hugepage_vma_check(vma)) {
skip:
			progress++;
			continue;
		}
		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
		hend = vma->vm_end & HPAGE_PMD_MASK;
		if (hstart >= hend)
			goto skip;
		if (khugepaged_scan.address > hend)
			goto skip;
		if (khugepaged_scan.address < hstart)
			khugepaged_scan.address = hstart;
		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

		while (khugepaged_scan.address < hend) {
			int ret;

			cond_resched();
			if (unlikely(khugepaged_test_exit(mm)))
				goto breakouterloop;

			VM_BUG_ON(khugepaged_scan.address < hstart ||
				  khugepaged_scan.address + HPAGE_PMD_SIZE >
				  hend);
			ret = khugepaged_scan_pmd(mm, vma,
						  khugepaged_scan.address,
						  hpage);
			/* move to next address */
			khugepaged_scan.address += HPAGE_PMD_SIZE;
			progress += HPAGE_PMD_NR;
			if (ret)
				/* we released mmap_sem so break loop */
				goto breakouterloop_mmap_sem;
			if (progress >= pages)
				goto breakouterloop;
		}
	}
breakouterloop:
	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_sem:

	spin_lock(&khugepaged_mm_lock);
	VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
	/*
	 * Release the current mm_slot if this mm is about to die, or
	 * if we scanned all vmas of this mm.
	 */
	if (khugepaged_test_exit(mm) || !vma) {
		/*
		 * Make sure that if mm_users is reaching zero while
		 * khugepaged runs here, khugepaged_exit will find
		 * mm_slot not pointing to the exiting mm.
		 */
		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
			khugepaged_scan.mm_slot = list_entry(
				mm_slot->mm_node.next,
				struct mm_slot, mm_node);
			khugepaged_scan.address = 0;
		} else {
			khugepaged_scan.mm_slot = NULL;
			khugepaged_full_scans++;
		}

		collect_mm_slot(mm_slot);
	}

	return progress;
}
/*
 * Returns 1 when the list of registered mms is non-empty and khugepaged is
 * enabled, 0 otherwise.
 */
static int khugepaged_has_work(void)
{
	if (list_empty(&khugepaged_scan.mm_head))
		return 0;

	return khugepaged_enabled() ? 1 : 0;
}
/*
 * Wake-up condition used by khugepaged_wait_work(): true when an mm has
 * been registered or the thread has been asked to stop.
 */
static int khugepaged_wait_event(void)
{
	return !list_empty(&khugepaged_scan.mm_head) ||
		kthread_should_stop();
}
static void khugepaged_do_scan(void)
struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = khugepaged_pages_to_scan;
bool wait = true;
barrier(); /* write khugepaged_pages_to_scan to local stack */
while (progress < pages) {
if (!khugepaged_prealloc_page(&hpage, &wait))
break;
if (unlikely(kthread_should_stop() || freezing(current)))
break;
spin_lock(&khugepaged_mm_lock);
if (!khugepaged_scan.mm_slot)
pass_through_head++;
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
&hpage);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
}
if (!IS_ERR_OR_NULL(hpage))
put_page(hpage);
/*
 * Sleep between scan passes.
 *
 * With work pending, wait for khugepaged_scan_sleep_millisecs (no wait at
 * all when that is 0), ending early if the thread is asked to stop.
 * Without work, and only while khugepaged is enabled, wait until
 * khugepaged_wait_event() becomes true.
 */
static void khugepaged_wait_work(void)
{
	try_to_freeze();

	if (!khugepaged_has_work()) {
		if (khugepaged_enabled()) {
			wait_event_freezable(khugepaged_wait,
					     khugepaged_wait_event());
		}
		return;
	}

	if (khugepaged_scan_sleep_millisecs) {
		wait_event_freezable_timeout(khugepaged_wait,
			kthread_should_stop(),
			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
	}
}
/*
 * Main function of the khugepaged kernel thread.
 *
 * Alternates scan passes and waits until asked to stop, then clears the
 * scan cursor and offers the slot it pointed at to collect_mm_slot().
 * @none is unused. Always returns 0.
 */
static int khugepaged(void *none)
{
	struct mm_slot *mm_slot;

	/*
	 * Kernel threads are not freezable unless they opt in; the
	 * try_to_freeze()/wait_event_freezable*() calls made by this
	 * thread rely on that.
	 */
	set_freezable();

	while (!kthread_should_stop()) {
		khugepaged_do_scan();
		khugepaged_wait_work();
	}

	spin_lock(&khugepaged_mm_lock);
	mm_slot = khugepaged_scan.mm_slot;
	khugepaged_scan.mm_slot = NULL;
	if (mm_slot)
		collect_mm_slot(mm_slot);
	spin_unlock(&khugepaged_mm_lock);
	return 0;
}
/*
 * Replace the huge zero page mapping at @pmd by a regular page table.
 *
 * The pmd is cleared and flushed, the deposited page table is withdrawn and
 * filled with HPAGE_PMD_NR special ptes built from my_zero_pfn() for each
 * address from @haddr on, and only then is the page table installed in
 * @pmd. Finally the reference on the huge zero page is dropped.
 */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd)
{
	struct mm_struct *mm = vma->vm_mm;
	pgtable_t pgtable;
	pmd_t _pmd;
	int i;

	pmdp_clear_flush(vma, haddr, pmd);
	/* leave pmd empty until pte is filled */

	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
	/* Fill the table through a local pmd so it is not visible yet. */
	pmd_populate(mm, &_pmd, pgtable);

	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
		pte_t *pte, entry;

		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
		entry = pte_mkspecial(entry);
		pte = pte_offset_map(&_pmd, haddr);
		VM_BUG_ON(!pte_none(*pte));
		set_pte_at(mm, haddr, pte, entry);
		pte_unmap(pte);
	}
	smp_wmb(); /* make pte visible before pmd */
	pmd_populate(mm, pmd, pgtable);
	put_huge_zero_page();
}
/*
 * Split the huge pmd that maps @address in @vma, if there is one.
 *
 * Nothing is done when @pmd is not a huge pmd. A huge zero page pmd is
 * split in place by __split_huge_zero_page_pmd(). Otherwise a reference is
 * taken on the huge page, page_table_lock is dropped and the page is split
 * with split_huge_page(). The whole sequence is repeated for as long as
 * @pmd is still found to be huge afterwards.
 */
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd)
{
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long haddr = address & HPAGE_PMD_MASK;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);

	mmun_start = haddr;
	mmun_end   = haddr + HPAGE_PMD_SIZE;
again:
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(&mm->page_table_lock);
		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
		return;
	}
	if (is_huge_zero_pmd(*pmd)) {
		__split_huge_zero_page_pmd(vma, haddr, pmd);
		spin_unlock(&mm->page_table_lock);
		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
		return;
	}
	page = pmd_page(*pmd);
	VM_BUG_ON(!page_count(page));
	/* Pin the page so it stays around once the lock is dropped. */
	get_page(page);
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);

	split_huge_page(page);

	put_page(page);

	/*
	 * We don't always have down_write of mmap_sem here: a racing
	 * do_huge_pmd_wp_page() might have copied-on-write to another
	 * huge page before our split_huge_page() got the anon_vma lock.
	 */
	if (unlikely(pmd_trans_huge(*pmd)))
		goto again;
}
/*
 * Look up the vma for @address in @mm and split the huge pmd @pmd through
 * split_huge_page_pmd(). It is a BUG if find_vma() returns no vma.
 */
void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd)
{
	struct vm_area_struct *vma = find_vma(mm, address);

	BUG_ON(vma == NULL);
	split_huge_page_pmd(vma, address, pmd);
}
/*
 * Split the huge pmd covering @address in @mm, if any.
 *
 * @address must NOT be huge page aligned (see the VM_BUG_ON). Nothing is
 * done when no pmd is found for the address.
 */
static void split_huge_page_address(struct mm_struct *mm,
				    unsigned long address)
{
	pmd_t *pmd;

	VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));

	pmd = mm_find_pmd(mm, address);
	if (!pmd)
		return;
	/*
	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
	 * materialize from under us.
	 */
	split_huge_page_pmd_mm(mm, address, pmd);
}
/*
 * Split the huge pmd around @addr when @addr is not huge page aligned and
 * the aligned huge page range containing it lies entirely inside @vma,
 * i.e. when that range could previously have held a huge page.
 */
static void split_unaligned_boundary(struct vm_area_struct *vma,
				     unsigned long addr)
{
	unsigned long hpage_start = addr & HPAGE_PMD_MASK;

	if (!(addr & ~HPAGE_PMD_MASK))
		return;
	if (hpage_start < vma->vm_start)
		return;
	if (hpage_start + HPAGE_PMD_SIZE > vma->vm_end)
		return;

	split_huge_page_address(vma->vm_mm, addr);
}

/*
 * Split any huge pmd that would straddle a vma boundary which is about to
 * change.
 *
 * The new @start and @end of @vma are checked first. When @adjust_next is
 * positive, the new start of vma->vm_next (its current start plus
 * @adjust_next pages) is checked against that next vma as well.
 */
void __vma_adjust_trans_huge(struct vm_area_struct *vma,
			     unsigned long start,
			     unsigned long end,
			     long adjust_next)
{
	split_unaligned_boundary(vma, start);
	split_unaligned_boundary(vma, end);

	if (adjust_next > 0) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long new_next_start = next->vm_start;

		new_next_start += adjust_next << PAGE_SHIFT;
		split_unaligned_boundary(next, new_next_start);
	}
}