Newer
Older
* isolate_freepages_block() should not terminate
* prematurely unless contended, or isolated enough
*/
VM_BUG_ON(isolate_start_pfn < block_end_pfn);
}
}
/* split_free_page does not map the pages */
map_pages(freelist);
* Record where the free scanner will restart next time. Either we
* broke from the loop and set isolate_start_pfn based on the last
* call to isolate_freepages_block(), or we met the migration scanner
* and the loop terminated due to isolate_start_pfn < low_pfn
cc->free_pfn = isolate_start_pfn;
}
/*
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
static struct page *compaction_alloc(struct page *migratepage,
unsigned long data,
int **result)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
Vlastimil Babka
committed
/*
* Isolate free pages if necessary, and if we are not aborting due to
* contention.
*/
Vlastimil Babka
committed
if (!cc->contended)
Vlastimil Babka
committed
isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
}
freepage = list_entry(cc->freepages.next, struct page, lru);
list_del(&freepage->lru);
cc->nr_freepages--;
return freepage;
}
/*
 * Migrate-callback that "frees" a page by putting it back on the isolated
 * freelist of the compact_control passed in @data. All pages on the freelist
 * are from the same zone, so there is no special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *control = (struct compact_control *)data;
	struct list_head *freelist = &control->freepages;

	list_add(&page->lru, freelist);
	control->nr_freepages++;
}
/* Possible results of one isolate_migratepages() call */
typedef enum {
	ISOLATE_ABORT = 0,	/* Abort compaction now */
	ISOLATE_NONE = 1,	/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS = 2,	/* Pages isolated, migrate */
} isolate_migrate_t;
/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 *
 * When non-zero, isolate_migratepages() adds ISOLATE_UNEVICTABLE to the
 * isolation mode it passes down. Defaults to 1.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;
Vlastimil Babka
committed
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
unsigned long low_pfn;
Vlastimil Babka
committed
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
Vlastimil Babka
committed
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
Vlastimil Babka
committed
/*
* Start at where we last stopped, or beginning of the zone as
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
block_start_pfn = pageblock_start_pfn(low_pfn);
if (block_start_pfn < zone->zone_start_pfn)
block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);
Vlastimil Babka
committed
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
Vlastimil Babka
committed
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
* need to schedule, or even abort async compaction.
*/
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
Vlastimil Babka
committed
continue;
/* If isolation recently failed, do not retry */
if (!isolation_suitable(cc, page))
continue;
/*
* For async compaction, also only scan in MOVABLE blocks.
* Async compaction is optimistic to see if the minimum amount
* of work satisfies the allocation.
*/
if (cc->mode == MIGRATE_ASYNC &&
!migrate_async_suitable(get_pageblock_migratetype(page)))
continue;
/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
Vlastimil Babka
committed
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
Vlastimil Babka
committed
return ISOLATE_ABORT;
Vlastimil Babka
committed
/*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
*/
break;
}
acct_isolated(zone, cc);
/* Record where migration scanner will be restarted. */
cc->migrate_pfn = low_pfn;
Vlastimil Babka
committed
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
/*
 * Compaction triggered through /proc/sys/vm/compact_memory is identified
 * by the special order value -1.
 */
static inline bool is_via_compact_memory(int order)
{
	if (order != -1)
		return false;

	return true;
}
static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
unsigned int order;
unsigned long watermark;
Vlastimil Babka
committed
if (cc->contended || fatal_signal_pending(current))
return COMPACT_CONTENDED;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
reset_cached_positions(zone);
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
}
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
watermark = low_wmark_pages(zone);
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
cc->alloc_flags))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
!list_empty(&area->free_list[MIGRATE_CMA]))
return COMPACT_PARTIAL;
#endif
/*
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
return COMPACT_NO_SUITABLE_PAGE;
}
/*
 * Traced wrapper around __compact_finished(): emits the
 * mm_compaction_finished tracepoint with the raw result, then reports
 * COMPACT_NO_SUITABLE_PAGE to the caller as COMPACT_CONTINUE.
 */
static int compact_finished(struct zone *zone, struct compact_control *cc,
			    const int migratetype)
{
	int ret;

	ret = __compact_finished(zone, cc, migratetype);
	trace_mm_compaction_finished(zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}
/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 *
 * This inner helper may additionally return COMPACT_NOT_SUITABLE_ZONE when
 * the fragmentation index lies within [0, sysctl_extfrag_threshold].
 */
static unsigned long __compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	unsigned long wmark;
	int frag;

	if (is_via_compact_memory(order))
		return COMPACT_CONTINUE;

	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	wmark = low_wmark_pages(zone);
	if (zone_watermark_ok(zone, order, wmark, classzone_idx, alloc_flags))
		return COMPACT_PARTIAL;

	/*
	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
	 * This is because during migration, copies of pages need to be
	 * allocated and for a short time, the footprint is higher
	 */
	wmark += (2UL << order);
	if (!zone_watermark_ok(zone, 0, wmark, classzone_idx, alloc_flags))
		return COMPACT_SKIPPED;

	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation.
	 */
	frag = fragmentation_index(zone, order);
	if (frag < 0 || frag > sysctl_extfrag_threshold)
		return COMPACT_CONTINUE;

	return COMPACT_NOT_SUITABLE_ZONE;
}
/*
 * Traced wrapper around __compaction_suitable(): the raw verdict goes to the
 * mm_compaction_suitable tracepoint, and COMPACT_NOT_SUITABLE_ZONE is then
 * reported to the caller as COMPACT_SKIPPED.
 */
unsigned long compaction_suitable(struct zone *zone, int order,
					int alloc_flags, int classzone_idx)
{
	unsigned long verdict = __compaction_suitable(zone, order, alloc_flags,
						      classzone_idx);

	trace_mm_compaction_suitable(zone, order, verdict);

	if (verdict != COMPACT_NOT_SUITABLE_ZONE)
		return verdict;

	return COMPACT_SKIPPED;
}
/*
 * Run one compaction pass over @zone as described by @cc.
 *
 * Returns the compaction_suitable() verdict unchanged when it is
 * COMPACT_PARTIAL or COMPACT_SKIPPED. Otherwise isolates and migrates pages
 * until compact_finished() stops returning COMPACT_CONTINUE, and returns that
 * final status; an internal COMPACT_CONTENDED is reported to the caller as
 * COMPACT_PARTIAL. Any free pages still isolated on exit are released.
 */
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	switch (ret) {
	case COMPACT_PARTIAL:
	case COMPACT_SKIPPED:
		/* Compaction is likely to fail */
		return ret;
	case COMPACT_CONTINUE:
		/* Fall through to compaction */
		;
	}

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(zone, cc->order))
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start but check that it
	 * is initialised by ensuring the values are within zone boundaries.
	 */
	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
	cc->free_pfn = zone->compact_cached_free_pfn;
	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
		zone->compact_cached_free_pfn = cc->free_pfn;
	}
	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
		cc->migrate_pfn = start_pfn;
		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
	}
	cc->last_migrated_pfn = 0;

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	/*
	 * NOTE(review): the text this function was rebuilt from had lines
	 * dropped; confirm against the tree that no statement (e.g. a local
	 * LRU drain before migration) belongs between the tracepoint above
	 * and the loop below.
	 */
	while ((ret = compact_finished(zone, cc, migratetype)) ==
						COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && cc->last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (cc->last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				cc->last_migrated_pfn = 0;
			}
		}
	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	/* Callers never see COMPACT_CONTENDED; it is reported as PARTIAL */
	if (ret == COMPACT_CONTENDED)
		ret = COMPACT_PARTIAL;

	return ret;
}
/*
 * Direct-compaction helper: build a compact_control for a single allocation
 * request and run compact_zone() on @zone with it.
 *
 * @contended receives cc.contended as left behind by the run. Returns the
 * compact_zone() status. Both isolation lists must be empty afterwards.
 */
static unsigned long compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
		int alloc_flags, int classzone_idx)
{
	unsigned long ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		/* Without this the requested migration mode is silently ignored */
		.mode = mode,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
		.direct_compaction = true,
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	*contended = cc.contended;
	return ret;
}
/*
 * Fragmentation-index threshold: an index from 0 up to and including this
 * value makes __compaction_suitable() return COMPACT_NOT_SUITABLE_ZONE.
 * Defaults to 500.
 */
int sysctl_extfrag_threshold = 500;
/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 * @contended: Return value that determines if compaction was aborted due to
 *	       need_resched() or lock contention
 *
 * This is the main entry point for direct page compaction.
 *
 * Returns COMPACT_SKIPPED if @order is 0 or the GFP flags forbid FS or IO
 * activity; otherwise the highest status returned by any zone that was
 * compacted, or COMPACT_DEFERRED if every zone was deferred.
 */
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
			int alloc_flags, const struct alloc_context *ac,
			enum migrate_mode mode, int *contended)
{
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_DEFERRED;
	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */

	*contended = COMPACT_CONTENDED_NONE;

	/* Check if the GFP flags allow compaction */
	if (!order || !may_enter_fs || !may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		int status;
		int zone_contended;

		if (compaction_deferred(zone, order))
			continue;

		status = compact_zone_order(zone, order, gfp_mask, mode,
				&zone_contended, alloc_flags,
				ac->classzone_idx);
		rc = max(status, rc);
		/*
		 * It takes at least one zone that wasn't lock contended
		 * to clear all_zones_contended.
		 */
		all_zones_contended &= zone_contended;

		/* If a normal allocation would succeed, stop compacting */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
					ac->classzone_idx, alloc_flags)) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);
			/*
			 * It is possible that async compaction aborted due to
			 * need_resched() and the watermarks were ok thanks to
			 * somebody else freeing memory. The allocation can
			 * however still fail so we better signal the
			 * need_resched() contention anyway (this will not
			 * prevent the allocation attempt).
			 */
			if (zone_contended == COMPACT_CONTENDED_SCHED)
				*contended = COMPACT_CONTENDED_SCHED;

			goto break_loop;
		}

		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);
		}

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones and signal need_resched()
		 * contention.
		 */
		if ((zone_contended == COMPACT_CONTENDED_SCHED)
					|| fatal_signal_pending(current)) {
			*contended = COMPACT_CONTENDED_SCHED;
			goto break_loop;
		}

		continue;
break_loop:
		/*
		 * We might not have tried all the zones, so be conservative
		 * and assume they are not all lock contended.
		 */
		all_zones_contended = 0;
		break;
	}

	/*
	 * If at least one zone wasn't deferred or skipped, we report if all
	 * zones that were tried were lock contended.
	 */
	if (rc > COMPACT_SKIPPED && all_zones_contended)
		*contended = COMPACT_CONTENDED_LOCK;

	return rc;
}
/*
 * Compact all zones within a node.
 *
 * Every populated zone of @pgdat is compacted with @cc, whose per-zone state
 * (page counters, zone pointer, isolation lists) is reset before each zone.
 * Zones whose compaction is currently deferred are skipped unless the request
 * came through /proc/sys/vm/compact_memory.
 */
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
	int zoneid;
	struct zone *zone;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc->nr_freepages = 0;
		cc->nr_migratepages = 0;
		cc->zone = zone;
		INIT_LIST_HEAD(&cc->freepages);
		INIT_LIST_HEAD(&cc->migratepages);

		/*
		 * When called via /proc/sys/vm/compact_memory
		 * this makes sure we compact the whole zone regardless of
		 * cached scanner positions.
		 */
		if (is_via_compact_memory(cc->order))
			__reset_isolation_suitable(zone);

		if (is_via_compact_memory(cc->order) ||
				!compaction_deferred(zone, cc->order))
			compact_zone(zone, cc);

		VM_BUG_ON(!list_empty(&cc->freepages));
		VM_BUG_ON(!list_empty(&cc->migratepages));

		if (is_via_compact_memory(cc->order))
			continue;

		/* Stop deferring once the watermark for this order is met */
		if (zone_watermark_ok(zone, cc->order,
				low_wmark_pages(zone), 0, 0))
			compaction_defer_reset(zone, cc->order, false);
	}
}
/*
 * Asynchronously compact every zone of @pgdat for allocations of @order.
 * Does nothing for order 0.
 */
void compact_pgdat(pg_data_t *pgdat, int order)
{
	struct compact_control cc = {
		.order = order,
		.mode = MIGRATE_ASYNC,
	};

	if (!order)
		return;

	__compact_pgdat(pgdat, &cc);
}
/*
 * Synchronously compact every zone of node @nid. The order of -1 marks the
 * request as coming via /proc/sys/vm/compact_memory (see
 * is_via_compact_memory()), and pageblock skip hints are ignored.
 */
static void compact_node(int nid)
{
	struct compact_control cc = {
		.order = -1,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
	};

	__compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
static void compact_nodes(void)
{
int nid;
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
for_each_online_node(nid)
compact_node(nid);
}
/*
 * The written value is actually unused, all memory is compacted:
 * sysctl_compaction_handler() compacts all nodes on any write and never
 * reads this variable.
 */
int sysctl_compact_memory;
/*
 * Entry point for compacting all nodes via /proc/sys/vm/compact_memory.
 * A write triggers compaction of every node; reads do nothing. Always
 * returns 0.
 */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (!write)
		return 0;

	compact_nodes();

	return 0;
}
Mel Gorman
committed
int sysctl_extfrag_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, buffer, length, ppos);
return 0;
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
/*
 * Store handler of the per-node "compact" attribute: compacts the node
 * identified by dev->id if that id names a valid, online node. The written
 * data is ignored and @count is always returned.
 */
static ssize_t sysfs_compact_node(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int node = dev->id;

	if (node < 0 || node >= nr_node_ids || !node_online(node))
		return count;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	compact_node(node);

	return count;
}
/*
 * Device attribute "compact" with mode S_IWUSR, no show handler, and
 * sysfs_compact_node() as its store handler.
 */
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
/*
 * Create the "compact" attribute file on @node's device.
 * Returns the result of device_create_file().
 */
int compaction_register_node(struct node *node)
{
	struct device *dev = &node->dev;

	return device_create_file(dev, &dev_attr_compact);
}
/* Remove the "compact" attribute file from @node's device. */
void compaction_unregister_node(struct node *node)
{
	device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
/*
 * Wake-up condition of the kcompactd thread: true when a non-zero order has
 * been recorded in pgdat->kcompactd_max_order or when the thread has been
 * asked to stop.
 */
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
}
/*
 * Check whether waking kcompactd for @pgdat can achieve anything: returns
 * true if compaction_suitable() reports COMPACT_CONTINUE for at least one
 * populated zone up to and including pgdat->kcompactd_classzone_idx.
 */
static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	int zoneid;
	struct zone *zone;
	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;

	/*
	 * classzone_idx is an inclusive zone index (it is initialised to
	 * nr_zones - 1), so the zone at classzone_idx itself must be checked
	 * too; with "<" a node with a single zone would never be suitable.
	 */
	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
		zone = &pgdat->node_zones[zoneid];

		if (!populated_zone(zone))
			continue;

		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
					classzone_idx) == COMPACT_CONTINUE)
			return true;
	}

	return false;
}
/*
 * One round of background compaction for @pgdat: compact every populated,
 * non-deferred, suitable zone up to and including the requested classzone
 * index, then clear the request recorded in @pgdat unless a higher order or
 * tighter classzone index was requested in the meantime.
 */
static void kcompactd_do_work(pg_data_t *pgdat)
{
	/*
	 * With no special task, compact all zones so that a page of requested
	 * order is allocatable.
	 */
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = pgdat->kcompactd_max_order,
		.classzone_idx = pgdat->kcompactd_classzone_idx,
		.mode = MIGRATE_SYNC_LIGHT,
		.ignore_skip_hint = true,
	};
	bool success = false;

	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
							cc.classzone_idx);
	count_vm_event(KCOMPACTD_WAKE);

	/*
	 * classzone_idx is an inclusive zone index (it is initialised to
	 * nr_zones - 1), so the zone at classzone_idx itself must be visited.
	 */
	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
		int status;

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		if (compaction_deferred(zone, cc.order))
			continue;

		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
							COMPACT_CONTINUE)
			continue;

		cc.nr_freepages = 0;
		cc.nr_migratepages = 0;
		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		if (kthread_should_stop())
			return;
		status = compact_zone(zone, &cc);

		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
						cc.classzone_idx, 0)) {
			success = true;
			compaction_defer_reset(zone, cc.order, false);
		} else if (status == COMPACT_COMPLETE) {
			/*
			 * We use sync migration mode here, so we defer like
			 * sync direct compaction does.
			 */
			defer_compaction(zone, cc.order);
		}

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	/*
	 * Regardless of success, we are done until woken up next. But remember
	 * the requested order/classzone_idx in case it was higher/tighter than
	 * our current ones
	 */
	if (pgdat->kcompactd_max_order <= cc.order)
		pgdat->kcompactd_max_order = 0;
	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
}
/*
 * Record a background compaction request on @pgdat and wake its kcompactd
 * thread if it is waiting and the node looks suitable for compaction.
 *
 * The recorded order only ever grows and the recorded classzone index only
 * ever shrinks here. Order-0 requests are ignored.
 */
void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
{
	if (order == 0)
		return;

	if (order > pgdat->kcompactd_max_order)
		pgdat->kcompactd_max_order = order;

	if (classzone_idx < pgdat->kcompactd_classzone_idx)
		pgdat->kcompactd_classzone_idx = classzone_idx;

	/* Only wake a thread that is actually waiting, on a suitable node */
	if (waitqueue_active(&pgdat->kcompactd_wait) &&
	    kcompactd_node_suitable(pgdat)) {
		trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
							classzone_idx);
		wake_up_interruptible(&pgdat->kcompactd_wait);
	}
}
/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 *
 * @p is the pg_data_t of the node this thread serves. The thread restricts
 * itself to the node's CPUs when that mask is non-empty, resets the pending
 * request, and then alternates between sleeping until work is requested and
 * running kcompactd_do_work() until it is told to stop. Always returns 0.
 */
static int kcompactd(void *p)
{
	pg_data_t *pgdat = p;
	const struct cpumask *node_cpus = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(node_cpus))
		set_cpus_allowed_ptr(current, node_cpus);

	set_freezable();

	/* Start with no pending request */
	pgdat->kcompactd_max_order = 0;
	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;

	for (;;) {
		if (kthread_should_stop())
			break;

		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		wait_event_freezable(pgdat->kcompactd_wait,
				kcompactd_work_requested(pgdat));

		kcompactd_do_work(pgdat);
	}

	return 0;
}
/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
 *
 * Returns 0 if the thread already exists or was started, otherwise the error
 * encoded in the kthread_run() result (pgdat->kcompactd is then reset to NULL).
 */
int kcompactd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int err;

	if (pgdat->kcompactd)
		return 0;

	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
	if (!IS_ERR(pgdat->kcompactd))
		return 0;

	pr_err("Failed to start kcompactd on node %d\n", nid);
	err = PTR_ERR(pgdat->kcompactd);
	pgdat->kcompactd = NULL;

	return err;
}
/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 *
 * Stops the node's kcompactd thread, if there is one, and clears the pointer.
 */
void kcompactd_stop(int nid)
{
	struct task_struct *task = NODE_DATA(nid)->kcompactd;

	if (!task)
		return;

	kthread_stop(task);
	NODE_DATA(nid)->kcompactd = NULL;
}
/*
 * It's optimal to keep kcompactd on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes
 * away, we get changed to run anywhere: as the first one comes back,
 * restore their cpu bindings.
 *
 * Only CPU_ONLINE and CPU_ONLINE_FROZEN are acted upon. Always returns
 * NOTIFY_OK.
 */
static int cpu_callback(struct notifier_block *nfb, unsigned long action,
			void *hcpu)
{
	int node;

	if (action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	for_each_node_state(node, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(node);
		const struct cpumask *node_cpus =
					cpumask_of_node(pgdat->node_id);

		/* One of our CPUs online: restore mask */
		if (cpumask_any_and(cpu_online_mask, node_cpus) < nr_cpu_ids)
			set_cpus_allowed_ptr(pgdat->kcompactd, node_cpus);
	}

	return NOTIFY_OK;
}
/*
 * Start a kcompactd thread for every node that has memory and register the
 * CPU hotplug callback. Always returns 0.
 */
static int __init kcompactd_init(void)
{
	int node;

	for_each_node_state(node, N_MEMORY) {
		kcompactd_run(node);
	}

	hotcpu_notifier(cpu_callback, 0);

	return 0;
}
subsys_initcall(kcompactd_init)
#endif /* CONFIG_COMPACTION */