/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2 license.
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  Ie., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
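 *
 * For example, the address that an allocation at offset @off maps to
 * for a given cpu is chunk->base_addr + pcpu_unit_offsets[cpu] + @off
 * (see pcpu_chunk_addr() below).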
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * Allocation state in each chunk is kept using two bitmaps.  The
 * allocation map (chunk->alloc_map) tracks which allocation units are
 * in use, and the boundary map (chunk->bound_map) marks the start of
 * each allocated area and the first bit past its end, so that a free
 * can determine the size of the area being returned.  Allocation
 * inside a chunk is done by scanning the allocation map for a clear
 * region large enough to serve the request.
 *
 * The chunk serving a given address can be determined from the index
 * field in the page struct, which contains a pointer to the chunk.
 *
 * Chunks are organized into lists (slots) according to the free bytes
 * left and the allocator tries to allocate from the fullest chunk
 * first.  Each chunk maintains a maximum contiguous area size hint
 * which is guaranteed to be equal to or larger than the maximum
 * contiguous area in the chunk.  This helps prevent the allocator from
 * iterating over chunks unnecessarily.
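 *
 * As a concrete example (assuming PCPU_MIN_ALLOC_SIZE is 4 bytes), a
 * 12-byte allocation starting at offset 0 sets alloc_map bits 0-2 and
 * bound_map bits 0 and 3; pcpu_free_area() later recovers the size of
 * an area from the distance between consecutive bound_map bits.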
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT		5

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4
#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);

static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);

/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;
/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest
 * @addr: percpu address
 *
 * RETURNS:
 * True if the address is served from this chunk.
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
	void *start_addr, *end_addr;

	if (!chunk)
		return false;

	start_addr = chunk->base_addr + chunk->start_offset;
	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
		   chunk->end_offset;

	return addr >= start_addr && addr < end_addr;
}

static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* size is in bytes */
	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
	if (size == pcpu_unit_size)
		return pcpu_nr_slots - 1;
	return __pcpu_size_to_slot(size);
}

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
		return 0;

	return pcpu_size_to_slot(chunk->free_bytes);
}

/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	page->index = (unsigned long)pcpu;
}

/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	return (struct pcpu_chunk *)page->index;
}

static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}

static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
}

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	return (unsigned long)chunk->base_addr +
	       pcpu_unit_page_offset(cpu, page_idx);
}

static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(bitmap, end, *rs);
	*re = find_next_bit(bitmap, end, *rs + 1);
}

static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_bit(bitmap, end, *rs);
	*re = find_next_zero_bit(bitmap, end, *rs + 1);
}

/*
 * Bitmap region iterators.  Iterate over the bitmap between
 * [@start, @end) in @chunk.  @rs and @re should be integer variables
 * and will be set to the start and end index of the current region.
 */
#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
#define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 *
 * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
 * kzalloc() is used; otherwise, vzalloc() is used.  The returned
 * memory is always zeroed.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size)
{
	if (WARN_ON_ONCE(!slab_is_available()))
		return NULL;

	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	else
		return vzalloc(size);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	kvfree(ptr);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
	int nslot = pcpu_chunk_slot(chunk);

	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
		if (oslot < nslot)
			list_move(&chunk->list, &pcpu_slot[nslot]);
		else
			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
	}
}

/**
 * pcpu_cnt_pop_pages - counts populated backing pages in range
 * @chunk: chunk of interest
 * @bit_off: start offset
 * @bits: size of area to check
 *
 * Calculates the number of populated pages in the region
 * [page_start, page_end).  This keeps track of how many empty populated
 * pages are available and decides if async work should be scheduled.
 *
 * RETURNS:
 * The nr of populated pages.
 */
static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
				     int bits)
{
	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	if (page_start >= page_end)
		return 0;

	/*
	 * bitmap_weight counts the number of bits set in a bitmap up to
	 * the specified number of bits.  This is counting the populated
	 * pages up to page_end and then subtracting the populated pages
	 * up to page_start to count the populated pages in
	 * [page_start, page_end).
	 */
	return bitmap_weight(chunk->populated, page_end) -
	       bitmap_weight(chunk->populated, page_start);
}

/**
 * pcpu_chunk_update - updates the chunk metadata given a free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * This updates the chunk's contig hint given a free area.
 */
static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
{
	if (bits > chunk->contig_bits)
		chunk->contig_bits = bits;
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 *
 * Iterates over the chunk to find the largest free area.
 *
 * Updates:
 *      chunk->contig_bits
 *      nr_empty_pop_pages
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
	int bits, nr_empty_pop_pages;
	int rs, re; /* region start, region end */

	/* clear metadata */
	chunk->contig_bits = 0;
	bits = nr_empty_pop_pages = 0;
	pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, 0,
				   pcpu_chunk_map_bits(chunk)) {
		bits = re - rs;
		pcpu_chunk_update(chunk, rs, bits);
		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, rs, bits);
	}
	/*
	 * Keep track of nr_empty_pop_pages.
	 *
	 * The chunk maintains the previous number of free pages it held,
	 * so the delta is used to update the global counter.  The reserved
	 * chunk is not part of the free page count as they are populated
	 * at init and are special to serving reserved allocations.
	 */
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages +=
			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);
	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
}
/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	int page_start, page_end, rs, re;
	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
	rs = page_start;
	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
	if (rs >= page_end)
		return true;
	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	int bit_off, bits;
	int re; /* region end */
	pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, 0,
				   pcpu_chunk_map_bits(chunk)) {
		bits = re - bit_off;
		/* check alignment */
		bits -= ALIGN(bit_off, align) - bit_off;
		bit_off = ALIGN(bit_off, align);
		if (bits < alloc_bits)
			continue;
		bits = alloc_bits;
		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
						   &bit_off))
			break;
	}

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 * This function takes in a @start offset to begin searching to fit an
 * allocation of @alloc_bits with alignment @align.  If it confirms a
 * valid free area, it then updates the allocation and boundary maps
 * accordingly.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
			   size_t align, int start)
{
	size_t align_mask = (align) ? (align - 1) : 0;
	int bit_off, end, oslot;
	lockdep_assert_held(&pcpu_lock);
	oslot = pcpu_chunk_slot(chunk);
	/*
	 * Search to find a fit.
	 */
	end = start + alloc_bits;
	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
					     alloc_bits, align_mask);
	if (bit_off >= end)
		return -1;
	/* update alloc map */
	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
	/* update boundary map */
	set_bit(bit_off, chunk->bound_map);
	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
	set_bit(bit_off + alloc_bits, chunk->bound_map);
	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
	pcpu_chunk_refresh_hint(chunk);

	pcpu_chunk_relocate(chunk, oslot);

	return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 * This function determines the size of an allocation to free using
 * the boundary bitmap and clears the allocation map.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
	int bit_off, bits, end, oslot;

	lockdep_assert_held(&pcpu_lock);
	pcpu_stats_area_dealloc(chunk);
	oslot = pcpu_chunk_slot(chunk);
	bit_off = off / PCPU_MIN_ALLOC_SIZE;
	/* find end index */
	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
			    bit_off + 1);
	bits = end - bit_off;
	bitmap_clear(chunk->alloc_map, bit_off, bits);
	/* update metadata */
	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
	pcpu_chunk_refresh_hint(chunk);

	pcpu_chunk_relocate(chunk, oslot);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
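 *
 * For example (assuming 4K pages), a @tmp_addr of 0x1080 with a
 * @map_size of 0x2000 yields base_addr 0x1000, start_offset 0x80, a
 * page-aligned region_size of 0x3000 and end_offset 0xf80.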
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							 int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;

	/* region calculations */
	aligned_addr = tmp_addr & PAGE_MASK;

	start_offset = tmp_addr - aligned_addr;

	region_size = PFN_ALIGN(start_offset + map_size);
	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
				    BITS_TO_LONGS(region_size >> PAGE_SHIFT),
				    0);
	INIT_LIST_HEAD(&chunk->list);

	chunk->base_addr = (void *)aligned_addr;
	chunk->start_offset = start_offset;
	chunk->end_offset = region_size - chunk->start_offset - map_size;
	chunk->nr_pages = region_size >> PAGE_SHIFT;
	region_bits = pcpu_chunk_map_bits(chunk);
	chunk->alloc_map = memblock_virt_alloc(
				BITS_TO_LONGS(region_bits) *
				sizeof(chunk->alloc_map[0]), 0);
	chunk->bound_map = memblock_virt_alloc(
				BITS_TO_LONGS(region_bits + 1) *
				sizeof(chunk->bound_map[0]), 0);

	/* manage populated page bitmap */
	chunk->immutable = true;
	bitmap_fill(chunk->populated, chunk->nr_pages);
	chunk->nr_populated = chunk->nr_pages;
	chunk->nr_empty_pop_pages =
		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
				   map_size / PCPU_MIN_ALLOC_SIZE);
	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
	chunk->free_bytes = map_size;

	if (chunk->start_offset) {
		/* hide the beginning of the bitmap */
		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map, 0, offset_bits);
		set_bit(0, chunk->bound_map);
		set_bit(offset_bits, chunk->bound_map);
	}

	if (chunk->end_offset) {
		/* hide the end of the bitmap */
		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map,
			   pcpu_chunk_map_bits(chunk) - offset_bits,
			   offset_bits);
		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
			chunk->bound_map);
		set_bit(region_bits, chunk->bound_map);
	}

	pcpu_chunk_refresh_hint(chunk);

	return chunk;
}

static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
	struct pcpu_chunk *chunk;
	int region_bits;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
	if (!chunk)
		return NULL;

	INIT_LIST_HEAD(&chunk->list);
	chunk->nr_pages = pcpu_unit_pages;
	region_bits = pcpu_chunk_map_bits(chunk);
	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
					   sizeof(chunk->alloc_map[0]));
	if (!chunk->alloc_map)
		goto alloc_map_fail;
	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
					   sizeof(chunk->bound_map[0]));
	if (!chunk->bound_map)
		goto bound_map_fail;
	/* init metadata */
	chunk->contig_bits = region_bits;
	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

	return chunk;

bound_map_fail:
	pcpu_mem_free(chunk->alloc_map);
alloc_map_fail:
	pcpu_mem_free(chunk);

	return NULL;
}

static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->bound_map);
	pcpu_mem_free(chunk->alloc_map);
	pcpu_mem_free(chunk);
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 * @for_alloc: if this is to populate for allocation
 *
 * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
 * the bookkeeping information accordingly.  Must be called after each
 * successful population.
 *
 * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
 * is to serve an allocation in that area.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
				 int page_end, bool for_alloc)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_set(chunk->populated, page_start, nr);
	chunk->nr_populated += nr;

	if (!for_alloc) {
		chunk->nr_empty_pop_pages += nr;
		pcpu_nr_empty_pop_pages += nr;
	}
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
 * Update the bookkeeping information accordingly.  Must be called after
 * each successful depopulation.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr);
	chunk->nr_populated -= nr;
	chunk->nr_empty_pop_pages -= nr;
	pcpu_nr_empty_pop_pages -= nr;
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
 * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the dynamic region (first chunk)? */
	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
		return pcpu_first_chunk;

	/* is it in the reserved region? */
	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
		return pcpu_reserved_chunk;

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}

/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
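 *
 * Illustrative only: typical callers reach this function through the
 * alloc_percpu() / free_percpu() wrappers rather than calling it
 * directly, e.g.:
 *
 *	int cpu;
 *	int __percpu *cnt = alloc_percpu(int);
 *
 *	if (cnt)
 *		for_each_possible_cpu(cpu)
 *			*per_cpu_ptr(cnt, cpu) = 0;
 *	free_percpu(cnt);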
 */
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
				 gfp_t gfp)
{
	static int warn_limit = 10;
	struct pcpu_chunk *chunk;
	const char *err;
	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
	int slot, off, cpu, ret;
	void __percpu *ptr;
	size_t bits, bit_align;
	unsigned long flags;

	/*
	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
	 * therefore alignment must be a minimum of that many bytes.
	 * An allocation may have internal fragmentation from rounding up
	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
	 */
	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
		align = PCPU_MIN_ALLOC_SIZE;

	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
	bits = size >> PCPU_MIN_ALLOC_SHIFT;
	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
		     !is_power_of_2(align))) {
		WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
		     size, align);
		return NULL;
	}

	if (!is_atomic)
		mutex_lock(&pcpu_alloc_mutex);

	spin_lock_irqsave(&pcpu_lock, flags);

	/* serve reserved allocations from the reserved chunk if available */
	if (reserved && pcpu_reserved_chunk) {
		chunk = pcpu_reserved_chunk;
		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
		if (off < 0) {
			err = "alloc from reserved chunk failed";
			goto fail_unlock;
		}

		off = pcpu_alloc_area(chunk, bits, bit_align, off);
		if (off >= 0)
			goto area_found;

		err = "alloc from reserved chunk failed";
		goto fail_unlock;
	}

restart:
	/* search through normal chunks */
	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
			off = pcpu_find_block_fit(chunk, bits, bit_align,
						  is_atomic);
			if (off < 0)
				continue;

			off = pcpu_alloc_area(chunk, bits, bit_align, off);
			if (off >= 0)
				goto area_found;
		}
	}

	spin_unlock_irqrestore(&pcpu_lock, flags);

	/*
	 * No space left.  Create a new chunk.  We don't want multiple
	 * tasks to create chunks simultaneously.  Serialize and create iff
	 * there's still no empty chunk after grabbing the mutex.
	 */
	if (is_atomic) {
		err = "atomic alloc failed, no space left";
		goto fail;
	}

	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
		chunk = pcpu_create_chunk();
		if (!chunk) {
			err = "failed to allocate new chunk";
			goto fail;
		}

		spin_lock_irqsave(&pcpu_lock, flags);
		pcpu_chunk_relocate(chunk, -1);
	} else {
		spin_lock_irqsave(&pcpu_lock, flags);
	}

	goto restart;

area_found:
	pcpu_stats_area_alloc(chunk, size);
	spin_unlock_irqrestore(&pcpu_lock, flags);
	/* populate if not all pages are already there */
	if (!is_atomic) {
		int page_start, page_end, rs, re;
		page_start = PFN_DOWN(off);
		page_end = PFN_UP(off + size);
		pcpu_for_each_unpop_region(chunk->populated, rs, re,
					   page_start, page_end) {
			WARN_ON(chunk->immutable);

			ret = pcpu_populate_chunk(chunk, rs, re);

			spin_lock_irqsave(&pcpu_lock, flags);
			if (ret) {
				pcpu_free_area(chunk, off);
				err = "failed to populate";
				goto fail_unlock;
			}
			pcpu_chunk_populated(chunk, rs, re, true);
			spin_unlock_irqrestore(&pcpu_lock, flags);
		}

		mutex_unlock(&pcpu_alloc_mutex);
	}
	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
		pcpu_schedule_balance_work();

	/* clear the areas and return address relative to base address */
	for_each_possible_cpu(cpu)
		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
	kmemleak_alloc_percpu(ptr, size, gfp);

	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
			chunk->base_addr, off, ptr);

	return ptr;

fail_unlock:
	spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

	if (!is_atomic && warn_limit) {
		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
			size, align, is_atomic, err);
		dump_stack();
		if (!--warn_limit)
			pr_info("limit reached, disable warning\n");
	if (is_atomic) {
		/* see the flag handling in pcpu_blance_workfn() */
		pcpu_atomic_alloc_failed = true;