/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009		SUSE Linux Products GmbH
 * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2 license.
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * Allocation state in each chunk is kept using an array of integers
 * on chunk->map.  A positive value in the map represents a free
 * region and negative allocated.  Allocation inside a chunk is done
 * by scanning this map sequentially and serving the first matching
 * entry.  This is mostly copied from the percpu_modalloc() allocator.
 * Chunks can be determined from the address using the index field
 * in the page struct.  The index field contains a pointer to the chunk.
 *
 * These chunks are organized into lists according to free_size and the
 * allocator tries to allocate from the fullest chunk first.  Each chunk
 * maintains a maximum contiguous area size hint which is guaranteed to
 * be equal to or larger than the maximum contiguous area in the chunk.
 * This helps prevent the allocator from iterating over chunks
 * unnecessarily.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/io.h>
#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>
#include "percpu-internal.h"
/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT 5
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
#ifdef CONFIG_SMP
/*
 * Default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary.
 * Each macro is only defined when the architecture has not supplied its own,
 * so the two guards must be closed independently of the SMP/UP split below.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	(void __percpu *)((unsigned long)(addr) -			\
			  (unsigned long)pcpu_base_addr	+		\
			  (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	(void __force *)((unsigned long)(ptr) +				\
			 (unsigned long)pcpu_base_addr -		\
			 (unsigned long)__per_cpu_start)
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
#endif	/* CONFIG_SMP */
/*
 * Allocator geometry.  Everything here carries __ro_after_init.
 * pcpu_unit_pages is the page count given to each newly allocated chunk,
 * pcpu_unit_size is the size that maps to the last slot, pcpu_nr_slots is
 * the number of chunk list slots and pcpu_chunk_struct_size is the size
 * used when allocating a struct pcpu_chunk.
 */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;
/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
/*
 * The number of empty populated pages, protected by pcpu_lock.  The
 * reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;
/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
/* the balance work is only scheduled while this flag is true */
static bool pcpu_async_enabled __read_mostly;
/* set to true on the failure path of an atomic allocation in pcpu_alloc() */
static bool pcpu_atomic_alloc_failed;
/* Queue pcpu_balance_work, but only once async operation has been enabled. */
static void pcpu_schedule_balance_work(void)
{
	if (!pcpu_async_enabled)
		return;

	schedule_work(&pcpu_balance_work);
}
* pcpu_addr_in_chunk - check if the address is served from this chunk
* @chunk: chunk of interest
* @addr: percpu address
*
* RETURNS:
* True if the address is served from this chunk.
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
Tejun Heo
committed
{
void *start_addr, *end_addr;
Tejun Heo
committed
start_addr = chunk->base_addr + chunk->start_offset;
end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
chunk->end_offset;
return addr >= start_addr && addr < end_addr;
Tejun Heo
committed
}
static int __pcpu_size_to_slot(int size)
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
/*
 * Slot lookup for a free size of @size bytes.  A size equal to
 * pcpu_unit_size always lands in the last slot; every other size is
 * mapped by __pcpu_size_to_slot().
 */
static int pcpu_size_to_slot(int size)
{
	return (size == pcpu_unit_size) ? pcpu_nr_slots - 1
					: __pcpu_size_to_slot(size);
}
/*
 * Slot that @chunk belongs on.  A chunk that has less than one minimum
 * allocation free, or no contiguous free area at all, goes to slot 0;
 * otherwise the slot is derived from its free byte count.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
		return 0;

	return pcpu_size_to_slot(chunk->free_bytes);
}
/* stash the owning chunk @pcpu in the index field of @page */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	unsigned long chunk_addr = (unsigned long)pcpu;

	page->index = chunk_addr;
}
/* read back the chunk pointer stored in the index field of @page */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	struct pcpu_chunk *owner = (struct pcpu_chunk *)page->index;

	return owner;
}
/*
 * Flat page index of page @page_idx within the unit that @cpu maps to,
 * counting pcpu_unit_pages pages per unit.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
/* byte offset of page @page_idx inside the unit assigned to @cpu */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
	unsigned long unit_off = pcpu_unit_offsets[cpu];

	return unit_off + (page_idx << PAGE_SHIFT);
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
return (unsigned long)chunk->base_addr +
pcpu_unit_page_offset(cpu, page_idx);
/*
 * Find the next run of clear bits in @bitmap at or after *@rs and below
 * @end.  On return *@rs is the first clear bit and *@re the first set bit
 * after it; both are @end when nothing is found.
 */
static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_zero_bit(bitmap, end, *rs);
	*re = find_next_bit(bitmap, end, *rs + 1);
}
/*
 * Find the next run of set bits in @bitmap at or after *@rs and below
 * @end.  On return *@rs is the first set bit and *@re the first clear bit
 * after it; both are @end when nothing is found.
 */
static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
	*rs = find_next_bit(bitmap, end, *rs);
	*re = find_next_zero_bit(bitmap, end, *rs + 1);
}
/*
 * Bitmap region iterators.  Iterates over @bitmap between
 * [@start, @end).  @rs and @re should be integer variables and will be
 * set to the start and end index of the current region: a run of clear
 * bits for the unpop variant, a run of set bits for the pop variant.
 */
#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
	     (rs) < (re);						     \
	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
* kzalloc() is used; otherwise, vzalloc() is used. The returned
* memory is always zeroed.
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* Pointer to the allocated area on success, NULL on failure.
static void *pcpu_mem_zalloc(size_t size)
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else
return vzalloc(size);
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
	/* pcpu_mem_zalloc() may hand out kzalloc() or vzalloc() memory */
	kvfree(ptr);
}
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
Tejun Heo
committed
* moved to the slot. Note that the reserved chunk is never put on
* chunk slots.
*
* CONTEXT:
* pcpu_lock.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
Tejun Heo
committed
if (chunk != pcpu_reserved_chunk && oslot != nslot) {
if (oslot < nslot)
list_move(&chunk->list, &pcpu_slot[nslot]);
else
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
}
}
* pcpu_cnt_pop_pages- counts populated backing pages in range
* @chunk: chunk of interest
* @bit_off: start offset
* @bits: size of area to check
* Calculates the number of populated pages in the region
* [page_start, page_end). This keeps track of how many empty populated
* pages are available and decide if async work should be scheduled.
* The nr of populated pages.
static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
int bits)
int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
Tejun Heo
committed
if (page_start >= page_end)
/*
* bitmap_weight counts the number of bits set in a bitmap up to
* the specified number of bits. This is counting the populated
* pages up to page_end and then subtracting the populated pages
* up to page_start to count the populated pages in
* [page_start, page_end).
*/
return bitmap_weight(chunk->populated, page_end) -
bitmap_weight(chunk->populated, page_start);
}
/**
 * pcpu_chunk_update - updates the chunk metadata given a free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Raises the chunk's contig hint to @bits when the free area is larger
 * than the current hint.  @bit_off is not consulted.
 */
static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
{
	/* the current hint already covers this area */
	if (chunk->contig_bits >= bits)
		return;

	chunk->contig_bits = bits;
}
/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 *
 * Iterates over the chunk to find the largest free area.
 *
 * Updates:
 *      chunk->contig_bits
 *      nr_empty_pop_pages
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
	int bits, nr_empty_pop_pages;
	int rs, re; /* region start, region end */

	/* clear metadata */
	chunk->contig_bits = 0;

	bits = nr_empty_pop_pages = 0;
	/* every run of clear bits in alloc_map is a free area */
	pcpu_for_each_unpop_region(chunk->alloc_map, rs, re, 0,
				   pcpu_chunk_map_bits(chunk)) {
		bits = re - rs;

		pcpu_chunk_update(chunk, rs, bits);

		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, rs, bits);
	}

	/*
	 * Keep track of nr_empty_pop_pages.
	 *
	 * The chunk maintains the previous number of free pages it held,
	 * so the delta is used to update the global counter.  The reserved
	 * chunk is not part of the free page count as they are populated
	 * at init and are special to serving reserved allocations.
	 */
	if (chunk != pcpu_reserved_chunk)
		pcpu_nr_empty_pop_pages +=
			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);

	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
}
/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * @next_off is only written when false is returned; it is used to skip
 * over unpopulated blocks in pcpu_find_block_fit.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
			      int *next_off)
{
	int page_start, page_end, rs, re;

	/* every page the area touches, including partially covered ones */
	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);

	rs = page_start;
	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
	if (rs >= page_end)
		return true;

	/* resume after the unpopulated run, converted back to map bits */
	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
	return false;
}
/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Walks the free regions of @chunk's allocation map and stops at the
 * first one that still holds @alloc_bits after its start has been
 * aligned and, when @pop_only is set, whose backing pages are populated.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
			       size_t align, bool pop_only)
{
	int bit_off, bits;
	int re; /* region end */

	pcpu_for_each_unpop_region(chunk->alloc_map, bit_off, re, 0,
				   pcpu_chunk_map_bits(chunk)) {
		/* room left once the region start has been aligned up */
		bits = re - bit_off;
		bits -= ALIGN(bit_off, align) - bit_off;
		bit_off = ALIGN(bit_off, align);

		if (bits >= alloc_bits) {
			bits = alloc_bits;

			if (!pop_only)
				break;
			if (pcpu_is_populated(chunk, bit_off, bits, &bit_off))
				break;
		}
	}

	if (bit_off == pcpu_chunk_map_bits(chunk))
		return -1;

	return bit_off;
}
* pcpu_alloc_area - allocates an area from a pcpu_chunk
* @alloc_bits: size of request in allocation units
* @align: alignment of area (max PAGE_SIZE)
* @start: bit_off to start searching
* This function takes in a @start offset to begin searching to fit an
* allocation of @alloc_bits with alignment @align. If it confirms a
* valid free area, it then updates the allocation and boundary maps
* accordingly.
* Allocated addr offset in @chunk on success.
* -1 if no matching area is found.
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
size_t align_mask = (align) ? (align - 1) : 0;
int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
oslot = pcpu_chunk_slot(chunk);
/*
* Search to find a fit.
*/
end = start + alloc_bits;
bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
alloc_bits, align_mask);
if (bit_off >= end)
return -1;
/* update alloc map */
bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
/* update boundary map */
set_bit(bit_off, chunk->bound_map);
bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
set_bit(bit_off + alloc_bits, chunk->bound_map);
chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
pcpu_chunk_refresh_hint(chunk);
pcpu_chunk_relocate(chunk, oslot);
return bit_off * PCPU_MIN_ALLOC_SIZE;
* pcpu_free_area - frees the corresponding offset
* @off: addr offset into chunk
* This function determines the size of an allocation to free using
* the boundary bitmap and clears the allocation map.
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
oslot = pcpu_chunk_slot(chunk);
bit_off = off / PCPU_MIN_ALLOC_SIZE;
/* find end index */
end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
bit_off + 1);
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
/* update metadata */
chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
pcpu_chunk_refresh_hint(chunk);
pcpu_chunk_relocate(chunk, oslot);
}
/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is page aligned down of @tmp_addr while the region end is page
 * aligned up.  Offsets are kept track of to determine the region served. All
 * this is done to appease the bitmap allocator in avoiding partial blocks.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
							 int map_size)
{
	struct pcpu_chunk *chunk;
	unsigned long aligned_addr;
	int start_offset, offset_bits, region_size, region_bits;

	/* region calculations */
	aligned_addr = tmp_addr & PAGE_MASK;

	start_offset = tmp_addr - aligned_addr;

	region_size = PFN_ALIGN(start_offset + map_size);

	/*
	 * allocate chunk; the trailing populated bitmap holds one bit per
	 * page, so its size is the number of longs times the size of a long
	 */
	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
				    BITS_TO_LONGS(region_size >> PAGE_SHIFT) *
				    sizeof(chunk->populated[0]),
				    0);

	INIT_LIST_HEAD(&chunk->list);

	chunk->base_addr = (void *)aligned_addr;
	chunk->start_offset = start_offset;
	chunk->end_offset = region_size - chunk->start_offset - map_size;

	chunk->nr_pages = region_size >> PAGE_SHIFT;
	region_bits = pcpu_chunk_map_bits(chunk);

	chunk->alloc_map = memblock_virt_alloc(
				BITS_TO_LONGS(region_bits) *
				sizeof(chunk->alloc_map[0]), 0);
	/* the boundary map needs one extra bit to mark the end of the region */
	chunk->bound_map = memblock_virt_alloc(
				BITS_TO_LONGS(region_bits + 1) *
				sizeof(chunk->bound_map[0]), 0);

	/* manage populated page bitmap */
	chunk->immutable = true;
	bitmap_fill(chunk->populated, chunk->nr_pages);
	chunk->nr_populated = chunk->nr_pages;
	chunk->nr_empty_pop_pages =
		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
				   map_size / PCPU_MIN_ALLOC_SIZE);

	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
	chunk->free_bytes = map_size;

	if (chunk->start_offset) {
		/* hide the beginning of the bitmap */
		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map, 0, offset_bits);
		set_bit(0, chunk->bound_map);
		set_bit(offset_bits, chunk->bound_map);
	}

	if (chunk->end_offset) {
		/* hide the end of the bitmap */
		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
		bitmap_set(chunk->alloc_map,
			   pcpu_chunk_map_bits(chunk) - offset_bits,
			   offset_bits);
		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
			chunk->bound_map);
		set_bit(region_bits, chunk->bound_map);
	}

	pcpu_chunk_refresh_hint(chunk);

	return chunk;
}
/*
 * Allocate and initialise the bookkeeping for a new, completely free chunk
 * of pcpu_unit_pages pages: the chunk structure plus its allocation and
 * boundary bitmaps.
 *
 * Returns the new chunk, or NULL if any of the allocations fails; nothing
 * is leaked on failure.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
	struct pcpu_chunk *chunk;
	int region_bits;

	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
	if (!chunk)
		return NULL;

	INIT_LIST_HEAD(&chunk->list);
	chunk->nr_pages = pcpu_unit_pages;
	region_bits = pcpu_chunk_map_bits(chunk);

	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
					   sizeof(chunk->alloc_map[0]));
	if (!chunk->alloc_map)
		goto alloc_map_fail;

	/* the boundary map needs one extra bit to mark the end of the region */
	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
					   sizeof(chunk->bound_map[0]));
	if (!chunk->bound_map)
		goto bound_map_fail;

	/* init metadata */
	chunk->contig_bits = region_bits;
	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

	/* success must not fall through into the error unwinding below */
	return chunk;

bound_map_fail:
	pcpu_mem_free(chunk->alloc_map);
alloc_map_fail:
	pcpu_mem_free(chunk);

	return NULL;
}
/*
 * Release the bookkeeping of @chunk: its boundary map, its allocation map
 * and the chunk structure itself.  A NULL @chunk is ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
	if (!chunk)
		return;
	pcpu_mem_free(chunk->bound_map);
	pcpu_mem_free(chunk->alloc_map);
	pcpu_mem_free(chunk);
}
/**
* pcpu_chunk_populated - post-population bookkeeping
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
* @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
*
* If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
* is to serve an allocation in that area.
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
int page_end, bool for_alloc)
{
int nr = page_end - page_start;
lockdep_assert_held(&pcpu_lock);
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
if (!for_alloc) {
chunk->nr_empty_pop_pages += nr;
pcpu_nr_empty_pop_pages += nr;
}
}
/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Pages in [@page_start,@page_end) are no longer populated in @chunk.
 * Clears them from the populated bitmap and lowers the chunk and global
 * counters by the same number of pages.  Must be called after each
 * successful depopulation.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
				   int page_start, int page_end)
{
	int nr_pages = page_end - page_start;

	lockdep_assert_held(&pcpu_lock);

	bitmap_clear(chunk->populated, page_start, nr_pages);

	/* every counter drops by the number of depopulated pages */
	chunk->nr_populated -= nr_pages;
	chunk->nr_empty_pop_pages -= nr_pages;
	pcpu_nr_empty_pop_pages -= nr_pages;
}
/*
* Chunk management implementation.
*
* To allow different implementations, chunk alloc/free and
* [de]population are implemented in a separate file which is pulled
* into this file and compiled together. The following functions
* should be implemented.
*
* pcpu_populate_chunk - populate the specified range of a chunk
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
* pcpu_create_chunk - create a new chunk
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
* pcpu_addr_to_page - translate address to physical address
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
static struct pcpu_chunk *pcpu_create_chunk(void);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
	/* is it in the dynamic region (first chunk)? */
	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
		return pcpu_first_chunk;

	/* is it in the reserved region? */
	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
		return pcpu_reserved_chunk;

	/*
	 * The address is relative to unit0 which might be unused and
	 * thus unmapped.  Offset the address to the unit space of the
	 * current processor before looking it up in the vmalloc
	 * space.  Note that any possible cpu id can be used here, so
	 * there's no need to worry about preemption or cpu hotplug.
	 */
	addr += pcpu_unit_offsets[raw_smp_processor_id()];
	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
Tejun Heo
committed
* pcpu_alloc - the percpu allocator
* @align: alignment of area (max PAGE_SIZE)
Tejun Heo
committed
* @reserved: allocate from the reserved chunk if available
* Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
* contain %GFP_KERNEL, the allocation is atomic.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
int slot, off, cpu, ret;
unsigned long flags;
size_t bits, bit_align;
* There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
* therefore alignment must be a minimum of that many bytes.
* An allocation may have internal fragmentation from rounding up
* of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
Dennis Zhou (Facebook)
committed
if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
align = PCPU_MIN_ALLOC_SIZE;
Dennis Zhou (Facebook)
committed
size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
bits = size >> PCPU_MIN_ALLOC_SHIFT;
bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
size, align);
Tejun Heo
committed
if (!is_atomic)
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irqsave(&pcpu_lock, flags);
Tejun Heo
committed
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
if (off < 0) {
err = "alloc from reserved chunk failed";
goto fail_unlock;
off = pcpu_alloc_area(chunk, bits, bit_align, off);
Tejun Heo
committed
if (off >= 0)
goto area_found;
err = "alloc from reserved chunk failed";
goto fail_unlock;
Tejun Heo
committed
}
restart:
Tejun Heo
committed
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
if (off < 0)
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
spin_unlock_irqrestore(&pcpu_lock, flags);
/*
* No space left. Create a new chunk. We don't want multiple
* tasks to create chunks simultaneously. Serialize and create iff
* there's still no empty chunk after grabbing the mutex.
*/
if (is_atomic) {
err = "atomic alloc failed, no space left";
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
chunk = pcpu_create_chunk();
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_relocate(chunk, -1);
} else {
spin_lock_irqsave(&pcpu_lock, flags);
goto restart;
pcpu_stats_area_alloc(chunk, size);
spin_unlock_irqrestore(&pcpu_lock, flags);
/* populate if not all pages are already there */
int page_start, page_end, rs, re;
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
pcpu_for_each_unpop_region(chunk->populated, rs, re,
page_start, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
pcpu_free_area(chunk, off);
err = "failed to populate";
goto fail_unlock;
}
pcpu_chunk_populated(chunk, rs, re, true);
spin_unlock_irqrestore(&pcpu_lock, flags);
mutex_unlock(&pcpu_alloc_mutex);
}
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
/* clear the areas and return address relative to base address */
for_each_possible_cpu(cpu)
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
chunk->base_addr, off, ptr);
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
dump_stack();
if (!--warn_limit)
pr_info("limit reached, disable warning\n");
if (is_atomic) {
/* see the flag handling in pcpu_blance_workfn() */
pcpu_atomic_alloc_failed = true;