	unsigned long inflated_addr;
	unsigned long inflated_offset;

	if (len > TASK_SIZE)
		return -ENOMEM;

	get_area = current->mm->get_unmapped_area;
	addr = get_area(file, uaddr, len, pgoff, flags);

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
		return addr;
	if (IS_ERR_VALUE(addr))
		return addr;
	if (addr & ~PAGE_MASK)
		return addr;
	if (addr > TASK_SIZE - len)
		return addr;

	if (shmem_huge == SHMEM_HUGE_DENY)
		return addr;
	if (len < HPAGE_PMD_SIZE)
		return addr;
	if (flags & MAP_FIXED)
		return addr;
	/*
	 * Our priority is to support MAP_SHARED mapped hugely;
	 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
	 * But if caller specified an address hint, respect that as before.
	 */
	if (uaddr)
		return addr;

	if (shmem_huge != SHMEM_HUGE_FORCE) {
		struct super_block *sb;

		if (file) {
			VM_BUG_ON(file->f_op != &shmem_file_operations);
			sb = file_inode(file)->i_sb;
		} else {
			/*
			 * Called directly from mm/mmap.c, or drivers/char/mem.c
			 * for "/dev/zero", to create a shared anonymous object.
			 */
			if (IS_ERR(shm_mnt))
				return addr;
			sb = shm_mnt->mnt_sb;
		}
		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
			return addr;
	}

	offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
	if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
		return addr;
	if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
		return addr;

	inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
	if (inflated_len > TASK_SIZE)
		return addr;
	if (inflated_len < len)
		return addr;

	inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
	if (IS_ERR_VALUE(inflated_addr))
		return addr;
	if (inflated_addr & ~PAGE_MASK)
		return addr;
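
	/*
	 * Added commentary (not in the original source): the arithmetic
	 * below nudges inflated_addr so that its offset within a huge page
	 * matches the file offset.  E.g. with HPAGE_PMD_SIZE = 2M,
	 * offset = 0x3000 and inflated_offset = 0x1000, the address moves
	 * up by 0x2000; had inflated_offset been the larger of the two, the
	 * subtraction would land below the area we were given, so a full
	 * HPAGE_PMD_SIZE is added back to stay inside the inflated range.
	 */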

	inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += HPAGE_PMD_SIZE;

	if (inflated_addr > TASK_SIZE - len)
		return addr;
	return inflated_addr;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
	struct inode *inode = file_inode(vma->vm_file);
	return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
					  unsigned long addr)
{
	struct inode *inode = file_inode(vma->vm_file);
	pgoff_t index;

	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
}
#endif

int shmem_lock(struct file *file, int lock, struct user_struct *user)
{
	struct inode *inode = file_inode(file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int retval = -ENOMEM;

	spin_lock_irq(&info->lock);
	if (lock && !(info->flags & VM_LOCKED)) {
		if (!user_shm_lock(inode->i_size, user))
			goto out_nomem;
		info->flags |= VM_LOCKED;
		mapping_set_unevictable(file->f_mapping);
	}
	if (!lock && (info->flags & VM_LOCKED) && user) {
		user_shm_unlock(inode->i_size, user);
		info->flags &= ~VM_LOCKED;
		mapping_clear_unevictable(file->f_mapping);
	}
	retval = 0;
out_nomem:
	spin_unlock_irq(&info->lock);
	return retval;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &shmem_vm_ops;
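	/*
	 * Added commentary (not in the original source): the test below
	 * checks whether the VMA spans at least one fully PMD-aligned,
	 * PMD-sized region, i.e. rounding vm_start up and vm_end down to
	 * HPAGE_PMD_SIZE still leaves a non-empty range; only then is it
	 * worth registering the mm with khugepaged.
	 */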
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
			(vma->vm_end & HPAGE_PMD_MASK)) {
		khugepaged_enter(vma, vma->vm_flags);
	}
	return 0;
}

static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
				     umode_t mode, dev_t dev, unsigned long flags)
{
	struct inode *inode;
	struct shmem_inode_info *info;
	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

	if (shmem_reserve_inode(sb))
		return NULL;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode_init_owner(inode, dir, mode);
		inode->i_blocks = 0;
		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
		inode->i_generation = get_seconds();
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
		info->seals = F_SEAL_SEAL;
		info->flags = flags & VM_NORESERVE;
		INIT_LIST_HEAD(&info->shrinklist);
		INIT_LIST_HEAD(&info->swaplist);
		simple_xattrs_init(&info->xattrs);
		cache_no_acl(inode);

		switch (mode & S_IFMT) {
		default:
			inode->i_op = &shmem_special_inode_operations;
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_mapping->a_ops = &shmem_aops;
			inode->i_op = &shmem_inode_operations;
			inode->i_fop = &shmem_file_operations;
			mpol_shared_policy_init(&info->policy,
						 shmem_get_sbmpol(sbinfo));
			break;
		case S_IFDIR:
			inc_nlink(inode);
			/* Some things misbehave if size == 0 on a directory */
			inode->i_size = 2 * BOGO_DIRENT_SIZE;
			inode->i_op = &shmem_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;
			break;
		case S_IFLNK:
			/*
			 * Must not load anything in the rbtree,
			 * mpol_free_shared_policy will not be called.
			 */
			mpol_shared_policy_init(&info->policy, NULL);
			break;
		}
	} else
		shmem_free_inode(sb);
	return inode;
}

bool shmem_mapping(struct address_space *mapping)
{
	return mapping->a_ops == &shmem_aops;
}

int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
			   pmd_t *dst_pmd,
			   struct vm_area_struct *dst_vma,
			   unsigned long dst_addr,
			   unsigned long src_addr,
			   struct page **pagep)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct mem_cgroup *memcg;
	spinlock_t *ptl;
	void *page_kaddr;
	struct page *page;
	pte_t _dst_pte, *dst_pte;
	int ret;

	ret = -ENOMEM;
	if (shmem_acct_block(info->flags, 1))
		goto out;
	if (sbinfo->max_blocks) {
		if (percpu_counter_compare(&sbinfo->used_blocks,
					   sbinfo->max_blocks) >= 0)
			goto out_unacct_blocks;
		percpu_counter_inc(&sbinfo->used_blocks);
	}

	if (!*pagep) {
		page = shmem_alloc_page(gfp, info, pgoff);
		if (!page)
			goto out_dec_used_blocks;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr, (const void __user *)src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			*pagep = page;
			if (sbinfo->max_blocks)
				percpu_counter_add(&sbinfo->used_blocks, -1);
			shmem_unacct_blocks(info->flags, 1);
			/* don't free the page */
			return -EFAULT;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
	__SetPageLocked(page);
	__SetPageSwapBacked(page);
	ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
	if (ret)
		goto out_release;

	ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
	if (!ret) {
		ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
		radix_tree_preload_end();
	}
	if (ret)
		goto out_release_uncharge;

	mem_cgroup_commit_charge(page, memcg, false, false);

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	ret = -EEXIST;
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	lru_cache_add_anon(page);

	spin_lock(&info->lock);
	info->alloced++;
	inode->i_blocks += BLOCKS_PER_PAGE;
	shmem_recalc_inode(inode);
	spin_unlock(&info->lock);

	inc_mm_counter(dst_mm, mm_counter_file(page));
	page_add_file_rmap(page, false);
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	unlock_page(page);
	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
out_release_uncharge:
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
out_dec_used_blocks:
	if (sbinfo->max_blocks)
		percpu_counter_add(&sbinfo->used_blocks, -1);
out_unacct_blocks:
	shmem_unacct_blocks(info->flags, 1);
	goto out;
}

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
#else
#define shmem_initxattrs NULL
#endif

static int
shmem_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct shmem_inode_info *info = SHMEM_I(inode);
	pgoff_t index = pos >> PAGE_SHIFT;

	/* i_mutex is held by caller */
	if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
		if (info->seals & F_SEAL_WRITE)
			return -EPERM;
		if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
			return -EPERM;
	}

	return shmem_getpage(inode, index, pagep, SGP_WRITE);
}

static int
shmem_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	if (!PageUptodate(page)) {
		struct page *head = compound_head(page);
		if (PageTransCompound(page)) {
			int i;

			for (i = 0; i < HPAGE_PMD_NR; i++) {
				if (head + i == page)
					continue;
				clear_highpage(head + i);
				flush_dcache_page(head + i);
			}
		}
		if (copied < PAGE_SIZE) {
			unsigned from = pos & (PAGE_SIZE - 1);
			zero_user_segments(page, 0, from,
					from + copied, PAGE_SIZE);
		}
		SetPageUptodate(head);
	}
	set_page_dirty(page);
	unlock_page(page);
	put_page(page);

	return copied;
}

static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	unsigned long offset;
	enum sgp_type sgp = SGP_READ;
	int error = 0;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;

	/*
	 * Might this read be for a stacking filesystem?  Then when reading
	 * holes of a sparse file, we actually need to allocate those pages,
	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
	 */
	if (!iter_is_iovec(to))
		sgp = SGP_CACHE;

	index = *ppos >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

	for (;;) {
		struct page *page = NULL;
		pgoff_t end_index;
		unsigned long nr, ret;
		loff_t i_size = i_size_read(inode);

		end_index = i_size >> PAGE_SHIFT;
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset)
				break;
		}

		error = shmem_getpage(inode, index, &page, sgp);
		if (error) {
			if (error == -EINVAL)
				error = 0;
			break;
		}
		if (page) {
			if (sgp == SGP_CACHE)
				set_page_dirty(page);
			unlock_page(page);
		}

		/*
		 * We must evaluate after, since reads (unlike writes)
		 * are called without i_mutex protection against truncate
		 */
		nr = PAGE_SIZE;
		i_size = i_size_read(inode);
		end_index = i_size >> PAGE_SHIFT;
		if (index == end_index) {
			nr = i_size & ~PAGE_MASK;
			if (nr <= offset) {
				if (page)
					put_page(page);
				break;
			}
		}
		nr -= offset;

		if (page) {
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (mapping_writably_mapped(mapping))
				flush_dcache_page(page);
			/*
			 * Mark the page accessed if we read the beginning.
			 */
			if (!offset)
				mark_page_accessed(page);
		} else {
			page = ZERO_PAGE(0);
			get_page(page);
		}

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, to);
		retval += ret;
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;

		put_page(page);
		if (!iov_iter_count(to))
			break;
		if (ret < nr) {
			error = -EFAULT;
			break;
		}
		cond_resched();
	}

	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
	file_accessed(file);
	return retval ? retval : error;
}

/*
 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
 */
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
				    pgoff_t index, pgoff_t end, int whence)
{
	struct page *page;
	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	bool done = false;
	int i;

	pagevec_init(&pvec, 0);
	pvec.nr = 1;		/* start small: we may be there already */
	while (!done) {
		pvec.nr = find_get_entries(mapping, index,
					pvec.nr, pvec.pages, indices);
		if (!pvec.nr) {
			if (whence == SEEK_DATA)
				index = end;
			break;
		}
		for (i = 0; i < pvec.nr; i++, index++) {
			if (index < indices[i]) {
				if (whence == SEEK_HOLE) {
					done = true;
					break;
				}
				index = indices[i];
			}
			page = pvec.pages[i];
			if (page && !radix_tree_exceptional_entry(page)) {
				if (!PageUptodate(page))
					page = NULL;
			}
			if (index >= end ||
			    (page && whence == SEEK_DATA) ||
			    (!page && whence == SEEK_HOLE)) {
				done = true;
				break;
			}
		}
		pagevec_remove_exceptionals(&pvec);
		pagevec_release(&pvec);
		pvec.nr = PAGEVEC_SIZE;
		cond_resched();
	}
	return index;
}

static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t start, end;
	loff_t new_offset;

	if (whence != SEEK_DATA && whence != SEEK_HOLE)
		return generic_file_llseek_size(file, offset, whence,
					MAX_LFS_FILESIZE, i_size_read(inode));
	inode_lock(inode);
	/* We're holding i_mutex so we can access i_size directly */

	if (offset < 0)
		offset = -EINVAL;
	else if (offset >= inode->i_size)
		offset = -ENXIO;
	else {
		start = offset >> PAGE_SHIFT;
		end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
		if (new_offset > offset) {
			if (new_offset < inode->i_size)
				offset = new_offset;
			else if (whence == SEEK_DATA)
				offset = -ENXIO;
			else
				offset = inode->i_size;
		}
	}

	if (offset >= 0)
		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
	inode_unlock(inode);
	return offset;
}

/*
 * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on shmem.
 */
#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
#define LAST_SCAN               4       /* about 150ms max */

static void shmem_tag_pins(struct address_space *mapping)
{
	struct radix_tree_iter iter;
	void **slot;
	pgoff_t start;
	struct page *page;

	lru_add_drain();
	start = 0;
	rcu_read_lock();

	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		page = radix_tree_deref_slot(slot);
		if (!page || radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page)) {
				slot = radix_tree_iter_retry(&iter);
				continue;
			}
		} else if (page_count(page) - page_mapcount(page) > 1) {
			spin_lock_irq(&mapping->tree_lock);
			radix_tree_tag_set(&mapping->page_tree, iter.index,
					   SHMEM_TAG_PINNED);
			spin_unlock_irq(&mapping->tree_lock);
		}

		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those pages to avoid races.
 */
static int shmem_wait_for_pins(struct address_space *mapping)
{
	struct radix_tree_iter iter;
	void **slot;
	pgoff_t start;
	struct page *page;
	int error, scan;

	shmem_tag_pins(mapping);

	error = 0;
	for (scan = 0; scan <= LAST_SCAN; scan++) {
		if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
			break;

		if (!scan)
			lru_add_drain_all();
		else if (schedule_timeout_killable((HZ << scan) / 200))
			scan = LAST_SCAN;

		start = 0;
		rcu_read_lock();
		radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
					   start, SHMEM_TAG_PINNED) {

			page = radix_tree_deref_slot(slot);
			if (radix_tree_exception(page)) {
				if (radix_tree_deref_retry(page)) {
					slot = radix_tree_iter_retry(&iter);
					continue;
				}

				page = NULL;
			}

			if (page &&
			    page_count(page) - page_mapcount(page) != 1) {
				if (scan < LAST_SCAN)
					goto continue_resched;

				/*
				 * On the last scan, we clean up all those tags
				 * we inserted; but make a note that we still
				 * found pages pinned.
				 */
				error = -EBUSY;
			}

			spin_lock_irq(&mapping->tree_lock);
			radix_tree_tag_clear(&mapping->page_tree,
					     iter.index, SHMEM_TAG_PINNED);
			spin_unlock_irq(&mapping->tree_lock);
continue_resched:
			if (need_resched()) {
				slot = radix_tree_iter_resume(slot, &iter);
				cond_resched_rcu();
			}
		}
		rcu_read_unlock();
	}

	return error;
}

#define F_ALL_SEALS (F_SEAL_SEAL | \
		     F_SEAL_SHRINK | \
		     F_SEAL_GROW | \
		     F_SEAL_WRITE)

int shmem_add_seals(struct file *file, unsigned int seals)
{
	struct inode *inode = file_inode(file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	int error;

	/*
	 * SEALING
	 * Sealing allows multiple parties to share a shmem-file but restrict
	 * access to a specific subset of file operations. Seals can only be
	 * added, but never removed. This way, mutually untrusted parties can
	 * share common memory regions with a well-defined policy. A malicious
	 * peer can thus never perform unwanted operations on a shared object.
	 *
	 * Seals are only supported on special shmem-files and always affect
	 * the whole underlying inode. Once a seal is set, it may prevent some
	 * kinds of access to the file. Currently, the following seals are
	 * defined:
	 *   SEAL_SEAL: Prevent further seals from being set on this file
	 *   SEAL_SHRINK: Prevent the file from shrinking
	 *   SEAL_GROW: Prevent the file from growing
	 *   SEAL_WRITE: Prevent write access to the file
	 *
	 * As we don't require any trust relationship between two parties, we
	 * must prevent seals from being removed. Therefore, sealing a file
	 * only adds a given set of seals to the file, it never touches
	 * existing seals. Furthermore, the "setting seals"-operation can be
	 * sealed itself, which basically prevents any further seal from being
	 * added.
	 *
	 * Semantics of sealing are only defined on volatile files. Only
	 * anonymous shmem files support sealing. More importantly, seals are
	 * never written to disk. Therefore, there's no plan to support it on
	 * other file types.
	 */

	if (file->f_op != &shmem_file_operations)
		return -EINVAL;
	if (!(file->f_mode & FMODE_WRITE))
		return -EPERM;
	if (seals & ~(unsigned int)F_ALL_SEALS)
		return -EINVAL;

	inode_lock(inode);

	if (info->seals & F_SEAL_SEAL) {
		error = -EPERM;
		goto unlock;
	}

	if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
		error = mapping_deny_writable(file->f_mapping);
		if (error)
			goto unlock;

		error = shmem_wait_for_pins(file->f_mapping);
		if (error) {
			mapping_allow_writable(file->f_mapping);
			goto unlock;
		}
	}

	info->seals |= seals;
	error = 0;

unlock:
	inode_unlock(inode);
	return error;
}
EXPORT_SYMBOL_GPL(shmem_add_seals);

int shmem_get_seals(struct file *file)
{
	if (file->f_op != &shmem_file_operations)
		return -EINVAL;

	return SHMEM_I(file_inode(file))->seals;
}
EXPORT_SYMBOL_GPL(shmem_get_seals);

long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	long error;

	switch (cmd) {
	case F_ADD_SEALS:
		/* disallow upper 32bit */
		if (arg > UINT_MAX)
			return -EINVAL;

		error = shmem_add_seals(file, arg);
		break;
	case F_GET_SEALS:
		error = shmem_get_seals(file);
		break;
	default:
		error = -EINVAL;
		break;
	}

	return error;
}
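
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the sealing interface above is typically driven from user space through
 * memfd_create(2) and fcntl(2), roughly like this:
 *
 *	int fd = memfd_create("region", MFD_ALLOW_SEALING);
 *	ftruncate(fd, size);
 *	// ... populate the region via mmap() or write() ...
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE);
 *	// the descriptor can now be handed to an untrusted peer, which can
 *	// inspect the seals with fcntl(fd, F_GET_SEALS)
 */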

static long shmem_fallocate(struct file *file, int mode, loff_t offset,
							 loff_t len)
{
	struct inode *inode = file_inode(file);
	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
	struct shmem_inode_info *info = SHMEM_I(inode);
	struct shmem_falloc shmem_falloc;
	pgoff_t start, index, end;
	int error;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	inode_lock(inode);

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		struct address_space *mapping = file->f_mapping;
		loff_t unmap_start = round_up(offset, PAGE_SIZE);
		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
		/* protected by i_mutex */
		if (info->seals & F_SEAL_WRITE) {
			error = -EPERM;
			goto out;
		}

		shmem_falloc.waitq = &shmem_falloc_waitq;
		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
		spin_lock(&inode->i_lock);
		inode->i_private = &shmem_falloc;
		spin_unlock(&inode->i_lock);

		if ((u64)unmap_end > (u64)unmap_start)
			unmap_mapping_range(mapping, unmap_start,
					    1 + unmap_end - unmap_start, 0);
		shmem_truncate_range(inode, offset, offset + len - 1);
		/* No need to unmap again: hole-punching leaves COWed pages */

		spin_lock(&inode->i_lock);
		inode->i_private = NULL;
		wake_up_all(&shmem_falloc_waitq);
		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
		spin_unlock(&inode->i_lock);
		error = 0;
		goto out;
	}

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	start = offset >> PAGE_SHIFT;
	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* Try to avoid a swapstorm if len is impossible to satisfy */
	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
		error = -ENOSPC;
		goto out;
	}

	shmem_falloc.waitq = NULL;
	shmem_falloc.start = start;
	shmem_falloc.next  = start;
	shmem_falloc.nr_falloced = 0;
	shmem_falloc.nr_unswapped = 0;
	spin_lock(&inode->i_lock);
	inode->i_private = &shmem_falloc;
	spin_unlock(&inode->i_lock);

	for (index = start; index < end; index++) {
		struct page *page;

		/*
		 * Good, the fallocate(2) manpage permits EINTR: we may have
		 * been interrupted because we are using up too much memory.
		 */
		if (signal_pending(current))
			error = -EINTR;
		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
			error = -ENOMEM;
		else
			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
		if (error) {
			/* Remove the !PageUptodate pages we added */
			if (index > start) {
				shmem_undo_range(inode,
				    (loff_t)start << PAGE_SHIFT,
				    ((loff_t)index << PAGE_SHIFT) - 1, true);
			}
			goto undone;
		}

		/*
		 * Inform shmem_writepage() how far we have reached.
		 * No need for lock or barrier: we have the page lock.
		 */
		shmem_falloc.next++;
		if (!PageUptodate(page))
			shmem_falloc.nr_falloced++;

		/*
		 * If !PageUptodate, leave it that way so that freeable pages
		 * can be recognized if we need to rollback on error later.
		 * But set_page_dirty so that memory pressure will swap rather
		 * than free the pages we are allocating (and SGP_CACHE pages
		 * might still be clean: we now need to mark those dirty too).
		 */
		set_page_dirty(page);
		unlock_page(page);
		put_page(page);
		cond_resched();
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = current_time(inode);
undone:
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	inode_unlock(inode);
	return error;
}

static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

	buf->f_type = TMPFS_MAGIC;
	buf->f_bsize = PAGE_SIZE;
	buf->f_namelen = NAME_MAX;
	if (sbinfo->max_blocks) {
		buf->f_blocks = sbinfo->max_blocks;
		buf->f_bavail =
		buf->f_bfree  = sbinfo->max_blocks -
				percpu_counter_sum(&sbinfo->used_blocks);
	}
	if (sbinfo->max_inodes) {
		buf->f_files = sbinfo->max_inodes;
		buf->f_ffree = sbinfo->free_inodes;
	}
	/* else leave those fields 0 like simple_statfs */
	return 0;
}

/*
 * File creation. Allocate an inode, and we're done..
 */
static int
shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
	if (inode) {
		error = simple_acl_create(dir, inode);
		if (error)
			goto out_iput;
		error = security_inode_init_security(inode, dir,
						     &dentry->d_name,
						     shmem_initxattrs, NULL);
		if (error && error != -EOPNOTSUPP)
			goto out_iput;
		error = 0;
		dir->i_size += BOGO_DIRENT_SIZE;
		dir->i_ctime = dir->i_mtime = current_time(dir);
		d_instantiate(dentry, inode);
		dget(dentry); /* Extra count - pin the dentry in core */
	}
	return error;
out_iput:
	iput(inode);
	return error;
}

static int
shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode;
	int error = -ENOSPC;

	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
	if (inode) {
		error = security_inode_init_security(inode, dir,
						     NULL,
						     shmem_initxattrs, NULL);
		if (error && error != -EOPNOTSUPP)
			goto out_iput;
		error = simple_acl_create(dir, inode);
		if (error)
			goto out_iput;