#include <linux/init.h>
#include <linux/fs.h>
#include <linux/xattr.h>
+#include <linux/exportfs.h>
#include <linux/generic_acl.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
-#include <linux/backing-dev.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
SGP_READ, /* don't exceed i_size, don't allocate page */
SGP_CACHE, /* don't exceed i_size, may allocate page */
SGP_WRITE, /* may exceed i_size, may allocate page */
+ SGP_FAULT, /* same as SGP_CACHE, return with page locked */
};
static int shmem_getpage(struct inode *inode, unsigned long idx,
* The above definition of ENTRIES_PER_PAGE, and the use of
* BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
* might be reconsidered if it ever diverges from PAGE_SIZE.
+ *
+ * __GFP_MOVABLE is masked out as swap vectors cannot move
*/
- return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT);
+ return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO,
+ PAGE_CACHE_SHIFT-PAGE_SHIFT);
}
static inline void shmem_dir_free(struct page *page)
vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
}
-static struct super_operations shmem_ops;
+static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
}
spin_unlock(&info->lock);
- page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
+ page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
if (page)
set_page_private(page, 0);
spin_lock(&info->lock);
/*
* shmem_free_swp - free some swap entries in a directory
*
- * @dir: pointer to the directory
- * @edir: pointer after last entry of the directory
+ * @dir: pointer to the directory
+ * @edir: pointer after last entry of the directory
+ * @punch_lock: pointer to spinlock when needed for the holepunch case
*/
-static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
+static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
+ spinlock_t *punch_lock)
{
+ spinlock_t *punch_unlock = NULL;
swp_entry_t *ptr;
int freed = 0;
for (ptr = dir; ptr < edir; ptr++) {
if (ptr->val) {
+ if (unlikely(punch_lock)) {
+ punch_unlock = punch_lock;
+ punch_lock = NULL;
+ spin_lock(punch_unlock);
+ if (!ptr->val)
+ continue;
+ }
free_swap_and_cache(*ptr);
*ptr = (swp_entry_t){0};
freed++;
}
}
+ if (punch_unlock)
+ spin_unlock(punch_unlock);
return freed;
}
-static int shmem_map_and_free_swp(struct page *subdir,
- int offset, int limit, struct page ***dir)
+static int shmem_map_and_free_swp(struct page *subdir, int offset,
+ int limit, struct page ***dir, spinlock_t *punch_lock)
{
swp_entry_t *ptr;
int freed = 0;
int size = limit - offset;
if (size > LATENCY_LIMIT)
size = LATENCY_LIMIT;
- freed += shmem_free_swp(ptr+offset, ptr+offset+size);
+ freed += shmem_free_swp(ptr+offset, ptr+offset+size,
+ punch_lock);
if (need_resched()) {
shmem_swp_unmap(ptr);
if (*dir) {
long nr_swaps_freed = 0;
int offset;
int freed;
- int punch_hole = 0;
+ int punch_hole;
+ spinlock_t *needs_lock;
+ spinlock_t *punch_lock;
+ unsigned long upper_limit;
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
info->flags |= SHMEM_TRUNCATE;
if (likely(end == (loff_t) -1)) {
limit = info->next_index;
+ upper_limit = SHMEM_MAX_INDEX;
info->next_index = idx;
+ needs_lock = NULL;
+ punch_hole = 0;
} else {
- limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (limit > info->next_index)
- limit = info->next_index;
+ if (end + 1 >= inode->i_size) { /* we may free a little more */
+ limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ upper_limit = SHMEM_MAX_INDEX;
+ } else {
+ limit = (end + 1) >> PAGE_CACHE_SHIFT;
+ upper_limit = limit;
+ }
+ needs_lock = &info->lock;
punch_hole = 1;
}
size = limit;
if (size > SHMEM_NR_DIRECT)
size = SHMEM_NR_DIRECT;
- nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size);
+ nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
}
/*
* If there are no indirect blocks or we are punching a hole
* below indirect blocks, nothing to be done.
*/
- if (!topdir || (punch_hole && (limit <= SHMEM_NR_DIRECT)))
+ if (!topdir || limit <= SHMEM_NR_DIRECT)
goto done2;
- BUG_ON(limit <= SHMEM_NR_DIRECT);
+ /*
+ * The truncation case has already dropped info->lock, and we're safe
+ * because i_size and next_index have already been lowered, preventing
+ * access beyond. But in the punch_hole case, we still need to take
+ * the lock when updating the swap directory, because there might be
+ * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
+ * shmem_writepage. However, whenever we find we can remove a whole
+ * directory page (not at the misaligned start or end of the range),
+ * we first NULLify its pointer in the level above, and then have no
+ * need to take the lock when updating its contents: needs_lock and
+ * punch_lock (either pointing to info->lock or NULL) manage this.
+ */
+
+ upper_limit -= SHMEM_NR_DIRECT;
limit -= SHMEM_NR_DIRECT;
idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
offset = idx % ENTRIES_PER_PAGE;
if (*dir) {
diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
- if (!diroff && !offset) {
- *dir = NULL;
+ if (!diroff && !offset && upper_limit >= stage) {
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ *dir = NULL;
+ spin_unlock(needs_lock);
+ needs_lock = NULL;
+ } else
+ *dir = NULL;
nr_pages_to_free++;
list_add(&middir->lru, &pages_to_free);
}
}
stage = idx + ENTRIES_PER_PAGEPAGE;
middir = *dir;
- *dir = NULL;
- nr_pages_to_free++;
- list_add(&middir->lru, &pages_to_free);
+ if (punch_hole)
+ needs_lock = &info->lock;
+ if (upper_limit >= stage) {
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ *dir = NULL;
+ spin_unlock(needs_lock);
+ needs_lock = NULL;
+ } else
+ *dir = NULL;
+ nr_pages_to_free++;
+ list_add(&middir->lru, &pages_to_free);
+ }
shmem_dir_unmap(dir);
cond_resched();
dir = shmem_dir_map(middir);
diroff = 0;
}
+ punch_lock = needs_lock;
subdir = dir[diroff];
- if (subdir && page_private(subdir)) {
+ if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
+ if (needs_lock) {
+ spin_lock(needs_lock);
+ dir[diroff] = NULL;
+ spin_unlock(needs_lock);
+ punch_lock = NULL;
+ } else
+ dir[diroff] = NULL;
+ nr_pages_to_free++;
+ list_add(&subdir->lru, &pages_to_free);
+ }
+ if (subdir && page_private(subdir) /* has swap entries */) {
size = limit - idx;
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
freed = shmem_map_and_free_swp(subdir,
- offset, size, &dir);
+ offset, size, &dir, punch_lock);
if (!dir)
dir = shmem_dir_map(middir);
nr_swaps_freed += freed;
- if (offset)
+ if (offset || punch_lock) {
spin_lock(&info->lock);
- set_page_private(subdir, page_private(subdir) - freed);
- if (offset)
+ set_page_private(subdir,
+ page_private(subdir) - freed);
spin_unlock(&info->lock);
- if (!punch_hole)
- BUG_ON(page_private(subdir) > offset);
- }
- if (offset)
- offset = 0;
- else if (subdir && !page_private(subdir)) {
- dir[diroff] = NULL;
- nr_pages_to_free++;
- list_add(&subdir->lru, &pages_to_free);
+ } else
+ BUG_ON(page_private(subdir) != freed);
}
+ offset = 0;
}
done1:
shmem_dir_unmap(dir);
* generic_delete_inode did it, before we lowered next_index.
* Also, though shmem_getpage checks i_size before adding to
* cache, no recheck after: so fix the narrow window there too.
+ *
+ * Recalling truncate_inode_pages_range and unmap_mapping_range
+ * every time for punch_hole (which never got a chance to clear
+ * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
+ * yet hardly ever necessary: try to optimize them out later.
*/
truncate_inode_pages_range(inode->i_mapping, start, end);
+ if (punch_hole)
+ unmap_mapping_range(inode->i_mapping, start,
+ end - start, 1);
}
spin_lock(&info->lock);
*nodelist++ = '\0';
if (nodelist_parse(nodelist, *policy_nodes))
goto out;
+ if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
+ goto out;
}
if (!strcmp(value, "default")) {
*policy = MPOL_DEFAULT;
err = 0;
} else if (!strcmp(value, "interleave")) {
*policy = MPOL_INTERLEAVE;
- /* Default to nodes online if no nodelist */
+ /*
+ * Default to online nodes with memory if no nodelist
+ */
if (!nodelist)
- *policy_nodes = node_online_map;
+ *policy_nodes = node_states[N_HIGH_MEMORY];
err = 0;
}
out:
return page;
}
#else
-static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static inline int shmem_parse_mpol(char *value, int *policy,
+ nodemask_t *policy_nodes)
{
return 1;
}
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
+
+ if (type)
+ *type = 0;
+
/*
* Normally, filepage is NULL on entry, and either found
* uptodate immediately, or allocated and zeroed, or read
* in under swappage, which is then assigned to filepage.
- * But shmem_prepare_write passes in a locked filepage,
- * which may be found not uptodate by other callers too,
- * and may need to be copied from the swappage read in.
+ * But shmem_readpage and shmem_write_begin pass in a locked
+ * filepage, which may be found not uptodate by other callers
+ * too, and may need to be copied from the swappage read in.
*/
repeat:
if (!filepage)
if (!swappage) {
shmem_swp_unmap(entry);
/* here we actually do the io */
- if (type && *type == VM_FAULT_MINOR) {
+ if (type && !(*type & VM_FAULT_MAJOR)) {
__count_vm_event(PGMAJFAULT);
- *type = VM_FAULT_MAJOR;
+ *type |= VM_FAULT_MAJOR;
}
spin_unlock(&info->lock);
swappage = shmem_swapin(info, swap, idx);
}
done:
if (*pagep != filepage) {
- unlock_page(filepage);
*pagep = filepage;
+ if (sgp != SGP_FAULT)
+ unlock_page(filepage);
+
}
return 0;
return error;
}
-static struct page *shmem_nopage(struct vm_area_struct *vma,
- unsigned long address, int *type)
+static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
- struct page *page = NULL;
- unsigned long idx;
int error;
+ int ret;
- idx = (address - vma->vm_start) >> PAGE_SHIFT;
- idx += vma->vm_pgoff;
- idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
- if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
- return NOPAGE_SIGBUS;
+ if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
+ return VM_FAULT_SIGBUS;
- error = shmem_getpage(inode, idx, &page, SGP_CACHE, type);
+ error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret);
if (error)
- return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
-
- mark_page_accessed(page);
- return page;
-}
-
-static int shmem_populate(struct vm_area_struct *vma,
- unsigned long addr, unsigned long len,
- pgprot_t prot, unsigned long pgoff, int nonblock)
-{
- struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
- struct mm_struct *mm = vma->vm_mm;
- enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
- unsigned long size;
-
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
- return -EINVAL;
+ return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
- while ((long) len > 0) {
- struct page *page = NULL;
- int err;
- /*
- * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
- */
- err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
- if (err)
- return err;
- /* Page may still be null, but only if nonblock was set. */
- if (page) {
- mark_page_accessed(page);
- err = install_page(mm, vma, addr, page, prot);
- if (err) {
- page_cache_release(page);
- return err;
- }
- } else if (vma->vm_flags & VM_NONLINEAR) {
- /* No page was found just because we can't read it in
- * now (being here implies nonblock != 0), but the page
- * may exist, so set the PTE to fault it in later. */
- err = install_file_pte(mm, vma, addr, pgoff, prot);
- if (err)
- return err;
- }
-
- len -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- }
- return 0;
+ mark_page_accessed(vmf->page);
+ return ret | VM_FAULT_LOCKED;
}
#ifdef CONFIG_NUMA
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
static const struct inode_operations shmem_symlink_inline_operations;
/*
- * Normally tmpfs makes no use of shmem_prepare_write, but it
- * lets a tmpfs file be used read-write below the loop driver.
+ * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
+ * but providing them allows a tmpfs file to be used for splice, sendfile, and
+ * below the loop driver, in the generic fashion that many filesystems support.
*/
-static int
-shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
+static int shmem_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
- return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL);
+ int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
+ unlock_page(page);
+ return error;
+}
+
+static int
+shmem_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ *pagep = NULL;
+ return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+}
+
+static int
+shmem_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+
+ set_page_dirty(page);
+ page_cache_release(page);
+
+ if (pos+copied > inode->i_size)
+ i_size_write(inode, pos+copied);
+
+ return copied;
}
static ssize_t
return desc.error;
}
-static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
- size_t count, read_actor_t actor, void *target)
-{
- read_descriptor_t desc;
-
- if (!count)
- return 0;
-
- desc.written = 0;
- desc.count = count;
- desc.arg.data = target;
- desc.error = 0;
-
- do_shmem_file_read(in_file, ppos, &desc, actor);
- if (desc.written)
- return desc.written;
- return desc.error;
-}
-
static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
unsigned long blocks = 0;
unsigned long inodes = 0;
int policy = MPOL_DEFAULT;
- nodemask_t policy_nodes = node_online_map;
+ nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
#ifdef CONFIG_TMPFS
/*
{
struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR) {
- inode_init_once(&p->vfs_inode);
+ inode_init_once(&p->vfs_inode);
#ifdef CONFIG_TMPFS_POSIX_ACL
- p->i_acl = NULL;
- p->i_default_acl = NULL;
+ p->i_acl = NULL;
+ p->i_default_acl = NULL;
#endif
- }
}
static int init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, 0, init_once, NULL);
+ 0, 0, init_once);
if (shmem_inode_cachep == NULL)
return -ENOMEM;
return 0;
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
- .prepare_write = shmem_prepare_write,
- .commit_write = simple_commit_write,
+ .readpage = shmem_readpage,
+ .write_begin = shmem_write_begin,
+ .write_end = shmem_write_end,
#endif
.migratepage = migrate_page,
};
.read = shmem_file_read,
.write = shmem_file_write,
.fsync = simple_sync_file,
- .sendfile = shmem_file_sendfile,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
#endif
};
#endif
};
-static struct super_operations shmem_ops = {
+static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
};
static struct vm_operations_struct shmem_vm_ops = {
- .nopage = shmem_nopage,
- .populate = shmem_populate,
+ .fault = shmem_fault,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,