The idea of a swap_device_lock per device, and a swap_list_lock over them all,
is appealing; but in practice almost every holder of swap_device_lock must
already hold swap_list_lock, which defeats the purpose of the split.
The only exceptions have been swap_duplicate, valid_swaphandles and an
untrodden path in try_to_unuse (plus a few places added in this series).
valid_swaphandles doesn't show up high in profiles, but swap_duplicate does
demand attention. However, with the hold time in get_swap_pages so much
reduced, I've not yet found a load and set of swap device priorities to show
even swap_duplicate benefitting from the split. Certainly the split is mere
overhead in the common case of a single swap device.
So, replace swap_list_lock and swap_device_lock by spinlock_t swap_lock
(generally we seem to prefer an _ in the name, and not to hide it in a macro).
If someone can show a regression in swap_duplicate, then probably we should
add a hashlock for the swap_map entries alone (shorts not being atomic), so as
to help the case of the single swap device too.
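
A minimal sketch of that hashlock idea, purely illustrative (the names
swap_map_hashlock, swap_map_lock and SWAP_HASH_BITS are hypothetical, not
part of this patch): hash each swap_map offset to one of a small array of
spinlocks, so that swap_duplicate could bump its short count without
taking the global swap_lock.

	#define SWAP_HASH_BITS	6
	#define SWAP_HASH_SIZE	(1 << SWAP_HASH_BITS)

	static spinlock_t swap_map_hashlock[SWAP_HASH_SIZE] = {
		[0 ... SWAP_HASH_SIZE - 1] = SPIN_LOCK_UNLOCKED
	};

	static inline spinlock_t *swap_map_lock(unsigned long offset)
	{
		return &swap_map_hashlock[offset & (SWAP_HASH_SIZE - 1)];
	}

	/* swap_duplicate's hot path might then look like: */
	static int swap_map_duplicate(struct swap_info_struct *p,
				      unsigned long offset)
	{
		int ok = 0;

		spin_lock(swap_map_lock(offset));
		if (p->swap_map[offset] &&
		    p->swap_map[offset] < SWAP_MAP_MAX - 1) {
			p->swap_map[offset]++;
			ok = 1;
		}
		spin_unlock(swap_map_lock(offset));
		return ok;
	}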
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
vmtruncate) does not lose sending ipi's to cloned threads that might
be spawned underneath it and go to user mode to drag in pte's into tlbs.
-swap_list_lock/swap_device_lock
--------------------------------
+swap_lock
+--------------
The swap devices are chained in priority order from the "swap_list" header.
The "swap_list" is used for the round-robin swaphandle allocation strategy.
The #free swaphandles is maintained in "nr_swap_pages". These two together
-are protected by the swap_list_lock.
+are protected by the swap_lock.
-The swap_device_lock, which is per swap device, protects the reference
-counts on the corresponding swaphandles, maintained in the "swap_map"
-array, and the "highest_bit" and "lowest_bit" fields.
+The swap_lock also protects all the device reference counts on the
+corresponding swaphandles, maintained in the "swap_map" array, and the
+"highest_bit" and "lowest_bit" fields.
-Both of these are spinlocks, and are never acquired from intr level. The
-locking hierarchy is swap_list_lock -> swap_device_lock.
+The swap_lock is a spinlock, and is never acquired from intr level.
To prevent races between swap space deletion or async readahead swapins
deciding whether a swap handle is being used, ie worthy of being read in
*/
struct swap_info_struct {
unsigned int flags;
+ int prio; /* swap priority */
struct file *swap_file;
struct block_device *bdev;
struct list_head extent_list;
unsigned int pages;
unsigned int max;
unsigned int inuse_pages;
- int prio; /* swap priority */
int next; /* next entry on swap list */
};
extern int remove_exclusive_swap_page(struct page *);
struct backing_dev_info;
-extern struct swap_list_t swap_list;
-extern spinlock_t swaplock;
-
-#define swap_list_lock() spin_lock(&swaplock)
-#define swap_list_unlock() spin_unlock(&swaplock)
-#define swap_device_lock(p) spin_lock(&p->sdev_lock)
-#define swap_device_unlock(p) spin_unlock(&p->sdev_lock)
+extern spinlock_t swap_lock;
/* linux/mm/thrash.c */
extern struct mm_struct * swap_token_mm;
*
* ->i_mmap_lock (vmtruncate)
* ->private_lock (__free_pte->__set_page_dirty_buffers)
- * ->swap_list_lock
- * ->swap_device_lock (exclusive_swap_page, others)
- * ->mapping->tree_lock
+ * ->swap_lock (exclusive_swap_page, others)
+ * ->mapping->tree_lock
*
* ->i_sem
* ->i_mmap_lock (truncate->unmap_mapping_range)
* ->page_table_lock (anon_vma_prepare and various)
*
* ->page_table_lock
- * ->swap_device_lock (try_to_unmap_one)
+ * ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
* anon_vma->lock
* mm->page_table_lock
* zone->lru_lock (in mark_page_accessed)
- * swap_list_lock (in swap_free etc's swap_info_get)
+ * swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
- * swap_device_lock (in swap_duplicate, swap_info_get)
* mapping->private_lock (in __set_page_dirty_buffers)
* inode_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
#include <asm/tlbflush.h>
#include <linux/swapops.h>
-DEFINE_SPINLOCK(swaplock);
+DEFINE_SPINLOCK(swap_lock);
unsigned int nr_swapfiles;
long total_swap_pages;
static int swap_overflow;
/*
* We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_list_lock while calling the unplug_fn. And swap_list_lock
+ * hold swap_lock while calling the unplug_fn. And swap_lock
* cannot be turned into a semaphore.
*/
static DECLARE_RWSEM(swap_unplug_sem);
si->cluster_nr = SWAPFILE_CLUSTER - 1;
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER)
goto lowest;
- swap_device_unlock(si);
+ spin_unlock(&swap_lock);
offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
if (si->swap_map[offset])
last_in_cluster = offset + SWAPFILE_CLUSTER;
else if (offset == last_in_cluster) {
si->cluster_next = offset-SWAPFILE_CLUSTER-1;
goto cluster;
}
latency_ration = LATENCY_LIMIT;
}
}
- swap_device_unlock(si);
+ spin_unlock(&swap_lock);
while (++offset <= si->highest_bit) {
if (!si->swap_map[offset]) {
goto checks;
}
if (unlikely(--latency_ration < 0)) {
latency_ration = LATENCY_LIMIT;
}
}
int type, next;
int wrapped = 0;
- swap_list_lock();
+ spin_lock(&swap_lock);
if (nr_swap_pages <= 0)
goto noswap;
nr_swap_pages--;
continue;
swap_list.next = next;
- swap_device_lock(si);
- swap_list_unlock();
offset = scan_swap_map(si);
- swap_device_unlock(si);
- if (offset)
+ if (offset) {
+ spin_unlock(&swap_lock);
return swp_entry(type, offset);
+ }
next = swap_list.next;
}
nr_swap_pages++;
noswap:
+ spin_unlock(&swap_lock);
return (swp_entry_t) {0};
}
goto bad_offset;
if (!p->swap_map[offset])
goto bad_free;
- swap_list_lock();
- swap_device_lock(p);
+ spin_lock(&swap_lock);
-static void swap_info_put(struct swap_info_struct * p)
-{
- swap_device_unlock(p);
- swap_list_unlock();
-}
-
static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
{
int count = p->swap_map[offset];
p = swap_info_get(entry);
if (p) {
swap_entry_free(p, swp_offset(entry));
+ spin_unlock(&swap_lock);
if (p) {
/* Subtract the 1 for the swap cache itself */
count = p->swap_map[swp_offset(entry)] - 1;
+ spin_unlock(&swap_lock);
}
write_unlock_irq(&swapper_space.tree_lock);
}
+ spin_unlock(&swap_lock);
if (retval) {
swap_free(entry);
if (p) {
if (swap_entry_free(p, swp_offset(entry)) == 1)
page = find_trylock_page(&swapper_space, entry.val);
+ spin_unlock(&swap_lock);
}
if (page) {
int one_user;
- * No need for swap_device_lock(si) here: we're just looking
+ * No need for swap_lock here: we're just looking
* for whether an entry is in use, not modifying it; false
* hits are okay, and sys_swapoff() has already prevented new
- * allocations from this area (while holding swap_list_lock()).
+ * allocations from this area (while holding swap_lock).
*/
for (;;) {
if (++i >= max) {
* report them; but do report if we reset SWAP_MAP_MAX.
*/
if (*swap_map == SWAP_MAP_MAX) {
- swap_device_unlock(si);
+ spin_unlock(&swap_lock);
- * After a successful try_to_unuse, if no swap is now in use, we know we
- * can empty the mmlist. swap_list_lock must be held on entry and exit.
- * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * After a successful try_to_unuse, if no swap is now in use, we know
+ * we can empty the mmlist. swap_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_lock, and an mm must be
* added to the mmlist just after page_duplicate - before would be racy.
*/
static void drain_mmlist(void)
mapping = victim->f_mapping;
prev = -1;
for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
p = swap_info + type;
if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) {
}
if (type < 0) {
err = -EINVAL;
+ spin_unlock(&swap_lock);
goto out_dput;
}
if (!security_vm_enough_memory(p->pages))
vm_unacct_memory(p->pages);
else {
err = -ENOMEM;
+ spin_unlock(&swap_lock);
goto out_dput;
}
if (prev < 0) {
}
nr_swap_pages -= p->pages;
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
- swap_device_unlock(p);
- swap_list_unlock();
+ spin_unlock(&swap_lock);
current->flags |= PF_SWAPOFF;
err = try_to_unuse(type);
if (err) {
/* re-insert swap space back into swap_list */
for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next)
if (p->prio >= swap_info[i].prio)
break;
swap_info[prev].next = p - swap_info;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
- swap_device_unlock(p);
- swap_list_unlock();
+ spin_unlock(&swap_lock);
down_write(&swap_unplug_sem);
up_write(&swap_unplug_sem);
+ destroy_swap_extents(p);
+ down(&swapon_sem);
+ spin_lock(&swap_lock);
+ drain_mmlist();
+
/* wait for anyone still in scan_swap_map */
p->highest_bit = 0; /* cuts scans short */
while (p->flags >= SWP_SCANNING) {
+ spin_unlock(&swap_lock);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(1);
- destroy_swap_extents(p);
- down(&swapon_sem);
- swap_list_lock();
- drain_mmlist();
- swap_device_lock(p);
swap_file = p->swap_file;
p->swap_file = NULL;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
- swap_device_unlock(p);
- swap_list_unlock();
+ spin_unlock(&swap_lock);
up(&swapon_sem);
vfree(swap_map);
inode = mapping->host;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
p = swap_info;
for (type = 0 ; type < nr_swapfiles ; type++,p++)
if (!(p->flags & SWP_USED))
* swp_entry_t or the architecture definition of a swap pte.
*/
if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
+ spin_unlock(&swap_lock);
goto out;
}
if (type >= nr_swapfiles)
p->highest_bit = 0;
p->cluster_nr = 0;
p->inuse_pages = 0;
- spin_lock_init(&p->sdev_lock);
p->next = -1;
if (swap_flags & SWAP_FLAG_PREFER) {
p->prio =
} else {
p->prio = --least_priority;
}
+ spin_unlock(&swap_lock);
name = getname(specialfile);
error = PTR_ERR(name);
if (IS_ERR(name)) {
- swap_list_lock();
- swap_device_lock(p);
+ spin_lock(&swap_lock);
p->flags = SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
} else {
swap_info[prev].next = p - swap_info;
}
- swap_device_unlock(p);
- swap_list_unlock();
+ spin_unlock(&swap_lock);
up(&swapon_sem);
error = 0;
goto out;
}
destroy_swap_extents(p);
bad_swap_2:
swap_map = p->swap_map;
p->swap_file = NULL;
p->swap_map = NULL;
p->flags = 0;
if (!(swap_flags & SWAP_FLAG_PREFER))
++least_priority;
+ spin_unlock(&swap_lock);
vfree(swap_map);
if (swap_file)
filp_close(swap_file, NULL);
unsigned int i;
unsigned long nr_to_be_unused = 0;
- swap_list_lock();
+ spin_lock(&swap_lock);
for (i = 0; i < nr_swapfiles; i++) {
if (!(swap_info[i].flags & SWP_USED) ||
(swap_info[i].flags & SWP_WRITEOK))
}
val->freeswap = nr_swap_pages + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
+ spin_unlock(&swap_lock);
p = type + swap_info;
offset = swp_offset(entry);
- swap_device_lock(p);
+ spin_lock(&swap_lock);
if (offset < p->max && p->swap_map[offset]) {
if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
p->swap_map[offset]++;
+ spin_unlock(&swap_lock);
- * swap_device_lock prevents swap_map being freed. Don't grab an extra
+ * swap_lock prevents swap_map being freed. Don't grab an extra
* reference on the swaphandle, it doesn't matter if it becomes unused.
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
toff++, i--;
*offset = toff;
- swap_device_lock(swapdev);
+ spin_lock(&swap_lock);
do {
/* Don't read-ahead past the end of the swap area */
if (toff >= swapdev->max)
toff++;
ret++;
} while (--i);
- swap_device_unlock(swapdev);
+ spin_unlock(&swap_lock);
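
For reference, the shape swap_free takes once all the hunks above are
applied (a sketch assembled from this patch, not an additional hunk):
swap_info_get() now returns with swap_lock held, and its callers pair it
with a bare spin_unlock, the old swap_info_put() wrapper having been
removed.

	void swap_free(swp_entry_t entry)
	{
		struct swap_info_struct *p;

		p = swap_info_get(entry);	/* returns with swap_lock held */
		if (p) {
			swap_entry_free(p, swp_offset(entry));
			spin_unlock(&swap_lock);
		}
	}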