memory cgroup enhancements: force_empty interface for dropping all account in empty cgroup

[linux-2.6] / mm / shmem.c
diff --git a/mm/shmem.c b/mm/shmem.c

index e577adf4ae85530a39175cdba516ef8dadcf7327..0f246c44a5744ec5aac0930c7896923e96bcb215 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -80,6 +80,7 @@
  enum sgp_type {
         SGP_READ,       /* don't exceed i_size, don't allocate page */
         SGP_CACHE,      /* don't exceed i_size, may allocate page */
+       SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
         SGP_WRITE,      /* may exceed i_size, may allocate page */
  };
  
@@ -192,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
  };
  
  static LIST_HEAD(shmem_swaplist);
-static DEFINE_SPINLOCK(shmem_swaplist_lock);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
  
  static void shmem_free_blocks(struct inode *inode, long pages)
  {
@@ -795,9 +796,9 @@ static void shmem_delete_inode(struct inode *inode)
                 inode->i_size = 0;
                 shmem_truncate(inode);
                 if (!list_empty(&info->swaplist)) {
-                       spin_lock(&shmem_swaplist_lock);
+                       mutex_lock(&shmem_swaplist_mutex);
                         list_del_init(&info->swaplist);
-                       spin_unlock(&shmem_swaplist_lock);
+                       mutex_unlock(&shmem_swaplist_mutex);
                 }
         }
         BUG_ON(inode->i_blocks);
@@ -827,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
         struct page *subdir;
         swp_entry_t *ptr;
         int offset;
+       int error;
  
         idx = 0;
         ptr = info->i_direct;
         spin_lock(&info->lock);
+       if (!info->swapped) {
+               list_del_init(&info->swaplist);
+               goto lost2;
+       }
         limit = info->next_index;
         size = limit;
         if (size > SHMEM_NR_DIRECT)
                 size = SHMEM_NR_DIRECT;
         offset = shmem_find_swp(entry, ptr, ptr+size);
-       if (offset >= 0) {
-               shmem_swp_balance_unmap();
+       if (offset >= 0)
                 goto found;
-       }
         if (!info->i_indirect)
                 goto lost2;
  
@@ -849,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
         for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
                 if (unlikely(idx == stage)) {
                         shmem_dir_unmap(dir-1);
+                       if (cond_resched_lock(&info->lock)) {
+                               /* check it has not been truncated */
+                               if (limit > info->next_index) {
+                                       limit = info->next_index;
+                                       if (idx >= limit)
+                                               goto lost2;
+                               }
+                       }
                         dir = shmem_dir_map(info->i_indirect) +
                             ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
                         while (!*dir) {
@@ -869,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
                         if (size > ENTRIES_PER_PAGE)
                                 size = ENTRIES_PER_PAGE;
                         offset = shmem_find_swp(entry, ptr, ptr+size);
+                       shmem_swp_unmap(ptr);
                         if (offset >= 0) {
                                 shmem_dir_unmap(dir);
                                 goto found;
                         }
-                       shmem_swp_unmap(ptr);
                 }
         }
  lost1:
@@ -883,21 +895,63 @@ lost2:
         return 0;
  found:
         idx += offset;
-       inode = &info->vfs_inode;
-       if (add_to_page_cache(page, inode->i_mapping, idx, GFP_ATOMIC) == 0) {
+       inode = igrab(&info->vfs_inode);
+       spin_unlock(&info->lock);
+
+       /*
+        * Move _head_ to start search for next from here.
+        * But be careful: shmem_delete_inode checks list_empty without taking
+        * mutex, and there's an instant in list_move_tail when info->swaplist
+        * would appear empty, if it were the only one on shmem_swaplist.  We
+        * could avoid doing it if inode NULL; or use this minor optimization.
+        */
+       if (shmem_swaplist.next != &info->swaplist)
+               list_move_tail(&shmem_swaplist, &info->swaplist);
+       mutex_unlock(&shmem_swaplist_mutex);
+
+       error = 1;
+       if (!inode)
+               goto out;
+       error = radix_tree_preload(GFP_KERNEL);
+       if (error)
+               goto out;
+       error = 1;
+
+       spin_lock(&info->lock);
+       ptr = shmem_swp_entry(info, idx, NULL);
+       if (ptr && ptr->val == entry.val)
+               error = add_to_page_cache(page, inode->i_mapping,
+                                               idx, GFP_NOWAIT);
+       if (error == -EEXIST) {
+               struct page *filepage = find_get_page(inode->i_mapping, idx);
+               error = 1;
+               if (filepage) {
+                       /*
+                        * There might be a more uptodate page coming down
+                        * from a stacked writepage: forget our swappage if so.
+                        */
+                       if (PageUptodate(filepage))
+                               error = 0;
+                       page_cache_release(filepage);
+               }
+       }
+       if (!error) {
                 delete_from_swap_cache(page);
                 set_page_dirty(page);
                 info->flags |= SHMEM_PAGEIN;
-               shmem_swp_set(info, ptr + offset, 0);
+               shmem_swp_set(info, ptr, 0);
+               swap_free(entry);
+               error = 1;      /* not an error, but entry was found */
         }
-       shmem_swp_unmap(ptr);
+       if (ptr)
+               shmem_swp_unmap(ptr);
         spin_unlock(&info->lock);
-       /*
-        * Decrement swap count even when the entry is left behind:
-        * try_to_unuse will skip over mms, then reincrement count.
-        */
-       swap_free(entry);
-       return 1;
+       radix_tree_preload_end();
+out:
+       unlock_page(page);
+       page_cache_release(page);
+       iput(inode);            /* allows for NULL */
+       return error;
  }
  
  /*
@@ -909,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
         struct shmem_inode_info *info;
         int found = 0;
  
-       spin_lock(&shmem_swaplist_lock);
+       mutex_lock(&shmem_swaplist_mutex);
         list_for_each_safe(p, next, &shmem_swaplist) {
                 info = list_entry(p, struct shmem_inode_info, swaplist);
-               if (!info->swapped)
-                       list_del_init(&info->swaplist);
-               else if (shmem_unuse_inode(info, entry, page)) {
-                       /* move head to start search for next from here */
-                       list_move_tail(&shmem_swaplist, &info->swaplist);
-                       found = 1;
-                       break;
-               }
+               found = shmem_unuse_inode(info, entry, page);
+               cond_resched();
+               if (found)
+                       goto out;
         }
-       spin_unlock(&shmem_swaplist_lock);
-       return found;
+       mutex_unlock(&shmem_swaplist_mutex);
+out:   return found;   /* 0 or 1 or -ENOMEM */
  }
  
  /*
@@ -937,58 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
         struct inode *inode;
  
         BUG_ON(!PageLocked(page));
-       /*
-        * shmem_backing_dev_info's capabilities prevent regular writeback or
-        * sync from ever calling shmem_writepage; but a stacking filesystem
-        * may use the ->writepage of its underlying filesystem, in which case
-        * we want to do nothing when that underlying filesystem is tmpfs
-        * (writing out to swap is useful as a response to memory pressure, but
-        * of no use to stabilize the data) - just redirty the page, unlock it
-        * and claim success in this case.  AOP_WRITEPAGE_ACTIVATE, and the
-        * page_mapped check below, must be avoided unless we're in reclaim.
-        */
-       if (!wbc->for_reclaim) {
-               set_page_dirty(page);
-               unlock_page(page);
-               return 0;
-       }
-       BUG_ON(page_mapped(page));
-
         mapping = page->mapping;
         index = page->index;
         inode = mapping->host;
         info = SHMEM_I(inode);
         if (info->flags & VM_LOCKED)
                 goto redirty;
-       swap = get_swap_page();
-       if (!swap.val)
+       if (!total_swap_pages)
                 goto redirty;
  
+       /*
+        * shmem_backing_dev_info's capabilities prevent regular writeback or
+        * sync from ever calling shmem_writepage; but a stacking filesystem
+        * may use the ->writepage of its underlying filesystem, in which case
+        * tmpfs should write out to swap only in response to memory pressure,
+        * and not for pdflush or sync.  However, in those cases, we do still
+        * want to check if there's a redundant swappage to be discarded.
+        */
+       if (wbc->for_reclaim)
+               swap = get_swap_page();
+       else
+               swap.val = 0;
+
         spin_lock(&info->lock);
-       shmem_recalc_inode(inode);
         if (index >= info->next_index) {
                 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
                 goto unlock;
         }
         entry = shmem_swp_entry(info, index, NULL);
-       BUG_ON(!entry);
-       BUG_ON(entry->val);
+       if (entry->val) {
+               /*
+                * The more uptodate page coming down from a stacked
+                * writepage should replace our old swappage.
+                */
+               free_swap_and_cache(*entry);
+               shmem_swp_set(info, entry, 0);
+       }
+       shmem_recalc_inode(inode);
  
-       if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+       if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
                 remove_from_page_cache(page);
                 shmem_swp_set(info, entry, swap.val);
                 shmem_swp_unmap(entry);
+               if (list_empty(&info->swaplist))
+                       inode = igrab(inode);
+               else
+                       inode = NULL;
                 spin_unlock(&info->lock);
-               if (list_empty(&info->swaplist)) {
-                       spin_lock(&shmem_swaplist_lock);
-                       /* move instead of add in case we're racing */
-                       list_move_tail(&info->swaplist, &shmem_swaplist);
-                       spin_unlock(&shmem_swaplist_lock);
-               }
                 swap_duplicate(swap);
+               BUG_ON(page_mapped(page));
                 page_cache_release(page);       /* pagecache ref */
                 set_page_dirty(page);
                 unlock_page(page);
+               if (inode) {
+                       mutex_lock(&shmem_swaplist_mutex);
+                       /* move instead of add in case we're racing */
+                       list_move_tail(&info->swaplist, &shmem_swaplist);
+                       mutex_unlock(&shmem_swaplist_mutex);
+                       iput(inode);
+               }
                 return 0;
         }
  
@@ -998,7 +1055,10 @@ unlock:
         swap_free(swap);
  redirty:
         set_page_dirty(page);
-       return AOP_WRITEPAGE_ACTIVATE;  /* Return with the page locked */
+       if (wbc->for_reclaim)
+               return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
+       unlock_page(page);
+       return 0;
  }
  
  #ifdef CONFIG_NUMA
@@ -1143,6 +1203,16 @@ repeat:
                 goto done;
         error = 0;
         gfp = mapping_gfp_mask(mapping);
+       if (!filepage) {
+               /*
+                * Try to preload while we can wait, to not make a habit of
+                * draining atomic reserves; but don't latch on to this cpu.
+                */
+               error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+               if (error)
+                       goto failed;
+               radix_tree_preload_end();
+       }
  
         spin_lock(&info->lock);
         shmem_recalc_inode(inode);
@@ -1224,7 +1294,7 @@ repeat:
                         set_page_dirty(filepage);
                         swap_free(swap);
                 } else if (!(error = add_to_page_cache(
-                               swappage, mapping, idx, GFP_ATOMIC))) {
+                               swappage, mapping, idx, GFP_NOWAIT))) {
                         info->flags |= SHMEM_PAGEIN;
                         shmem_swp_set(info, entry, 0);
                         shmem_swp_unmap(entry);
@@ -1238,10 +1308,6 @@ repeat:
                         spin_unlock(&info->lock);
                         unlock_page(swappage);
                         page_cache_release(swappage);
-                       if (error == -ENOMEM) {
-                               /* let kswapd refresh zone for GFP_ATOMICs */
-                               congestion_wait(WRITE, HZ/50);
-                       }
                         goto repeat;
                 }
         } else if (sgp == SGP_READ && !filepage) {
@@ -1296,7 +1362,7 @@ repeat:
                                 shmem_swp_unmap(entry);
                         }
                         if (error || swap.val || 0 != add_to_page_cache_lru(
-                                       filepage, mapping, idx, GFP_ATOMIC)) {
+                                       filepage, mapping, idx, GFP_NOWAIT)) {
                                 spin_unlock(&info->lock);
                                 page_cache_release(filepage);
                                 shmem_unacct_blocks(info->flags, 1);
@@ -1314,6 +1380,8 @@ repeat:
                 clear_highpage(filepage);
                 flush_dcache_page(filepage);
                 SetPageUptodate(filepage);
+               if (sgp == SGP_DIRTY)
+                       set_page_dirty(filepage);
         }
  done:
         *pagep = filepage;
@@ -1499,6 +1567,15 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
         struct inode *inode = filp->f_path.dentry->d_inode;
         struct address_space *mapping = inode->i_mapping;
         unsigned long index, offset;
+       enum sgp_type sgp = SGP_READ;
+
+       /*
+        * Might this read be for a stacking filesystem?  Then when reading
+        * holes of a sparse file, we actually need to allocate those pages,
+        * and even mark them dirty, so it cannot exceed the max_blocks limit.
+        */
+       if (segment_eq(get_fs(), KERNEL_DS))
+               sgp = SGP_DIRTY;
  
         index = *ppos >> PAGE_CACHE_SHIFT;
         offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1517,7 +1594,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
                                 break;
                 }
  
-               desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
+               desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
                 if (desc->error) {
                         if (desc->error == -EINVAL)
                                 desc->error = 0;
@@ -1878,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
  {
         if (strcmp(name, "") == 0)
                 return -EINVAL;
-       return security_inode_getsecurity(inode, name, buffer, size,
-                                         -EOPNOTSUPP);
+       return xattr_getsecurity(inode, name, buffer, size);
  }
  
  static int shmem_xattr_security_set(struct inode *inode, const char *name,