Merge branch 'x86/crashdump' into x86/urgent

[linux-2.6] / fs / ext4 / inode.c
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 24518b57733efcd5f7b57cd3ffc02dd64dfb9dc2..8ca2763df091051fea3e02ae7bba35a1e82e21d9 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2003,11 +2003,15 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
         handle_t *handle = NULL;
  
         handle = ext4_journal_current_handle();
-       BUG_ON(handle == NULL);
-       BUG_ON(create == 0);
-
-       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+       if (!handle) {
+               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                  bh_result, 0, 0, 0);
+               BUG_ON(!ret);
+       } else {
+               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
                                    bh_result, create, 0, EXT4_DELALLOC_RSVED);
+       }
+
         if (ret > 0) {
                 bh_result->b_size = (ret << inode->i_blkbits);
  
@@ -2040,15 +2044,37 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
  
  static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
  {
-       return !buffer_mapped(bh) || buffer_delay(bh);
+       /*
+        * unmapped buffer is possible for holes.
+        * delay buffer is possible with delayed allocation
+        */
+       return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       int ret = 0;
+       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+       /*
+        * We don't want to do block allocation in writepage,
+        * so call ext4_get_blocks_wrap() with create = 0.
+        */
+       ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+                                  bh_result, 0, 0, 0);
+       if (ret > 0) {
+               bh_result->b_size = (ret << inode->i_blkbits);
+               ret = 0;
+       }
+       return ret;
  }
  
  /*
- * get called vi ext4_da_writepages after taking page lock
- * We may end up doing block allocation here in case
- * mpage_da_map_blocks failed to allocate blocks.
- *
- * We also get called via journal_submit_inode_data_buffers
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
   */
  static int ext4_da_writepage(struct page *page,
                                 struct writeback_control *wbc)
@@ -2056,37 +2082,61 @@ static int ext4_da_writepage(struct page *page,
         int ret = 0;
         loff_t size;
         unsigned long len;
-       handle_t *handle = NULL;
         struct buffer_head *page_bufs;
         struct inode *inode = page->mapping->host;
  
-       handle = ext4_journal_current_handle();
-       if (!handle) {
-               /*
-                * This can happen when we aren't called via
-                * ext4_da_writepages() but directly (shrink_page_list).
-                * We cannot easily start a transaction here so we just skip
-                * writing the page in case we would have to do so.
-                * We reach here also via journal_submit_inode_data_buffers
-                */
-               size = i_size_read(inode);
+       size = i_size_read(inode);
+       if (page->index == size >> PAGE_CACHE_SHIFT)
+               len = size & ~PAGE_CACHE_MASK;
+       else
+               len = PAGE_CACHE_SIZE;
  
+       if (page_has_buffers(page)) {
                 page_bufs = page_buffers(page);
-               if (page->index == size >> PAGE_CACHE_SHIFT)
-                       len = size & ~PAGE_CACHE_MASK;
-               else
-                       len = PAGE_CACHE_SIZE;
-
-               if (walk_page_buffers(NULL, page_bufs, 0,
-                               len, NULL, ext4_bh_unmapped_or_delay)) {
+               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay)) {
                         /*
-                        * We can't do block allocation under
-                        * page lock without a handle . So redirty
-                        * the page and return
+                        * We don't want to do block allocation,
+                        * so redirty the page and return.
                          * We may reach here when we do a journal commit
                          * via journal_submit_inode_data_buffers.
                          * If we don't have mapping block we just ignore
-                        * them
+                        * them. We can also reach here via shrink_page_list
+                        */
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return 0;
+               }
+       } else {
+               /*
+                * The test for page_has_buffers() is subtle:
+                * We know the page is dirty but it lost buffers. That means
+                * that at some moment in time after write_begin()/write_end()
+                * has been called all buffers have been clean and thus they
+                * must have been written at least once. So they are all
+                * mapped and we can happily proceed with mapping them
+                * and writing the page.
+                *
+                * Try to initialize the buffer_heads and check whether
+                * all are mapped and non delay. We don't want to
+                * do block allocation here.
+                */
+               ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                               ext4_normal_get_block_write);
+               if (!ret) {
+                       page_bufs = page_buffers(page);
+                       /* check whether all are mapped and non delay */
+                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                               ext4_bh_unmapped_or_delay)) {
+                               redirty_page_for_writepage(wbc, page);
+                               unlock_page(page);
+                               return 0;
+                       }
+               } else {
+                       /*
+                        * We can't do block allocation here,
+                        * so just redirty the page, unlock it
+                        * and return.
                          */
                         redirty_page_for_writepage(wbc, page);
                         unlock_page(page);
@@ -2095,9 +2145,11 @@ static int ext4_da_writepage(struct page *page,
         }
  
         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-               ret = nobh_writepage(page, ext4_da_get_block_write, wbc);
+               ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
         else
-               ret = block_write_full_page(page, ext4_da_get_block_write, wbc);
+               ret = block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
  
         return ret;
  }
@@ -2246,6 +2298,29 @@ out:
         return ret;
  }
  
+/*
+ * Check whether we should update i_disksize when a write to the end
+ * of the file does not require block allocation.
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+                                        unsigned long offset)
+{
+       struct buffer_head *bh;
+       struct inode *inode = page->mapping->host;
+       unsigned int idx;
+       int i;
+
+       bh = page_buffers(page);
+       idx = offset >> inode->i_blkbits;
+
+       for (i=0; i < idx; i++)
+               bh = bh->b_this_page;
+
+       if (!buffer_mapped(bh) || (buffer_delay(bh)))
+               return 0;
+       return 1;
+}
+
  static int ext4_da_write_end(struct file *file,
                                 struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned copied,
@@ -2255,6 +2330,10 @@ static int ext4_da_write_end(struct file *file,
         int ret = 0, ret2;
         handle_t *handle = ext4_journal_current_handle();
         loff_t new_i_size;
+       unsigned long start, end;
+
+       start = pos & (PAGE_CACHE_SIZE - 1);
+       end = start + copied -1;
  
         /*
          * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2263,18 +2342,23 @@ static int ext4_da_write_end(struct file *file,
          */
  
         new_i_size = pos + copied;
-       if (new_i_size > EXT4_I(inode)->i_disksize)
-               if (!walk_page_buffers(NULL, page_buffers(page),
-                                      0, len, NULL, ext4_bh_unmapped_or_delay)){
-                       /*
-                        * Updating i_disksize when extending file without
-                        * needing block allocation
-                        */
-                       if (ext4_should_order_data(inode))
-                               ret = ext4_jbd2_file_inode(handle, inode);
+       if (new_i_size > EXT4_I(inode)->i_disksize) {
+               if (ext4_da_should_update_i_disksize(page, end)) {
+                       down_write(&EXT4_I(inode)->i_data_sem);
+                       if (new_i_size > EXT4_I(inode)->i_disksize) {
+                               /*
+                                * Updating i_disksize when extending file
+                                * without needing block allocation
+                                */
+                               if (ext4_should_order_data(inode))
+                                       ret = ext4_jbd2_file_inode(handle,
+                                                                  inode);
  
-                       EXT4_I(inode)->i_disksize = new_i_size;
+                               EXT4_I(inode)->i_disksize = new_i_size;
+                       }
+                       up_write(&EXT4_I(inode)->i_data_sem);
                 }
+       }
         ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                         page, fsdata);
         copied = ret2;
@@ -2438,12 +2522,14 @@ static int __ext4_normal_writepage(struct page *page,
         struct inode *inode = page->mapping->host;
  
         if (test_opt(inode->i_sb, NOBH))
-               return nobh_writepage(page, ext4_get_block, wbc);
+               return nobh_writepage(page,
+                                       ext4_normal_get_block_write, wbc);
         else
-               return block_write_full_page(page, ext4_get_block, wbc);
+               return block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
  }
  
-
  static int ext4_normal_writepage(struct page *page,
                                 struct writeback_control *wbc)
  {
@@ -2452,13 +2538,24 @@ static int ext4_normal_writepage(struct page *page,
         loff_t len;
  
         J_ASSERT(PageLocked(page));
-       J_ASSERT(page_has_buffers(page));
         if (page->index == size >> PAGE_CACHE_SHIFT)
                 len = size & ~PAGE_CACHE_MASK;
         else
                 len = PAGE_CACHE_SIZE;
-       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                ext4_bh_unmapped_or_delay));
+
+       if (page_has_buffers(page)) {
+               /* If the page has buffers they should all be mapped
+                * and allocated. If there are no buffers attached
+                * to the page we know the page is dirty but it lost
+                * buffers. That means that at some moment in time
+                * after write_begin() / write_end() has been called
+                * all buffers have been clean and thus they must have been
+                * written at least once. So they are all mapped and we can
+                * happily proceed with mapping them and writing the page.
+                */
+               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay));
+       }
  
         if (!ext4_journal_current_handle())
                 return __ext4_normal_writepage(page, wbc);
@@ -2478,7 +2575,8 @@ static int __ext4_journalled_writepage(struct page *page,
         int ret = 0;
         int err;
  
-       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                       ext4_normal_get_block_write);
         if (ret != 0)
                 goto out_unlock;
  
@@ -2525,13 +2623,24 @@ static int ext4_journalled_writepage(struct page *page,
         loff_t len;
  
         J_ASSERT(PageLocked(page));
-       J_ASSERT(page_has_buffers(page));
         if (page->index == size >> PAGE_CACHE_SHIFT)
                 len = size & ~PAGE_CACHE_MASK;
         else
                 len = PAGE_CACHE_SIZE;
-       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                ext4_bh_unmapped_or_delay));
+
+       if (page_has_buffers(page)) {
+               /* If the page has buffers they should all be mapped
+                * and allocated. If there are no buffers attached
+                * to the page we know the page is dirty but it lost
+                * buffers. That means that at some moment in time
+                * after write_begin() / write_end() has been called
+                * all buffers have been clean and thus they must have been
+                * written at least once. So they are all mapped and we can
+                * happily proceed with mapping them and writing the page.
+                */
+               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay));
+       }
  
         if (ext4_journal_current_handle())
                 goto no_write;
@@ -2549,7 +2658,9 @@ static int ext4_journalled_writepage(struct page *page,
                  * really know unless we go poke around in the buffer_heads.
                  * But block_write_full_page will do the right thing.
                  */
-               return block_write_full_page(page, ext4_get_block, wbc);
+               return block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
         }
  no_write:
         redirty_page_for_writepage(wbc, page);
@@ -3314,6 +3425,11 @@ void ext4_truncate(struct inode *inode)
         if (ext4_orphan_add(handle, inode))
                 goto out_stop;
  
+       /*
+        * From here we block out all ext4_get_block() callers who want to
+        * modify the block allocation tree.
+        */
+       down_write(&ei->i_data_sem);
         /*
          * The orphan list entry will now protect us from any crash which
          * occurs before the truncate completes, so it is now safe to propagate
@@ -3323,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
          */
         ei->i_disksize = inode->i_size;
  
-       /*
-        * From here we block out all ext4_get_block() callers who want to
-        * modify the block allocation tree.
-        */
-       down_write(&ei->i_data_sem);
-
         if (n == 1) {           /* direct blocks */
                 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                                i_data + EXT4_NDIR_BLOCKS);
@@ -4121,6 +4231,32 @@ err_out:
         return error;
  }
  
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                struct kstat *stat)
+{
+       struct inode *inode;
+       unsigned long delalloc_blocks;
+
+       inode = dentry->d_inode;
+       generic_fillattr(inode, stat);
+
+       /*
+        * We can't update i_blocks if the block allocation is delayed
+        * otherwise in the case of system crash before the real block
+        * allocation is done, we will have i_blocks inconsistent with
+        * on-disk file blocks.
+        * We always keep i_blocks updated together with real
+        * allocation. But to not confuse with user, stat
+        * will return the blocks that include the delayed allocation
+        * blocks for this file.
+        */
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+       stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+       return 0;
+}
  
  /*
   * How many blocks doth make a writepage()?