Git - linux-2.6/blobdiff - fs/ocfs2/alloc.c
ocfs2: Force use of GFP_NOFS in ocfs2_write()
[linux-2.6] / fs / ocfs2 / alloc.c
index 85a05f1202497b55cc65595fc8bdb916a28774d3..19712a7d145feeeded8e6b21bfcd9ec7ac2a2349 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -216,20 +218,32 @@ enum ocfs2_contig_type {
        CONTIG_RIGHT
 };
 
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
 static int ocfs2_block_extent_contig(struct super_block *sb,
                                     struct ocfs2_extent_rec *ext,
                                     u64 blkno)
 {
-       return blkno == (le64_to_cpu(ext->e_blkno) +
-                        ocfs2_clusters_to_blocks(sb,
-                                                 le32_to_cpu(ext->e_clusters)));
+       u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+       blk_end += ocfs2_clusters_to_blocks(sb,
+                                   le16_to_cpu(ext->e_leaf_clusters));
+
+       return blkno == blk_end;
 }
 
 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
                                  struct ocfs2_extent_rec *right)
 {
-       return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
-               le32_to_cpu(right->e_cpos));
+       u32 left_range;
+
+       left_range = le32_to_cpu(left->e_cpos) +
+               le16_to_cpu(left->e_leaf_clusters);
+
+       return (left_range == le32_to_cpu(right->e_cpos));
 }
 
 static enum ocfs2_contig_type
@@ -428,7 +442,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
        i = le16_to_cpu(el->l_next_free_rec) - 1;
 
        return le32_to_cpu(el->l_recs[i].e_cpos) +
-               le32_to_cpu(el->l_recs[i].e_clusters);
+               ocfs2_rec_clusters(el, &el->l_recs[i]);
 }
 
 /*
@@ -440,7 +454,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
  * for the new last extent block.
  *
  * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
  */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
                            handle_t *handle,
@@ -530,7 +544,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
                 */
                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-               eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+               /*
+                * eb_el isn't always an interior node, but even leaf
+                * nodes want a zero'd flags and reserved field so
+                * this gets the whole 32 bits regardless of use.
+                */
+               eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
@@ -575,7 +594,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
        i = le16_to_cpu(el->l_next_free_rec);
        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
-       el->l_recs[i].e_clusters = 0;
+       el->l_recs[i].e_int_clusters = 0;
        le16_add_cpu(&el->l_next_free_rec, 1);
 
        /* fe needs a new last extent block pointer, as does the
@@ -660,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        /* copy the fe data into the new extent block */
        eb_el->l_tree_depth = fe_el->l_tree_depth;
        eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-       for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-               eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
-               eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-               eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-       }
+       for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+               eb_el->l_recs[i] = fe_el->l_recs[i];
 
        status = ocfs2_journal_dirty(handle, new_eb_bh);
        if (status < 0) {
@@ -685,12 +701,9 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
        le16_add_cpu(&fe_el->l_tree_depth, 1);
        fe_el->l_recs[0].e_cpos = 0;
        fe_el->l_recs[0].e_blkno = eb->h_blkno;
-       fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
-       for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-               fe_el->l_recs[i].e_cpos = 0;
-               fe_el->l_recs[i].e_clusters = 0;
-               fe_el->l_recs[i].e_blkno = 0;
-       }
+       fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+       for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+               memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
        fe_el->l_next_free_rec = cpu_to_le16(1);
 
        /* If this is our 1st tree depth shift, then last_eb_blk
@@ -815,9 +828,13 @@ bail:
        return status;
 }
 
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 {
-       return !rec->e_clusters;
+       return !rec->e_leaf_clusters;
 }
 
 /*
@@ -928,6 +945,8 @@ static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
 {
        int next_free = le16_to_cpu(el->l_next_free_rec);
 
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
        if (next_free == 0)
                goto set_and_inc;
 
@@ -1032,7 +1051,7 @@ static int __ocfs2_find_path(struct inode *inode,
                         * rightmost record.
                         */
                        range = le32_to_cpu(rec->e_cpos) +
-                               le32_to_cpu(rec->e_clusters);
+                               ocfs2_rec_clusters(el, rec);
                        if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
                            break;
                }
@@ -1193,21 +1212,21 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
         */
        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
        left_clusters -= le32_to_cpu(left_rec->e_cpos);
-       left_rec->e_clusters = cpu_to_le32(left_clusters);
+       left_rec->e_int_clusters = cpu_to_le32(left_clusters);
 
        /*
         * Calculate the rightmost cluster count boundary before
-        * moving cpos - we will need to adjust e_clusters after
+        * moving cpos - we will need to adjust clusters after
         * updating e_cpos to keep the same highest cluster count.
         */
        right_end = le32_to_cpu(right_rec->e_cpos);
-       right_end += le32_to_cpu(right_rec->e_clusters);
+       right_end += le32_to_cpu(right_rec->e_int_clusters);
 
        right_rec->e_cpos = left_rec->e_cpos;
        le32_add_cpu(&right_rec->e_cpos, left_clusters);
 
        right_end -= le32_to_cpu(right_rec->e_cpos);
-       right_rec->e_clusters = cpu_to_le32(right_end);
+       right_rec->e_int_clusters = cpu_to_le32(right_end);
 }
 
 /*
@@ -1450,6 +1469,8 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
        u64 blkno;
        struct ocfs2_extent_list *el;
 
+       BUG_ON(path->p_tree_depth == 0);
+
        *cpos = 0;
 
        blkno = path_leaf_bh(path)->b_blocknr;
@@ -1484,7 +1505,9 @@ static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
                                }
 
                                *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
-                               *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
+                               *cpos = *cpos + ocfs2_rec_clusters(el,
+                                                          &el->l_recs[j - 1]);
+                               *cpos = *cpos - 1;
                                goto out;
                        }
                }
@@ -1713,7 +1736,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
        unsigned int range;
        struct ocfs2_extent_rec *rec;
 
-       BUG_ON(el->l_tree_depth);
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
        /*
         * Contiguous insert - either left or right.
@@ -1724,8 +1747,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
                        rec->e_blkno = insert_rec->e_blkno;
                        rec->e_cpos = insert_rec->e_cpos;
                }
-               le32_add_cpu(&rec->e_clusters,
-                            le32_to_cpu(insert_rec->e_clusters));
+               le16_add_cpu(&rec->e_leaf_clusters,
+                            le16_to_cpu(insert_rec->e_leaf_clusters));
                return;
        }
 
@@ -1746,7 +1769,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
        if (insert->ins_appending == APPEND_TAIL) {
                i = le16_to_cpu(el->l_next_free_rec) - 1;
                rec = &el->l_recs[i];
-               range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
+               range = le32_to_cpu(rec->e_cpos)
+                       + le16_to_cpu(rec->e_leaf_clusters);
                BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
 
                mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
@@ -1759,9 +1783,9 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
                                le16_to_cpu(el->l_count),
                                le16_to_cpu(el->l_next_free_rec),
                                le32_to_cpu(el->l_recs[i].e_cpos),
-                               le32_to_cpu(el->l_recs[i].e_clusters),
+                               le16_to_cpu(el->l_recs[i].e_leaf_clusters),
                                le32_to_cpu(insert_rec->e_cpos),
-                               le32_to_cpu(insert_rec->e_clusters));
+                               le16_to_cpu(insert_rec->e_leaf_clusters));
                i++;
                el->l_recs[i] = *insert_rec;
                le16_add_cpu(&el->l_next_free_rec, 1);
@@ -1803,6 +1827,12 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 
        *ret_left_path = NULL;
 
+       /*
+        * This shouldn't happen for non-trees. The extent rec cluster
+        * count manipulation below only works for interior nodes.
+        */
+       BUG_ON(right_path->p_tree_depth == 0);
+
        /*
         * If our appending insert is at the leftmost edge of a leaf,
         * then we might need to update the rightmost records of the
@@ -1861,6 +1891,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
        bh = path_root_bh(right_path);
        i = 0;
        while (1) {
+               struct ocfs2_extent_rec *rec;
+
                next_free = le16_to_cpu(el->l_next_free_rec);
                if (next_free == 0) {
                        ocfs2_error(inode->i_sb,
@@ -1870,16 +1902,19 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
                        goto out;
                }
 
-               el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
-               le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-                            le32_to_cpu(insert_rec->e_clusters));
-               le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-                           -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
+               rec = &el->l_recs[next_free - 1];
+
+               rec->e_int_clusters = insert_rec->e_cpos;
+               le32_add_cpu(&rec->e_int_clusters,
+                            le16_to_cpu(insert_rec->e_leaf_clusters));
+               le32_add_cpu(&rec->e_int_clusters,
+                            -le32_to_cpu(rec->e_cpos));
 
                ret = ocfs2_journal_dirty(handle, bh);
                if (ret)
                        mlog_errno(ret);
 
+               /* Don't touch the leaf node */
                if (++i >= right_path->p_tree_depth)
                        break;
 
@@ -2066,7 +2101,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
        ocfs2_update_dinode_clusters(inode, di,
-                                    le32_to_cpu(insert_rec->e_clusters));
+                                    le16_to_cpu(insert_rec->e_leaf_clusters));
 
        ret = ocfs2_journal_dirty(handle, di_bh);
        if (ret)
@@ -2087,6 +2122,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
        int i;
        enum ocfs2_contig_type contig_type = CONTIG_NONE;
 
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
        for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
                                                  insert_rec);
@@ -2118,7 +2155,7 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
 
        insert->ins_appending = APPEND_NONE;
 
-       BUG_ON(el->l_tree_depth);
+       BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
 
        if (!el->l_next_free_rec)
                goto set_tail_append;
@@ -2132,7 +2169,8 @@ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
        i = le16_to_cpu(el->l_next_free_rec) - 1;
        rec = &el->l_recs[i];
 
-       if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
+       if (cpos >=
+           (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
                goto set_tail_append;
 
        return;
@@ -2240,7 +2278,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
         * The insert code isn't quite ready to deal with all cases of
         * left contiguousness. Specifically, if it's an insert into
         * the 1st record in a leaf, it will require the adjustment of
-        * e_clusters on the last record of the path directly to it's
+        * cluster count on the last record of the path directly to its
         * left. For now, just catch that case and fool the layers
         * above us. This works just fine for tree_depth == 0, which
         * is why we allow that above.
@@ -2308,9 +2346,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
                        (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
                        OCFS2_I(inode)->ip_clusters);
 
+       memset(&rec, 0, sizeof(rec));
        rec.e_cpos = cpu_to_le32(cpos);
        rec.e_blkno = cpu_to_le64(start_blk);
-       rec.e_clusters = cpu_to_le32(new_clusters);
+       rec.e_leaf_clusters = cpu_to_le16(new_clusters);
 
        status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
                                          &insert);
@@ -2378,6 +2417,8 @@ out_add:
        status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
        if (status < 0)
                mlog_errno(status);
+       else
+               ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
        if (bh)
@@ -2828,7 +2869,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
        tl = &tl_copy->id2.i_dealloc;
        num_recs = le16_to_cpu(tl->tl_used);
        mlog(0, "cleanup %u records from %llu\n", num_recs,
-            (unsigned long long)tl_copy->i_blkno);
+            (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
 
        mutex_lock(&tl_inode->i_mutex);
        for(i = 0; i < num_recs; i++) {
@@ -2921,12 +2962,13 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  * block will be so we can update his h_next_leaf_blk field, as well
  * as the dinodes i_last_eb_blk */
 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
-                                      u32 new_i_clusters,
+                                      unsigned int clusters_to_del,
                                       struct ocfs2_path *path,
                                       struct buffer_head **new_last_eb)
 {
-       int ret = 0;
+       int next_free, ret = 0;
        u32 cpos;
+       struct ocfs2_extent_rec *rec;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list *el;
        struct buffer_head *bh = NULL;
@@ -2939,20 +2981,48 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 
        /* trunc to zero special case - this makes tree_depth = 0
         * regardless of what it is.  */
-       if (!new_i_clusters)
+       if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
                goto out;
 
        el = path_leaf_el(path);
        BUG_ON(!el->l_next_free_rec);
 
-       /* Make sure that this guy will actually be empty after we
-        * clear away the data. */
+       /*
+        * Make sure that this extent list will actually be empty
+        * after we clear away the data. We can shortcut out if
+        * there's more than one non-empty extent in the
+        * list. Otherwise, a check of the remaining extent is
+        * necessary.
+        */
+       next_free = le16_to_cpu(el->l_next_free_rec);
+       rec = NULL;
        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
-               if (le16_to_cpu(el->l_next_free_rec) > 1 &&
-                   le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
+               if (next_free > 2)
                        goto out;
-       } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
-               goto out;
+
+               /* We may have a valid extent in index 1, check it. */
+               if (next_free == 2)
+                       rec = &el->l_recs[1];
+
+               /*
+                * Fall through - no more nonempty extents, so we want
+                * to delete this leaf.
+                */
+       } else {
+               if (next_free > 1)
+                       goto out;
+
+               rec = &el->l_recs[0];
+       }
+
+       if (rec) {
+               /*
+                * Check if we'll only be trimming off the end of this
+                * cluster.
+                */
+               if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+                       goto out;
+       }
 
        ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
        if (ret) {
@@ -2984,6 +3054,223 @@ out:
        return ret;
 }
 
+/*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ *   - start journaling of each path component.
+ *   - compute and fully set up any new last ext block
+ */
+static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+                          handle_t *handle, struct ocfs2_truncate_context *tc,
+                          u32 clusters_to_del, u64 *delete_start)
+{
+       int ret, i, index = path->p_tree_depth;
+       u32 new_edge = 0;
+       u64 deleted_eb = 0;
+       struct buffer_head *bh;
+       struct ocfs2_extent_list *el;
+       struct ocfs2_extent_rec *rec;
+
+       *delete_start = 0;
+
+       while (index >= 0) {
+               bh = path->p_node[index].bh;
+               el = path->p_node[index].el;
+
+               mlog(0, "traveling tree (index = %d, block = %llu)\n",
+                    index,  (unsigned long long)bh->b_blocknr);
+
+               BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+               if (index !=
+                   (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+                       ocfs2_error(inode->i_sb,
+                                   "Inode %lu has invalid ext. block %llu",
+                                   inode->i_ino,
+                                   (unsigned long long)bh->b_blocknr);
+                       ret = -EROFS;
+                       goto out;
+               }
+
+find_tail_record:
+               i = le16_to_cpu(el->l_next_free_rec) - 1;
+               rec = &el->l_recs[i];
+
+               mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+                    "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+                    ocfs2_rec_clusters(el, rec),
+                    (unsigned long long)le64_to_cpu(rec->e_blkno),
+                    le16_to_cpu(el->l_next_free_rec));
+
+               BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+               if (le16_to_cpu(el->l_tree_depth) == 0) {
+                       /*
+                        * If the leaf block contains a single empty
+                        * extent and no records, we can just remove
+                        * the block.
+                        */
+                       if (i == 0 && ocfs2_is_empty_extent(rec)) {
+                               memset(rec, 0,
+                                      sizeof(struct ocfs2_extent_rec));
+                               el->l_next_free_rec = cpu_to_le16(0);
+
+                               goto delete;
+                       }
+
+                       /*
+                        * Remove any empty extents by shifting things
+                        * left. That should make life much easier on
+                        * the code below. This condition is rare
+                        * enough that we shouldn't see a performance
+                        * hit.
+                        */
+                       if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+                               le16_add_cpu(&el->l_next_free_rec, -1);
+
+                               for(i = 0;
+                                   i < le16_to_cpu(el->l_next_free_rec); i++)
+                                       el->l_recs[i] = el->l_recs[i + 1];
+
+                               memset(&el->l_recs[i], 0,
+                                      sizeof(struct ocfs2_extent_rec));
+
+                               /*
+                                * We've modified our extent list. The
+                                * simplest way to handle this change
+                                * is to begin the search from the
+                                * start again.
+                                */
+                               goto find_tail_record;
+                       }
+
+                       le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+                       /*
+                        * We'll use "new_edge" on our way back up the
+                        * tree to know what our rightmost cpos is.
+                        */
+                       new_edge = le16_to_cpu(rec->e_leaf_clusters);
+                       new_edge += le32_to_cpu(rec->e_cpos);
+
+                       /*
+                        * The caller will use this to delete data blocks.
+                        */
+                       *delete_start = le64_to_cpu(rec->e_blkno)
+                               + ocfs2_clusters_to_blocks(inode->i_sb,
+                                       le16_to_cpu(rec->e_leaf_clusters));
+
+                       /*
+                        * If it's now empty, remove this record.
+                        */
+                       if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+                               memset(rec, 0,
+                                      sizeof(struct ocfs2_extent_rec));
+                               le16_add_cpu(&el->l_next_free_rec, -1);
+                       }
+               } else {
+                       if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+                               memset(rec, 0,
+                                      sizeof(struct ocfs2_extent_rec));
+                               le16_add_cpu(&el->l_next_free_rec, -1);
+
+                               goto delete;
+                       }
+
+                       /* Can this actually happen? */
+                       if (le16_to_cpu(el->l_next_free_rec) == 0)
+                               goto delete;
+
+                       /*
+                        * We never actually deleted any clusters
+                        * because our leaf was empty. There's no
+                        * reason to adjust the rightmost edge then.
+                        */
+                       if (new_edge == 0)
+                               goto delete;
+
+                       rec->e_int_clusters = cpu_to_le32(new_edge);
+                       le32_add_cpu(&rec->e_int_clusters,
+                                    -le32_to_cpu(rec->e_cpos));
+
+                        /*
+                         * A deleted child record should have been
+                         * caught above.
+                         */
+                        BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
+               }
+
+delete:
+               ret = ocfs2_journal_dirty(handle, bh);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               mlog(0, "extent list container %llu, after: record %d: "
+                    "(%u, %u, %llu), next = %u.\n",
+                    (unsigned long long)bh->b_blocknr, i,
+                    le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+                    (unsigned long long)le64_to_cpu(rec->e_blkno),
+                    le16_to_cpu(el->l_next_free_rec));
+
+               /*
+                * We must be careful to only attempt delete of an
+                * extent block (and not the root inode block).
+                */
+               if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+                       struct ocfs2_extent_block *eb =
+                               (struct ocfs2_extent_block *)bh->b_data;
+
+                       /*
+                        * Save this for use when processing the
+                        * parent block.
+                        */
+                       deleted_eb = le64_to_cpu(eb->h_blkno);
+
+                       mlog(0, "deleting this extent block.\n");
+
+                       ocfs2_remove_from_cache(inode, bh);
+
+                       BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+                       BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+                       BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+                       if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+                               /*
+                                * This code only understands how to
+                                * lock the suballocator in slot 0,
+                                * which is fine because allocation is
+                                * only ever done out of that
+                                * suballocator too. A future version
+                                * might change that however, so avoid
+                                * a free if we don't know how to
+                                * handle it. This way an fs incompat
+                                * bit will not be necessary.
+                                */
+                               ret = ocfs2_free_extent_block(handle,
+                                                             tc->tc_ext_alloc_inode,
+                                                             tc->tc_ext_alloc_bh,
+                                                             eb);
+
+                               /* An error here is not fatal. */
+                               if (ret < 0)
+                                       mlog_errno(ret);
+                       }
+               } else {
+                       deleted_eb = 0;
+               }
+
+               index--;
+       }
+
+       ret = 0;
+out:
+       return ret;
+}
+
 static int ocfs2_do_truncate(struct ocfs2_super *osb,
                             unsigned int clusters_to_del,
                             struct inode *inode,
@@ -2992,20 +3279,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                             struct ocfs2_truncate_context *tc,
                             struct ocfs2_path *path)
 {
-       int status, i, index;
+       int status;
        struct ocfs2_dinode *fe;
-       struct ocfs2_extent_block *eb;
        struct ocfs2_extent_block *last_eb = NULL;
        struct ocfs2_extent_list *el;
-       struct buffer_head *eb_bh = NULL;
        struct buffer_head *last_eb_bh = NULL;
        u64 delete_blk = 0;
 
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-       status = ocfs2_find_new_last_ext_blk(inode,
-                                            le32_to_cpu(fe->i_clusters) -
-                                            clusters_to_del,
+       status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
                                             path, &last_eb_bh);
        if (status < 0) {
                mlog_errno(status);
@@ -3016,14 +3299,10 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
         * Each component will be touched, so we might as well journal
         * here to avoid having to handle errors later.
         */
-       for (i = 0; i < path_num_items(path); i++) {
-               status = ocfs2_journal_access(handle, inode,
-                                             path->p_node[i].bh,
-                                             OCFS2_JOURNAL_ACCESS_WRITE);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
-               }
+       status = ocfs2_journal_access_path(inode, handle, path);
+       if (status < 0) {
+               mlog_errno(status);
+               goto bail;
        }
 
        if (last_eb_bh) {
@@ -3043,10 +3322,11 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
         * Lower levels depend on this never happening, but it's best
         * to check it up here before changing the tree.
         */
-       if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
+       if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
                ocfs2_error(inode->i_sb,
                            "Inode %lu has an empty extent record, depth %u\n",
                            inode->i_ino, le16_to_cpu(el->l_tree_depth));
+               status = -EROFS;
                goto bail;
        }
 
@@ -3056,38 +3336,11 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
        spin_unlock(&OCFS2_I(inode)->ip_lock);
        le32_add_cpu(&fe->i_clusters, -clusters_to_del);
 
-       i = le16_to_cpu(el->l_next_free_rec) - 1;
-
-       BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
-       le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-       /* tree depth zero, we can just delete the clusters, otherwise
-        * we need to record the offset of the next level extent block
-        * as we may overwrite it. */
-       if (!el->l_tree_depth) {
-               delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-                       + ocfs2_clusters_to_blocks(osb->sb,
-                                       le32_to_cpu(el->l_recs[i].e_clusters));
-
-               if (!el->l_recs[i].e_clusters) {
-                       /* if we deleted the whole extent record, then clear
-                        * out the other fields and update the extent
-                        * list.
-                        */
-                       el->l_recs[i].e_cpos = 0;
-                       el->l_recs[i].e_blkno = 0;
-                       BUG_ON(!el->l_next_free_rec);
-                       le16_add_cpu(&el->l_next_free_rec, -1);
-
-                       /*
-                        * The leftmost record might be an empty extent -
-                        * delete it here too.
-                        */
-                       if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
-                               el->l_recs[0].e_cpos = 0;
-                               el->l_recs[0].e_blkno = 0;
-                               el->l_next_free_rec = 0;
-                       }
-               }
+       status = ocfs2_trim_tree(inode, path, handle, tc,
+                                clusters_to_del, &delete_blk);
+       if (status) {
+               mlog_errno(status);
+               goto bail;
        }
 
        if (le32_to_cpu(fe->i_clusters) == 0) {
@@ -3115,131 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                }
        }
 
-       index = 1;
-       /* if our tree depth > 0, update all the tree blocks below us. */
-       while (index <= path->p_tree_depth) {
-               eb_bh = path->p_node[index].bh;
-               eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-               el = path->p_node[index].el;
-
-               mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
-                    index,  (unsigned long long)eb_bh->b_blocknr);
-
-               BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-               if (index !=
-                   (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
-                       ocfs2_error(inode->i_sb,
-                                   "Inode %lu has invalid ext. block %llu\n",
-                                   inode->i_ino,
-                                   (unsigned long long)eb_bh->b_blocknr);
+       if (delete_blk) {
+               status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+                                                  clusters_to_del);
+               if (status < 0) {
+                       mlog_errno(status);
                        goto bail;
                }
+       }
+       status = 0;
+bail:
 
-               i = le16_to_cpu(el->l_next_free_rec) - 1;
+       mlog_exit(status);
+       return status;
+}
 
-               mlog(0, "extent block %llu, before: record %d: "
-                    "(%u, %u, %llu), next = %u\n",
-                    (unsigned long long)le64_to_cpu(eb->h_blkno), i,
-                    le32_to_cpu(el->l_recs[i].e_cpos),
-                    le32_to_cpu(el->l_recs[i].e_clusters),
-                    (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-                    le16_to_cpu(el->l_next_free_rec));
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+       return 0;
+}
 
-               BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
-               le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
-               /* bottom-most block requires us to delete data.*/
-               if (!el->l_tree_depth)
-                       delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-                               + ocfs2_clusters_to_blocks(osb->sb,
-                                       le32_to_cpu(el->l_recs[i].e_clusters));
-               if (!el->l_recs[i].e_clusters) {
-                       el->l_recs[i].e_cpos = 0;
-                       el->l_recs[i].e_blkno = 0;
-                       BUG_ON(!el->l_next_free_rec);
-                       le16_add_cpu(&el->l_next_free_rec, -1);
-               }
-               if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
-                       el->l_recs[0].e_cpos = 0;
-                       el->l_recs[0].e_blkno = 0;
-                       el->l_next_free_rec = 0;
-               }
-
-               mlog(0, "extent block %llu, after: record %d: "
-                    "(%u, %u, %llu), next = %u\n",
-                    (unsigned long long)le64_to_cpu(eb->h_blkno), i,
-                    le32_to_cpu(el->l_recs[i].e_cpos),
-                    le32_to_cpu(el->l_recs[i].e_clusters),
-                    (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-                    le16_to_cpu(el->l_next_free_rec));
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+       set_buffer_uptodate(bh);
+       mark_buffer_dirty(bh);
+       return ocfs2_journal_dirty_data(handle, bh);
+}
 
-               status = ocfs2_journal_dirty(handle, eb_bh);
-               if (status < 0) {
-                       mlog_errno(status);
-                       goto bail;
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+                                    struct page **pages, int numpages,
+                                    u64 phys, handle_t *handle)
+{
+       int i, ret, partial = 0;
+       void *kaddr;
+       struct page *page;
+       unsigned int from, to = PAGE_CACHE_SIZE;
+       struct super_block *sb = inode->i_sb;
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       if (numpages == 0)
+               goto out;
+
+       from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+       if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+               /*
+                * Since 'from' has been capped to a value below page
+                * size, this calculation won't be able to overflow
+                * 'to'
+                */
+               to = ocfs2_align_bytes_to_clusters(sb, from);
+
+               /*
+                * The truncate tail in this case should never contain
+                * more than one page at maximum. The loop below also
+                * assumes this.
+                */
+               BUG_ON(numpages != 1);
+       }
+
+       for(i = 0; i < numpages; i++) {
+               page = pages[i];
+
+               BUG_ON(from > PAGE_CACHE_SIZE);
+               BUG_ON(to > PAGE_CACHE_SIZE);
+
+               ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+               if (ret)
+                       mlog_errno(ret);
+
+               kaddr = kmap_atomic(page, KM_USER0);
+               memset(kaddr + from, 0, to - from);
+               kunmap_atomic(kaddr, KM_USER0);
+
+               /*
+                * Need to set the buffers we zero'd into uptodate
+                * here if they aren't - ocfs2_map_page_blocks()
+                * might've skipped some
+                */
+               if (ocfs2_should_order_data(inode)) {
+                       ret = walk_page_buffers(handle,
+                                               page_buffers(page),
+                                               from, to, &partial,
+                                               ocfs2_ordered_zero_func);
+                       if (ret < 0)
+                               mlog_errno(ret);
+               } else {
+                       ret = walk_page_buffers(handle, page_buffers(page),
+                                               from, to, &partial,
+                                               ocfs2_writeback_zero_func);
+                       if (ret < 0)
+                               mlog_errno(ret);
                }
 
-               if (!el->l_next_free_rec) {
-                       mlog(0, "deleting this extent block.\n");
+               if (!partial)
+                       SetPageUptodate(page);
 
-                       ocfs2_remove_from_cache(inode, eb_bh);
+               flush_dcache_page(page);
 
-                       BUG_ON(el->l_recs[0].e_clusters);
-                       BUG_ON(el->l_recs[0].e_cpos);
-                       BUG_ON(el->l_recs[0].e_blkno);
+               /*
+                * Every page after the 1st one should be completely zero'd.
+                */
+               from = 0;
+       }
+out:
+       if (pages) {
+               for (i = 0; i < numpages; i++) {
+                       page = pages[i];
+                       unlock_page(page);
+                       mark_page_accessed(page);
+                       page_cache_release(page);
+               }
+       }
+}
 
-                       /*
-                        * We need to remove this extent block from
-                        * the list above it.
-                        *
-                        * Since we've passed it already in this loop,
-                        * no need to worry about journaling.
-                        */
-                       el = path->p_node[index - 1].el;
-                       i = le16_to_cpu(el->l_next_free_rec) - 1;
-                       BUG_ON(i < 0);
-                       el->l_recs[i].e_cpos = 0;
-                       el->l_recs[i].e_clusters = 0;
-                       el->l_recs[i].e_blkno = 0;
-                       le16_add_cpu(&el->l_next_free_rec, -1);
-
-                       if (eb->h_suballoc_slot == 0) {
-                               /*
-                                * This code only understands how to
-                                * lock the suballocator in slot 0,
-                                * which is fine because allocation is
-                                * only ever done out of that
-                                * suballocator too. A future version
-                                * might change that however, so avoid
-                                * a free if we don't know how to
-                                * handle it. This way an fs incompat
-                                * bit will not be necessary.
-                                */
-                               status = ocfs2_free_extent_block(handle,
-                                                                tc->tc_ext_alloc_inode,
-                                                                tc->tc_ext_alloc_bh,
-                                                                eb);
-                               if (status < 0) {
-                                       mlog_errno(status);
-                                       goto bail;
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+                               int *num, u64 *phys)
+{
+       int i, numpages = 0, ret = 0;
+       unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+       unsigned int ext_flags;
+       struct super_block *sb = inode->i_sb;
+       struct address_space *mapping = inode->i_mapping;
+       unsigned long index;
+       u64 next_cluster_bytes;
+
+       BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+       /* Cluster boundary, so we don't need to grab any pages. */
+       if ((isize & (csize - 1)) == 0)
+               goto out;
+
+       ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+                                         phys, NULL, &ext_flags);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       /* Tail is a hole. */
+       if (*phys == 0)
+               goto out;
+
+       /* Tail is marked as unwritten, we can count on write to zero
+        * in that case. */
+       if (ext_flags & OCFS2_EXT_UNWRITTEN)
+               goto out;
+
+       next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+       index = isize >> PAGE_CACHE_SHIFT;
+       do {
+               pages[numpages] = grab_cache_page(mapping, index);
+               if (!pages[numpages]) {
+                       ret = -ENOMEM;
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               numpages++;
+               index++;
+       } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+
+out:
+       if (ret != 0) {
+               if (pages) {
+                       for (i = 0; i < numpages; i++) {
+                               if (pages[i]) {
+                                       unlock_page(pages[i]);
+                                       page_cache_release(pages[i]);
                                }
                        }
                }
-               index++;
+               numpages = 0;
        }
 
-       BUG_ON(!delete_blk);
-       status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-                                          clusters_to_del);
-       if (status < 0) {
-               mlog_errno(status);
-               goto bail;
+       *num = numpages;
+
+       return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+                                u64 new_i_size)
+{
+       int ret, numpages;
+       loff_t endbyte;
+       struct page **pages = NULL;
+       u64 phys;
+
+       /*
+        * File systems which don't support sparse files zero on every
+        * extend.
+        */
+       if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+               return 0;
+
+       pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+                       sizeof(struct page *), GFP_NOFS);
+       if (pages == NULL) {
+               ret = -ENOMEM;
+               mlog_errno(ret);
+               goto out;
        }
-       status = 0;
-bail:
 
-       mlog_exit(status);
-       return status;
+       ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       if (numpages == 0)
+               goto out;
+
+       ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+                                handle);
+
+       /*
+        * Initiate writeout of the pages we zero'd here. We don't
+        * wait on them - the truncate_inode_pages() call later will
+        * do that for us.
+        */
+       endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+       ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+                                   endbyte - 1, SYNC_FILE_RANGE_WRITE);
+       if (ret)
+               mlog_errno(ret);
+
+out:
+       if (pages)
+               kfree(pages);
+
+       return ret;
 }
 
 /*
@@ -3273,7 +3642,18 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
                mlog_errno(status);
                goto bail;
        }
+
+       ocfs2_extent_map_trunc(inode, new_highest_cpos);
+
 start:
+       /*
+        * Check that we still have allocation to delete.
+        */
+       if (OCFS2_I(inode)->ip_clusters == 0) {
+               status = 0;
+               goto bail;
+       }
+
        /*
         * Truncate always works against the rightmost tree branch.
         */
@@ -3298,15 +3678,24 @@ start:
         * - no record needs to be removed (truncate has completed)
         */
        el = path_leaf_el(path);
+       if (le16_to_cpu(el->l_next_free_rec) == 0) {
+               ocfs2_error(inode->i_sb,
+                           "Inode %llu has empty extent block at %llu\n",
+                           (unsigned long long)OCFS2_I(inode)->ip_blkno,
+                           (unsigned long long)path_leaf_bh(path)->b_blocknr);
+               status = -EROFS;
+               goto bail;
+       }
+
        i = le16_to_cpu(el->l_next_free_rec) - 1;
        range = le32_to_cpu(el->l_recs[i].e_cpos) +
-               le32_to_cpu(el->l_recs[i].e_clusters);
+               ocfs2_rec_clusters(el, &el->l_recs[i]);
        if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
                clusters_to_del = 0;
        } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
-               clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+               clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
        } else if (range > new_highest_cpos) {
-               clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+               clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
                                   le32_to_cpu(el->l_recs[i].e_cpos)) -
                                  new_highest_cpos;
        } else {
@@ -3359,10 +3748,11 @@ start:
        ocfs2_reinit_path(path, 1);
 
        /*
-        * Only loop if we still have allocation.
+        * The check above will catch the case where we've truncated
+        * away all allocation.
         */
-       if (OCFS2_I(inode)->ip_clusters)
-               goto start;
+       goto start;
+
 bail:
        up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
@@ -3411,24 +3801,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
        fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
        mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
-            "%llu\n", fe->i_clusters, new_i_clusters,
-            (unsigned long long)fe->i_size);
-
-       if (!ocfs2_sparse_alloc(osb) &&
-           le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
-               ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
-                           "%u and size %llu whereas struct inode has "
-                           "cluster count %u and size %llu which caused an "
-                           "invalid truncate to %u clusters.",
-                           (unsigned long long)le64_to_cpu(fe->i_blkno),
-                           le32_to_cpu(fe->i_clusters),
-                           (unsigned long long)le64_to_cpu(fe->i_size),
-                           OCFS2_I(inode)->ip_clusters, i_size_read(inode),
-                           new_i_clusters);
-               mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
-               status = -EIO;
-               goto bail;
-       }
+            "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+            (unsigned long long)le64_to_cpu(fe->i_size));
 
        *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
        if (!(*tc)) {