#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
CONTIG_RIGHT
};
+
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig() only work properly against leaf nodes!
+ *
+ * ocfs2_block_extent_contig() returns nonzero when blkno is the block
+ * immediately following the end of ext. The end is computed from
+ * e_blkno plus the record's 16-bit e_leaf_clusters count converted to
+ * blocks.
+ */
static int ocfs2_block_extent_contig(struct super_block *sb,
struct ocfs2_extent_rec *ext,
u64 blkno)
{
- return blkno == (le64_to_cpu(ext->e_blkno) +
- ocfs2_clusters_to_blocks(sb,
- le32_to_cpu(ext->e_clusters)));
+ u64 blk_end = le64_to_cpu(ext->e_blkno); /* starts as the extent's first block */
+
+ blk_end += ocfs2_clusters_to_blocks(sb,
+ le16_to_cpu(ext->e_leaf_clusters)); /* now one past the extent's last block */
+
+ return blkno == blk_end;
}
static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
struct ocfs2_extent_rec *right)
{
- return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
- le32_to_cpu(right->e_cpos));
+ u32 left_range; /* first cluster offset past the end of 'left' */
+
+ left_range = le32_to_cpu(left->e_cpos) +
+ le16_to_cpu(left->e_leaf_clusters); /* leaf-format (16-bit) cluster count */
+
+ return (left_range == le32_to_cpu(right->e_cpos)); /* nonzero iff 'right' starts where 'left' ends */
}
static enum ocfs2_contig_type
i = le16_to_cpu(el->l_next_free_rec) - 1;
return le32_to_cpu(el->l_recs[i].e_cpos) +
- le32_to_cpu(el->l_recs[i].e_clusters);
+ ocfs2_rec_clusters(el, &el->l_recs[i]);
}
/*
* for the new last extent block.
*
* the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
*/
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
*/
eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
- eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+ /*
+ * eb_el isn't always an interior node, but even leaf
+ * nodes want a zero'd flags and reserved field so
+ * this gets the whole 32 bits regardless of use.
+ */
+ eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
if (!eb_el->l_tree_depth)
new_last_eb_blk = le64_to_cpu(eb->h_blkno);
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
- el->l_recs[i].e_clusters = 0;
+ el->l_recs[i].e_int_clusters = 0;
le16_add_cpu(&el->l_next_free_rec, 1);
/* fe needs a new last extent block pointer, as does the
/* copy the fe data into the new extent block */
eb_el->l_tree_depth = fe_el->l_tree_depth;
eb_el->l_next_free_rec = fe_el->l_next_free_rec;
- for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
- eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
- eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
- }
+ for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+ eb_el->l_recs[i] = fe_el->l_recs[i];
status = ocfs2_journal_dirty(handle, new_eb_bh);
if (status < 0) {
le16_add_cpu(&fe_el->l_tree_depth, 1);
fe_el->l_recs[0].e_cpos = 0;
fe_el->l_recs[0].e_blkno = eb->h_blkno;
- fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
- for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
- fe_el->l_recs[i].e_cpos = 0;
- fe_el->l_recs[i].e_clusters = 0;
- fe_el->l_recs[i].e_blkno = 0;
- }
+ fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+ for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+ memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
fe_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
return status;
}
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ *
+ * Returns nonzero when the record's e_leaf_clusters field is zero.
+ */
static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
{
- return !rec->e_clusters;
+ return !rec->e_leaf_clusters;
}
/*
{
int next_free = le16_to_cpu(el->l_next_free_rec);
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
if (next_free == 0)
goto set_and_inc;
* rightmost record.
*/
range = le32_to_cpu(rec->e_cpos) +
- le32_to_cpu(rec->e_clusters);
+ ocfs2_rec_clusters(el, rec);
if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
break;
}
*
* This function doesn't handle non btree extent lists.
*/
-static int ocfs2_find_leaf(struct inode *inode,
- struct ocfs2_extent_list *root_el, u32 cpos,
- struct buffer_head **leaf_bh)
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+ u32 cpos, struct buffer_head **leaf_bh)
{
int ret;
struct buffer_head *bh = NULL;
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
left_clusters -= le32_to_cpu(left_rec->e_cpos);
- left_rec->e_clusters = cpu_to_le32(left_clusters);
+ left_rec->e_int_clusters = cpu_to_le32(left_clusters);
/*
* Calculate the rightmost cluster count boundary before
- * moving cpos - we will need to adjust e_clusters after
+ * moving cpos - we will need to adjust clusters after
* updating e_cpos to keep the same highest cluster count.
*/
right_end = le32_to_cpu(right_rec->e_cpos);
- right_end += le32_to_cpu(right_rec->e_clusters);
+ right_end += le32_to_cpu(right_rec->e_int_clusters);
right_rec->e_cpos = left_rec->e_cpos;
le32_add_cpu(&right_rec->e_cpos, left_clusters);
right_end -= le32_to_cpu(right_rec->e_cpos);
- right_rec->e_clusters = cpu_to_le32(right_end);
+ right_rec->e_int_clusters = cpu_to_le32(right_end);
}
/*
u64 blkno;
struct ocfs2_extent_list *el;
+ BUG_ON(path->p_tree_depth == 0);
+
*cpos = 0;
blkno = path_leaf_bh(path)->b_blocknr;
}
*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
- *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
+ *cpos = *cpos + ocfs2_rec_clusters(el,
+ &el->l_recs[j - 1]);
+ *cpos = *cpos - 1;
goto out;
}
}
unsigned int range;
struct ocfs2_extent_rec *rec;
- BUG_ON(el->l_tree_depth);
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
/*
* Contiguous insert - either left or right.
rec->e_blkno = insert_rec->e_blkno;
rec->e_cpos = insert_rec->e_cpos;
}
- le32_add_cpu(&rec->e_clusters,
- le32_to_cpu(insert_rec->e_clusters));
+ le16_add_cpu(&rec->e_leaf_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
return;
}
if (insert->ins_appending == APPEND_TAIL) {
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
- range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
+ range = le32_to_cpu(rec->e_cpos)
+ + le16_to_cpu(rec->e_leaf_clusters);
BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
le16_to_cpu(el->l_count),
le16_to_cpu(el->l_next_free_rec),
le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
+ le16_to_cpu(el->l_recs[i].e_leaf_clusters),
le32_to_cpu(insert_rec->e_cpos),
- le32_to_cpu(insert_rec->e_clusters));
+ le16_to_cpu(insert_rec->e_leaf_clusters));
i++;
el->l_recs[i] = *insert_rec;
le16_add_cpu(&el->l_next_free_rec, 1);
*ret_left_path = NULL;
+ /*
+ * This shouldn't happen for non-trees. The extent rec cluster
+ * count manipulation below only works for interior nodes.
+ */
+ BUG_ON(right_path->p_tree_depth == 0);
+
/*
* If our appending insert is at the leftmost edge of a leaf,
* then we might need to update the rightmost records of the
bh = path_root_bh(right_path);
i = 0;
while (1) {
+ struct ocfs2_extent_rec *rec;
+
next_free = le16_to_cpu(el->l_next_free_rec);
if (next_free == 0) {
ocfs2_error(inode->i_sb,
goto out;
}
- el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- le32_to_cpu(insert_rec->e_clusters));
- le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
- -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
+ rec = &el->l_recs[next_free - 1];
+
+ rec->e_int_clusters = insert_rec->e_cpos;
+ le32_add_cpu(&rec->e_int_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
ret = ocfs2_journal_dirty(handle, bh);
if (ret)
mlog_errno(ret);
+ /* Don't touch the leaf node */
if (++i >= right_path->p_tree_depth)
break;
out_update_clusters:
ocfs2_update_dinode_clusters(inode, di,
- le32_to_cpu(insert_rec->e_clusters));
+ le16_to_cpu(insert_rec->e_leaf_clusters));
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret)
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
insert_rec);
insert->ins_appending = APPEND_NONE;
- BUG_ON(el->l_tree_depth);
+ BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
if (!el->l_next_free_rec)
goto set_tail_append;
i = le16_to_cpu(el->l_next_free_rec) - 1;
rec = &el->l_recs[i];
- if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
+ if (cpos >=
+ (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
goto set_tail_append;
return;
* The insert code isn't quite ready to deal with all cases of
* left contiguousness. Specifically, if it's an insert into
* the 1st record in a leaf, it will require the adjustment of
- * e_clusters on the last record of the path directly to it's
+ * cluster count on the last record of the path directly to its
* left. For now, just catch that case and fool the layers
* above us. This works just fine for tree_depth == 0, which
* is why we allow that above.
(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
OCFS2_I(inode)->ip_clusters);
+ memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
- rec.e_clusters = cpu_to_le32(new_clusters);
+ rec.e_leaf_clusters = cpu_to_le16(new_clusters);
status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
&insert);
status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
if (status < 0)
mlog_errno(status);
+ else
+ ocfs2_extent_map_insert_rec(inode, &rec);
bail:
if (bh)
tl = &tl_copy->id2.i_dealloc;
num_recs = le16_to_cpu(tl->tl_used);
mlog(0, "cleanup %u records from %llu\n", num_recs,
- (unsigned long long)tl_copy->i_blkno);
+ (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
mutex_lock(&tl_inode->i_mutex);
for(i = 0; i < num_recs; i++) {
* block will be so we can update his h_next_leaf_blk field, as well
* as the dinodes i_last_eb_blk */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
- u32 new_i_clusters,
+ unsigned int clusters_to_del,
struct ocfs2_path *path,
struct buffer_head **new_last_eb)
{
- int ret = 0;
+ int next_free, ret = 0;
u32 cpos;
+ struct ocfs2_extent_rec *rec;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
/* trunc to zero special case - this makes tree_depth = 0
* regardless of what it is. */
- if (!new_i_clusters)
+ if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
goto out;
el = path_leaf_el(path);
BUG_ON(!el->l_next_free_rec);
- /* Make sure that this guy will actually be empty after we
- * clear away the data. */
+ /*
+ * Make sure that this extent list will actually be empty
+ * after we clear away the data. We can shortcut out if
+ * there's more than one non-empty extent in the
+ * list. Otherwise, a check of the remaining extent is
+ * necessary.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ rec = NULL;
if (ocfs2_is_empty_extent(&el->l_recs[0])) {
- if (le16_to_cpu(el->l_next_free_rec) > 1 &&
- le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
+ if (next_free > 2)
goto out;
- } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
- goto out;
+
+ /* We may have a valid extent in index 1, check it. */
+ if (next_free == 2)
+ rec = &el->l_recs[1];
+
+ /*
+ * Fall through - no more nonempty extents, so we want
+ * to delete this leaf.
+ */
+ } else {
+ if (next_free > 1)
+ goto out;
+
+ rec = &el->l_recs[0];
+ }
+
+ if (rec) {
+ /*
+ * Check if we'll only be trimming off the end of this
+ * extent.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+ goto out;
+ }
ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
if (ret) {
return ret;
}
+/*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The path is walked from its last node (index == p_tree_depth) back
+ * up to index 0. In a node whose l_tree_depth is zero, clusters_to_del
+ * is subtracted from the last record's e_leaf_clusters and the record
+ * is removed if that leaves it empty. In other nodes the last record
+ * is either removed (when it points at an extent block this walk just
+ * emptied) or has e_int_clusters recomputed so that it ends at the new
+ * rightmost cluster. Extent blocks (index > 0) left without records
+ * are dropped from the cache and, when h_suballoc_slot is 0, freed.
+ *
+ * *delete_start is set to the first block past what remains of the
+ * trimmed leaf record; it stays 0 if no leaf record was trimmed.
+ *
+ * Returns 0 on success, -EROFS if a node's l_tree_depth does not match
+ * its position in the path, or the error from ocfs2_journal_dirty().
+ *
+ * The caller needs to:
+ * - start journaling of each path component.
+ * - compute and fully set up any new last ext block
+ */
+static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+ handle_t *handle, struct ocfs2_truncate_context *tc,
+ u32 clusters_to_del, u64 *delete_start)
+{
+ int ret, i, index = path->p_tree_depth;
+ u32 new_edge = 0; /* 0 means "no leaf record was trimmed" (tested below) */
+ u64 deleted_eb = 0; /* blkno of the extent block emptied on the previous iteration, else 0 */
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ *delete_start = 0;
+
+ while (index >= 0) {
+ bh = path->p_node[index].bh;
+ el = path->p_node[index].el;
+
+ mlog(0, "traveling tree (index = %d, block = %llu)\n",
+ index, (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+ if (index !=
+ (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has invalid ext. block %llu",
+ inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+
+find_tail_record:
+ i = le16_to_cpu(el->l_next_free_rec) - 1; /* index of the last used record */
+ rec = &el->l_recs[i];
+
+ mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+ "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+ if (le16_to_cpu(el->l_tree_depth) == 0) {
+ /*
+ * If the leaf block contains a single empty
+ * extent and no other records, we can just remove
+ * the block.
+ */
+ if (i == 0 && ocfs2_is_empty_extent(rec)) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(0);
+
+ goto delete;
+ }
+
+ /*
+ * Remove any empty extents by shifting things
+ * left. That should make life much easier on
+ * the code below. This condition is rare
+ * enough that we shouldn't see a performance
+ * hit.
+ */
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ for(i = 0;
+ i < le16_to_cpu(el->l_next_free_rec); i++)
+ el->l_recs[i] = el->l_recs[i + 1];
+
+ memset(&el->l_recs[i], 0,
+ sizeof(struct ocfs2_extent_rec));
+
+ /*
+ * We've modified our extent list. The
+ * simplest way to handle this change
+ * is to begin the search from the
+ * start again.
+ */
+ goto find_tail_record;
+ }
+
+ le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+ /*
+ * We'll use "new_edge" on our way back up the
+ * tree to know what our rightmost cpos is.
+ */
+ new_edge = le16_to_cpu(rec->e_leaf_clusters);
+ new_edge += le32_to_cpu(rec->e_cpos);
+
+ /*
+ * The caller will use this to delete data blocks.
+ */
+ *delete_start = le64_to_cpu(rec->e_blkno)
+ + ocfs2_clusters_to_blocks(inode->i_sb,
+ le16_to_cpu(rec->e_leaf_clusters));
+
+ /*
+ * If it's now empty, remove this record.
+ */
+ if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+ }
+ } else {
+ if (le64_to_cpu(rec->e_blkno) == deleted_eb) { /* child block was emptied below us */
+ memset(rec, 0,
+ sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&el->l_next_free_rec, -1);
+
+ goto delete;
+ }
+
+ /* Can this actually happen? */
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ goto delete;
+
+ /*
+ * We never actually deleted any clusters
+ * because our leaf was empty. There's no
+ * reason to adjust the rightmost edge then.
+ */
+ if (new_edge == 0)
+ goto delete;
+
+ rec->e_int_clusters = cpu_to_le32(new_edge);
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos)); /* e_int_clusters = new_edge - e_cpos */
+
+ /*
+ * A deleted child record should have been
+ * caught above.
+ */
+ BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
+ }
+
+delete:
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "extent list container %llu, after: record %d: "
+ "(%u, %u, %llu), next = %u.\n",
+ (unsigned long long)bh->b_blocknr, i,
+ le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+ (unsigned long long)le64_to_cpu(rec->e_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ /*
+ * We must be careful to only attempt delete of an
+ * extent block (and not the root inode block).
+ */
+ if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+ struct ocfs2_extent_block *eb =
+ (struct ocfs2_extent_block *)bh->b_data;
+
+ /*
+ * Save this for use when processing the
+ * parent block.
+ */
+ deleted_eb = le64_to_cpu(eb->h_blkno);
+
+ mlog(0, "deleting this extent block.\n");
+
+ ocfs2_remove_from_cache(inode, bh);
+
+ BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+ BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+ BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+ if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+ /*
+ * This code only understands how to
+ * lock the suballocator in slot 0,
+ * which is fine because allocation is
+ * only ever done out of that
+ * suballocator too. A future version
+ * might change that however, so avoid
+ * a free if we don't know how to
+ * handle it. This way an fs incompat
+ * bit will not be necessary.
+ */
+ ret = ocfs2_free_extent_block(handle,
+ tc->tc_ext_alloc_inode,
+ tc->tc_ext_alloc_bh,
+ eb);
+
+ /* An error here is not fatal. */
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ } else {
+ deleted_eb = 0;
+ }
+
+ index--; /* move one level closer to the root */
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
static int ocfs2_do_truncate(struct ocfs2_super *osb,
unsigned int clusters_to_del,
struct inode *inode,
struct ocfs2_truncate_context *tc,
struct ocfs2_path *path)
{
- int status, i, index;
+ int status;
struct ocfs2_dinode *fe;
- struct ocfs2_extent_block *eb;
struct ocfs2_extent_block *last_eb = NULL;
struct ocfs2_extent_list *el;
- struct buffer_head *eb_bh = NULL;
struct buffer_head *last_eb_bh = NULL;
u64 delete_blk = 0;
fe = (struct ocfs2_dinode *) fe_bh->b_data;
- status = ocfs2_find_new_last_ext_blk(inode,
- le32_to_cpu(fe->i_clusters) -
- clusters_to_del,
+ status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
path, &last_eb_bh);
if (status < 0) {
mlog_errno(status);
* Each component will be touched, so we might as well journal
* here to avoid having to handle errors later.
*/
- for (i = 0; i < path_num_items(path); i++) {
- status = ocfs2_journal_access(handle, inode,
- path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ status = ocfs2_journal_access_path(inode, handle, path);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
}
if (last_eb_bh) {
* Lower levels depend on this never happening, but it's best
* to check it up here before changing the tree.
*/
- if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
+ if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
ocfs2_error(inode->i_sb,
"Inode %lu has an empty extent record, depth %u\n",
inode->i_ino, le16_to_cpu(el->l_tree_depth));
+ status = -EROFS;
goto bail;
}
spin_unlock(&OCFS2_I(inode)->ip_lock);
le32_add_cpu(&fe->i_clusters, -clusters_to_del);
- i = le16_to_cpu(el->l_next_free_rec) - 1;
-
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
- /* tree depth zero, we can just delete the clusters, otherwise
- * we need to record the offset of the next level extent block
- * as we may overwrite it. */
- if (!el->l_tree_depth) {
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
-
- if (!el->l_recs[i].e_clusters) {
- /* if we deleted the whole extent record, then clear
- * out the other fields and update the extent
- * list.
- */
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- /*
- * The leftmost record might be an empty extent -
- * delete it here too.
- */
- if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
- el->l_recs[0].e_cpos = 0;
- el->l_recs[0].e_blkno = 0;
- el->l_next_free_rec = 0;
- }
- }
+ status = ocfs2_trim_tree(inode, path, handle, tc,
+ clusters_to_del, &delete_blk);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
}
if (le32_to_cpu(fe->i_clusters) == 0) {
}
}
- index = 1;
- /* if our tree depth > 0, update all the tree blocks below us. */
- while (index <= path->p_tree_depth) {
- eb_bh = path->p_node[index].bh;
- eb = (struct ocfs2_extent_block *)eb_bh->b_data;
- el = path->p_node[index].el;
-
- mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
- index, (unsigned long long)eb_bh->b_blocknr);
-
- BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
- if (index !=
- (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
- ocfs2_error(inode->i_sb,
- "Inode %lu has invalid ext. block %llu\n",
- inode->i_ino,
- (unsigned long long)eb_bh->b_blocknr);
+ if (delete_blk) {
+ status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+ clusters_to_del);
+ if (status < 0) {
+ mlog_errno(status);
goto bail;
}
+ }
+ status = 0;
+bail:
- i = le16_to_cpu(el->l_next_free_rec) - 1;
+ mlog_exit(status);
+ return status;
+}
- mlog(0, "extent block %llu, before: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return 0; /* 'handle' is not used by this callback */
+}
- BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
- le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
- /* bottom-most block requires us to delete data.*/
- if (!el->l_tree_depth)
- delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
- + ocfs2_clusters_to_blocks(osb->sb,
- le32_to_cpu(el->l_recs[i].e_clusters));
- if (!el->l_recs[i].e_clusters) {
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_blkno = 0;
- BUG_ON(!el->l_next_free_rec);
- le16_add_cpu(&el->l_next_free_rec, -1);
- }
- if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
- el->l_recs[0].e_cpos = 0;
- el->l_recs[0].e_blkno = 0;
- el->l_next_free_rec = 0;
- }
-
- mlog(0, "extent block %llu, after: record %d: "
- "(%u, %u, %llu), next = %u\n",
- (unsigned long long)le64_to_cpu(eb->h_blkno), i,
- le32_to_cpu(el->l_recs[i].e_cpos),
- le32_to_cpu(el->l_recs[i].e_clusters),
- (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
- le16_to_cpu(el->l_next_free_rec));
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ return ocfs2_journal_dirty_data(handle, bh); /* unlike the writeback variant, also passes bh to the journal */
+}
- status = ocfs2_journal_dirty(handle, eb_bh);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+ struct page **pages, int numpages,
+ u64 phys, handle_t *handle)
+{
+ int i, ret, partial = 0;
+ void *kaddr;
+ struct page *page;
+ unsigned int from, to = PAGE_CACHE_SIZE; /* byte range [from, to) zeroed within the current page */
+ struct super_block *sb = inode->i_sb;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ if (numpages == 0)
+ goto out;
+
+ from = isize & (PAGE_CACHE_SIZE - 1); /* offset of isize within the 1st page */
+ if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+ /*
+ * Since 'from' has been capped to a value below page
+ * size, this calculation won't be able to overflow
+ * 'to'
+ */
+ to = ocfs2_align_bytes_to_clusters(sb, from);
+
+ /*
+ * The truncate tail in this case should never contain
+ * more than one page at maximum. The loop below also
+ * assumes this.
+ */
+ BUG_ON(numpages != 1);
+ }
+
+ for(i = 0; i < numpages; i++) {
+ page = pages[i];
+
+ BUG_ON(from > PAGE_CACHE_SIZE);
+ BUG_ON(to > PAGE_CACHE_SIZE);
+
+ ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+ if (ret)
+ mlog_errno(ret); /* logged only; zeroing continues regardless */
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + from, 0, to - from);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ /*
+ * Need to set the buffers we zero'd into uptodate
+ * here if they aren't - ocfs2_map_page_blocks()
+ * might've skipped some
+ */
+ if (ocfs2_should_order_data(inode)) {
+ ret = walk_page_buffers(handle,
+ page_buffers(page),
+ from, to, &partial,
+ ocfs2_ordered_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_writeback_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
}
- if (!el->l_next_free_rec) {
- mlog(0, "deleting this extent block.\n");
+ if (!partial)
+ SetPageUptodate(page);
- ocfs2_remove_from_cache(inode, eb_bh);
+ flush_dcache_page(page);
- BUG_ON(el->l_recs[0].e_clusters);
- BUG_ON(el->l_recs[0].e_cpos);
- BUG_ON(el->l_recs[0].e_blkno);
+ /*
+ * Every page after the 1st one should be completely zero'd.
+ */
+ from = 0;
+ }
+out:
+ if (pages) {
+ for (i = 0; i < numpages; i++) { /* unlock and drop the reference on every page passed in */
+ page = pages[i];
+ unlock_page(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
+ }
+ }
+}
- /*
- * We need to remove this extent block from
- * the list above it.
- *
- * Since we've passed it already in this loop,
- * no need to worry about journaling.
- */
- el = path->p_node[index - 1].el;
- i = le16_to_cpu(el->l_next_free_rec) - 1;
- BUG_ON(i < 0);
- el->l_recs[i].e_cpos = 0;
- el->l_recs[i].e_clusters = 0;
- el->l_recs[i].e_blkno = 0;
- le16_add_cpu(&el->l_next_free_rec, -1);
-
- if (eb->h_suballoc_slot == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- status = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+ int *num, u64 *phys)
+{
+ int i, numpages = 0, ret = 0; /* numpages counts entries filled in pages[] */
+ unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+ unsigned int ext_flags;
+ struct super_block *sb = inode->i_sb;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long index;
+ u64 next_cluster_bytes;
+
+ BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+ /* Cluster boundary, so we don't need to grab any pages. */
+ if ((isize & (csize - 1)) == 0)
+ goto out;
+
+ ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+ phys, NULL, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* Tail is a hole. */
+ if (*phys == 0)
+ goto out;
+
+ /* Tail is marked as unwritten, we can count on write to zero
+ * in that case. */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ goto out;
+
+ next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+ index = isize >> PAGE_CACHE_SHIFT; /* page index containing isize */
+ do {
+ pages[numpages] = grab_cache_page(mapping, index);
+ if (!pages[numpages]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ numpages++;
+ index++;
+ } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); /* stop at the page index of the cluster-aligned end */
+
+out:
+ if (ret != 0) { /* on failure, give back every page grabbed so far */
+ if (pages) {
+ for (i = 0; i < numpages; i++) {
+ if (pages[i]) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
}
}
}
- index++;
+ numpages = 0;
}
- BUG_ON(!delete_blk);
- status = ocfs2_truncate_log_append(osb, handle, delete_blk,
- clusters_to_del);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
+ *num = numpages; /* 0 on error or when an early exit above found nothing to grab */
+
+ return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ *
+ * Returns 0 immediately when the file system does not support sparse
+ * allocation, and 0 when ocfs2_grab_eof_pages() reports no pages to
+ * zero. Otherwise returns -ENOMEM if the page array cannot be
+ * allocated, the error from ocfs2_grab_eof_pages(), or the result of
+ * do_sync_mapping_range().
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+ u64 new_i_size)
+{
+ int ret, numpages;
+ loff_t endbyte;
+ struct page **pages = NULL;
+ u64 phys;
+
+ /*
+ * File systems which don't support sparse files zero on every
+ * extend.
+ */
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+ sizeof(struct page *), GFP_NOFS); /* one slot per page in a cluster */
+ if (pages == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
}
- status = 0;
-bail:
- mlog_exit(status);
- return status;
+ ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (numpages == 0)
+ goto out;
+
+ ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+ handle); /* also unlocks and releases the grabbed pages */
+
+ /*
+ * Initiate writeout of the pages we zero'd here. We don't
+ * wait on them - the truncate_inode_pages() call later will
+ * do that for us.
+ */
+ endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+ endbyte - 1, SYNC_FILE_RANGE_WRITE);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ if (pages)
+ kfree(pages);
+
+ return ret;
}
/*
mlog_errno(status);
goto bail;
}
+
+ ocfs2_extent_map_trunc(inode, new_highest_cpos);
+
start:
+ /*
+ * Check that we still have allocation to delete.
+ */
+ if (OCFS2_I(inode)->ip_clusters == 0) {
+ status = 0;
+ goto bail;
+ }
+
/*
* Truncate always works against the rightmost tree branch.
*/
* - no record needs to be removed (truncate has completed)
*/
el = path_leaf_el(path);
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has empty extent block at %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)path_leaf_bh(path)->b_blocknr);
+ status = -EROFS;
+ goto bail;
+ }
+
i = le16_to_cpu(el->l_next_free_rec) - 1;
range = le32_to_cpu(el->l_recs[i].e_cpos) +
- le32_to_cpu(el->l_recs[i].e_clusters);
+ ocfs2_rec_clusters(el, &el->l_recs[i]);
if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
clusters_to_del = 0;
} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
- clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
+ clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
} else if (range > new_highest_cpos) {
- clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+ clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
le32_to_cpu(el->l_recs[i].e_cpos)) -
new_highest_cpos;
} else {
ocfs2_reinit_path(path, 1);
/*
- * Only loop if we still have allocation.
+ * The check above will catch the case where we've truncated
+ * away all allocation.
*/
- if (OCFS2_I(inode)->ip_clusters)
- goto start;
+ goto start;
+
bail:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
fe = (struct ocfs2_dinode *) fe_bh->b_data;
mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
- "%llu\n", fe->i_clusters, new_i_clusters,
- (unsigned long long)fe->i_size);
-
- if (!ocfs2_sparse_alloc(osb) &&
- le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
- ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
- "%u and size %llu whereas struct inode has "
- "cluster count %u and size %llu which caused an "
- "invalid truncate to %u clusters.",
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- le32_to_cpu(fe->i_clusters),
- (unsigned long long)le64_to_cpu(fe->i_size),
- OCFS2_I(inode)->ip_clusters, i_size_read(inode),
- new_i_clusters);
- mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
- status = -EIO;
- goto bail;
- }
+ "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
+ (unsigned long long)le64_to_cpu(fe->i_size));
*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
if (!(*tc)) {