X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=fs%2Focfs2%2Fextent_map.c;h=c58668a326fe87f58074741965c531761fe97359;hb=4937ce87959629d31e9b09cf5bdf1e12a305c805;hp=80ac69f11d9f4d2354fa5dab987e4a1904a5547e;hpb=1709775828de83b099554176b6d4971ebc772962;p=linux-2.6 diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d..c58668a326 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -3,8 +3,7 @@ * * extent_map.c * - * In-memory extent map for OCFS2. Man, this code was prettier in - * the library. + * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. * @@ -26,1016 +25,499 @@ #include #include #include -#include -#include #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include #include "ocfs2.h" +#include "alloc.h" #include "extent_map.h" #include "inode.h" #include "super.h" #include "buffer_head_io.h" - -/* - * SUCK SUCK SUCK - * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h - */ - -struct ocfs2_extent_map_entry { - struct rb_node e_node; - int e_tree_depth; - struct ocfs2_extent_rec e_rec; -}; - -struct ocfs2_em_insert_context { - int need_left; - int need_right; - struct ocfs2_extent_map_entry *new_ent; - struct ocfs2_extent_map_entry *old_ent; - struct ocfs2_extent_map_entry *left_ent; - struct ocfs2_extent_map_entry *right_ent; -}; - -static struct kmem_cache *ocfs2_em_ent_cachep = NULL; - - -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent); -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth); -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent); -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el); -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_map_entry **ret_ent); -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt); - -/* returns 1 only if the rec contains all the given clusters -- that is that - * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + - * clusters) is >= the argument's endpoint */ -static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, - u32 cpos, u32 clusters) -{ - if (le32_to_cpu(rec->e_cpos) > cpos) - return 0; - if (cpos + clusters > le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) - return 0; - return 1; -} - - /* - * Find an entry in the tree that intersects the region passed in. - * Note that this will find straddled intervals, it is up to the - * callers to enforce any boundary conditions. + * The extent caching implementation is intentionally trivial. * - * Callers must hold ip_lock. This lookup is not guaranteed to return - * a tree_depth 0 match, and as such can race inserts if the lock - * were not held. - * - * The rb_node garbage lets insertion share the search. Trivial - * callers pass NULL. + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. */ -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent) + +void ocfs2_extent_map_init(struct inode *inode) { - struct rb_node **p = &em->em_extents.rb_node; - struct rb_node *parent = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - while (*p) - { - parent = *p; - ent = rb_entry(parent, struct ocfs2_extent_map_entry, - e_node); - if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { - p = &(*p)->rb_left; - ent = NULL; - } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters))) { - p = &(*p)->rb_right; - ent = NULL; - } else - break; - } + struct ocfs2_inode_info *oi = OCFS2_I(inode); - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - return ent; + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); } -/* - * Find the leaf containing the interval we want. While we're on our - * way down the tree, fill in every record we see at any depth, because - * we might want it later. - * - * Note that this code is run without ip_lock. That's because it - * sleeps while reading. If someone is also filling the extent list at - * the same time we are, we might have to restart. - */ -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el) +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) { - int i, ret; - struct buffer_head *eb_bh = NULL; - u64 blkno; - u32 rec_end; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_rec *rec; - - /* - * The bh data containing the el cannot change here, because - * we hold alloc_sem. So we can do this without other - * locks. - */ - while (el->l_tree_depth) - { - blkno = 0; - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - rec_end = (le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - - ret = -EBADR; - if (rec_end > OCFS2_I(inode)->ip_clusters) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - goto out_free; - } - - if (rec_end <= cpos) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - - /* - * We've found a record that matches our - * interval. We don't insert it because we're - * about to traverse it. - */ - - /* Check to see if we're stradling */ - ret = -ESRCH; - if (!ocfs2_extent_rec_contains_clusters(rec, - cpos, - clusters)) { - mlog_errno(ret); - goto out_free; - } + unsigned int range; + struct ocfs2_extent_map_item *emi; - /* - * If we've already found a record, the el has - * two records covering the same interval. - * EEEK! - */ - ret = -EBADR; - if (blkno) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, i, - (unsigned long long)le64_to_cpu(rec->e_blkno)); - goto out_free; - } + *ret_emi = NULL; - blkno = le64_to_cpu(rec->e_blkno); - } + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; - /* - * We don't support holes, and we're still up - * in the branches, so we'd better have found someone - */ - ret = -EBADR; - if (!blkno) { - ocfs2_error(inode->i_sb, - "No record found for (cpos = %u, clusters = %u) on inode %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_errno(ret); - goto out_free; - } + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); - if (eb_bh) { - brelse(eb_bh); - eb_bh = NULL; - } - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - blkno, &eb_bh, OCFS2_BH_CACHED, - inode); - if (ret) { - mlog_errno(ret); - goto out_free; - } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out_free; + *ret_emi = emi; + break; } - el = &eb->h_list; } +} - BUG_ON(el->l_tree_depth); - - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - return ret; - } - - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; } - ret = 0; + spin_unlock(&oi->ip_lock); -out_free: - if (eb_bh) - brelse(eb_bh); + if (emi == NULL) + return -ENOENT; - return ret; + return 0; } /* - * This lookup actually will read from disk. It has one invariant: - * It will never re-traverse blocks. This means that all inserts should - * be new regions or more granular regions (both allowed by insert). + * Forget about all clusters equal to or greater than cpos. */ -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, - u32 clusters, - struct ocfs2_extent_map_entry **ret_ent) +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - int ret; - u64 blkno; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct buffer_head *bh = NULL; - struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; - struct ocfs2_extent_list *el; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (ent) { - if (!ent->e_tree_depth) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - *ret_ent = ent; - return 0; + struct ocfs2_extent_map_item *emi, *n; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. */ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; } - blkno = le64_to_cpu(ent->e_rec.e_blkno); - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; - } - eb = (struct ocfs2_extent_block *)bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - brelse(bh); - return -EIO; - } - el = &eb->h_list; - } else { - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; } - di = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - brelse(bh); - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); - return -EIO; - } - el = &di->id2.i_list; - } - - ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); - brelse(bh); - if (ret) { - mlog_errno(ret); - return ret; } + spin_unlock(&oi->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (!ent) { - ret = -ESRCH; - mlog_errno(ret); - return ret; + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { + list_del(&emi->ei_list); + kfree(emi); } - - /* FIXME: Make sure this isn't a corruption */ - BUG_ON(ent->e_tree_depth); - - *ret_ent = ent; - - return 0; } /* - * Callers must hold ip_lock. This can insert pieces of the tree, - * thus racing lookup if the lock weren't held. + * Is any part of emi2 contained within emi1 */ -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent) +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) { - struct rb_node **p, *parent; - struct ocfs2_extent_map_entry *old_ent; + unsigned int range1, range2; - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), - le32_to_cpu(ent->e_rec.e_clusters), - &p, &parent); - if (old_ent) - return -EEXIST; + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; - rb_link_node(&ent->e_node, parent, p); - rb_insert_color(&ent->e_node, &em->em_extents); + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; return 0; } +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} /* - * Simple rule: on any return code other than -EAGAIN, anything left - * in the insert_context will be freed. - * - * Simple rule #2: A return code of -EEXIST from this function or - * its calls to ocfs2_extent_map_insert_entry() signifies that another - * thread beat us to the insert. It is not an actual error, but it - * tells the caller we have no more work to do. + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. */ -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt) +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *old_ent; - - ctxt->need_left = 0; - ctxt->need_right = 0; - ctxt->old_ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - if (!ret) { - ctxt->new_ent = NULL; - goto out_unlock; - } - - /* Since insert_entry failed, the map MUST have old_ent */ - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), - le32_to_cpu(rec->e_clusters), - NULL, NULL); - - BUG_ON(!old_ent); - - if (old_ent->e_tree_depth < tree_depth) { - /* Another thread beat us to the lower tree_depth */ - ret = -EEXIST; - goto out_unlock; - } - - if (old_ent->e_tree_depth == tree_depth) { - /* - * Another thread beat us to this tree_depth. - * Let's make sure we agree with that thread (the - * extent_rec should be identical). - */ - if (!memcmp(rec, &old_ent->e_rec, - sizeof(struct ocfs2_extent_rec))) - ret = 0; - else - /* FIXME: Should this be ESRCH/EBADR??? */ - ret = -EEXIST; - - goto out_unlock; - } - /* - * We do it in this order specifically so that no actual tree - * changes occur until we have all the pieces we need. We - * don't want malloc failures to leave an inconsistent tree. - * Whenever we drop the lock, another process could be - * inserting. Also note that, if another process just beat us - * to an insert, we might not need the same pieces we needed - * the first go round. In the end, the pieces we need will - * be used, and the pieces we don't will be freed. + * Handle contiguousness */ - ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > - le32_to_cpu(old_ent->e_rec.e_cpos)); - ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) > - (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); - ret = -EAGAIN; - if (ctxt->need_left) { - if (!ctxt->left_ent) - goto out_unlock; - *(ctxt->left_ent) = *old_ent; - ctxt->left_ent->e_rec.e_clusters = - cpu_to_le32(le32_to_cpu(rec->e_cpos) - - le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); - } - if (ctxt->need_right) { - if (!ctxt->right_ent) - goto out_unlock; - *(ctxt->right_ent) = *old_ent; - ctxt->right_ent->e_rec.e_cpos = - cpu_to_le32(le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - ctxt->right_ent->e_rec.e_clusters = - cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) - - le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); - } - - rb_erase(&old_ent->e_node, &em->em_extents); - /* Now that he's erased, set him up for deletion */ - ctxt->old_ent = old_ent; - - if (ctxt->need_left) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->left_ent); - if (ret) - goto out_unlock; - ctxt->left_ent = NULL; + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; } - if (ctxt->need_right) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->right_ent); - if (ret) - goto out_unlock; - ctxt->right_ent = NULL; + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; } - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - - if (!ret) - ctxt->new_ent = NULL; - -out_unlock: - spin_unlock(&OCFS2_I(inode)->ip_lock); - - return ret; + /* No merge was possible. */ + return 0; } - -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth) +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) { - int ret; - struct ocfs2_em_insert_context ctxt = {0, }; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - return ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } } - /* Zero e_clusters means a truncated tail record. It better be EOF */ - if (!rec->e_clusters) { - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno); - return ret; - } + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. + */ - /* Ignore the truncated tail */ - return 0; - } + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); - ret = -ENOMEM; - ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.new_ent) { - mlog_errno(ret); - return ret; - } + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; - ctxt.new_ent->e_rec = *rec; - ctxt.new_ent->e_tree_depth = tree_depth; - - do { - ret = -ENOMEM; - if (ctxt.need_left && !ctxt.left_ent) { - ctxt.left_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.left_ent) - break; + goto search; } - if (ctxt.need_right && !ctxt.right_ent) { - ctxt.right_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.right_ent) - break; - } - - ret = ocfs2_extent_map_try_insert(inode, rec, - tree_depth, &ctxt); - } while (ret == -EAGAIN); - if ((ret < 0) && (ret != -EEXIST)) - mlog_errno(ret); + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } - if (ctxt.left_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); - if (ctxt.right_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); - if (ctxt.old_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); - if (ctxt.new_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + spin_unlock(&oi->ip_lock); - return ret; +out: + if (new_emi) + kfree(new_emi); } /* - * Append this record to the tail of the extent map. It must be - * tree_depth 0. The record might be an extension of an existing - * record, and as such that needs to be handled. eg: - * - * Existing record in the extent map: - * - * cpos = 10, len = 10 - * |---------| - * - * New Record: - * - * cpos = 10, len = 20 - * |------------------| - * - * The passed record is the new on-disk record. The new_clusters value - * is how many clusters were added to the file. If the append is a - * contiguous append, the new_clusters has been added to - * rec->e_clusters. If the append is an entirely new extent, then - * rec->e_clusters is == new_clusters. + * Return the 1st index within el which contains an extent start + * larger than v_cluster. */ -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters) +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct ocfs2_extent_rec *old; - - BUG_ON(!new_clusters); - BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + int i; + struct ocfs2_extent_rec *rec; - if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (em->em_clusters + new_clusters), - "Inode %llu:\n" - "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" - "em->em_clusters = %u + new_clusters = %u = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), - le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), - em->em_clusters, new_clusters, - em->em_clusters + new_clusters); - - em->em_clusters += new_clusters; - - ret = -ENOENT; - if (le32_to_cpu(rec->e_clusters) > new_clusters) { - /* This is a contiguous append */ - ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, - NULL, NULL); - if (ent) { - old = &ent->e_rec; - BUG_ON((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (le32_to_cpu(old->e_cpos) + - le32_to_cpu(old->e_clusters) + - new_clusters)); - if (ent->e_tree_depth == 0) { - BUG_ON(le32_to_cpu(old->e_cpos) != - le32_to_cpu(rec->e_cpos)); - BUG_ON(le64_to_cpu(old->e_blkno) != - le64_to_cpu(rec->e_blkno)); - ret = 0; - } - /* - * Let non-leafs fall through as -ENOENT to - * force insertion of the new leaf. - */ - le32_add_cpu(&old->e_clusters, new_clusters); - } + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; } - if (ret == -ENOENT) - ret = ocfs2_extent_map_insert(inode, rec, 0); - if (ret < 0) - mlog_errno(ret); - return ret; + return i; } -#if 0 -/* Code here is included but defined out as it completes the extent - * map api and may be used in the future. */ - /* - * Look up the record containing this cluster offset. This record is - * part of the extent map. Do not free it. Any changes you make to - * it will reflect in the extent map. So, if your last extent - * is (cpos = 10, clusters = 10) and you truncate the file by 5 - * clusters, you can do: + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. * - * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); - * rec->e_clusters -= 5; + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. * - * The lookup does not read from disk. If the map isn't filled in for - * an entry, you won't find it. - * - * Also note that the returned record is valid until alloc_sem is - * dropped. After that, truncate and extend can happen. Caveat Emptor. + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. */ -int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, - struct ocfs2_extent_rec **rec, - int *tree_depth) +static int ocfs2_figure_hole_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { - int ret = -ENOENT; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; - *rec = NULL; + i = ocfs2_search_for_hole_index(el, v_cluster); - if (cpos >= OCFS2_I(inode)->ip_clusters) - return -EINVAL; + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (cpos >= em->em_clusters) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * Check the next leaf for any extents. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters ; - } - - ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, - NULL, NULL); - if (ent) { - *rec = &ent->e_rec; - if (tree_depth) - *tree_depth = ent->e_tree_depth; - ret = 0; - } + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; - return ret; -} + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; -int ocfs2_extent_map_get_clusters(struct inode *inode, - u32 v_cpos, int count, - u32 *p_cpos, int *ret_count) -{ - int ret; - u32 coff, ccount; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent = NULL; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { + ret = -EROFS; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); + goto out; + } - *p_cpos = ccount = 0; + el = &next_eb->h_list; - if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) - return -EINVAL; + i = ocfs2_search_for_hole_index(el, v_cluster); + } - if ((v_cpos + count) > em->em_clusters) { +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } - - - ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); - if (ret) - return ret; - - if (ent) { - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, - v_cpos, - count)) - return -ESRCH; - - coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); - *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(ent->e_rec.e_blkno)) + - coff; - - if (ret_count) - *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; - - return 0; + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; } - - return -ENOENT; + ret = 0; +out: + brelse(next_eb_bh); + return ret; } -#endif /* 0 */ - -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count) +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) { - int ret; - u64 boff; - u32 cpos, clusters; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - struct ocfs2_extent_map_entry *ent = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + int ret, i; + unsigned int flags = 0; + struct buffer_head *di_bh = NULL; + struct buffer_head *eb_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; + u32 coff; - *p_blkno = 0; - - cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - clusters = ocfs2_blocks_to_clusters(inode->i_sb, - (u64)count + bpc - 1); - if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { - ret = -EINVAL; + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; mlog_errno(ret); - return ret; + goto out; } - if ((cpos + clusters) > em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; - ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, + &di_bh, OCFS2_BH_CACHED, inode); if (ret) { mlog_errno(ret); - return ret; + goto out; } - if (ent) - { - rec = &ent->e_rec; + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { - ret = -ESRCH; + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { mlog_errno(ret); - return ret; + goto out; } - boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - - le32_to_cpu(rec->e_cpos)); - boff += (v_blkno & (u64)(bpc - 1)); - *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; - if (ret_count) { - *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(rec->e_clusters)) - boff; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; } - - return 0; } - return -ENOENT; -} - -int ocfs2_extent_map_init(struct inode *inode) -{ - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - - em->em_extents = RB_ROOT; - em->em_clusters = 0; - - return 0; -} - -/* Needs the lock */ -static void __ocfs2_extent_map_drop(struct inode *inode, - u32 new_clusters, - struct rb_node **free_head, - struct ocfs2_extent_map_entry **tail_ent) -{ - struct rb_node *node, *next; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + v_cluster, + num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + rec = &el->l_recs[i]; - *free_head = NULL; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); - ent = NULL; - node = rb_last(&em->em_extents); - while (node) - { - next = rb_prev(node); + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) - break; + coff = v_cluster - le32_to_cpu(rec->e_cpos); - rb_erase(&ent->e_node, &em->em_extents); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; - node->rb_right = *free_head; - *free_head = node; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; - ent = NULL; - node = next; - } + flags = rec->e_flags; - /* Do we have an entry straddling new_clusters? */ - if (tail_ent) { - if (ent && - ((le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) - *tail_ent = ent; - else - *tail_ent = NULL; + ocfs2_extent_map_insert_rec(inode, rec); } -} - -static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) -{ - struct rb_node *node; - struct ocfs2_extent_map_entry *ent; - while (free_head) { - node = free_head; - free_head = node->rb_right; + if (extent_flags) + *extent_flags = flags; - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - kmem_cache_free(ocfs2_em_ent_cachep, ent); - } +out: + brelse(di_bh); + brelse(eb_bh); + return ret; } /* - * Remove all entries past new_clusters, inclusive of an entry that - * contains new_clusters. This is effectively a cache forget. - * - * If you want to also clip the last extent by some number of clusters, - * you need to call ocfs2_extent_map_trunc(). - * This code does not check or modify ip_clusters. + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. */ -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) { - struct rb_node *free_head = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - spin_lock(&OCFS2_I(inode)->ip_lock); + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - if (ent) { - rb_erase(&ent->e_node, &em->em_extents); - ent->e_node.rb_right = free_head; - free_head = &ent->e_node; + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; } - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} - -/* - * Remove all entries past new_clusters and also clip any extent - * straddling new_clusters, if there is one. This does not check - * or modify ip_clusters - */ -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); - - if (ent) - ent->e_rec.e_clusters = cpu_to_le32(new_clusters - - le32_to_cpu(ent->e_rec.e_cpos)); - - OCFS2_I(inode)->ip_map.em_clusters = new_clusters; - - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } -int __init init_ocfs2_extent_maps(void) -{ - ocfs2_em_ent_cachep = - kmem_cache_create("ocfs2_em_ent", - sizeof(struct ocfs2_extent_map_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!ocfs2_em_ent_cachep) - return -ENOMEM; + *p_blkno = boff; - return 0; -} + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } -void exit_ocfs2_extent_maps(void) -{ - kmem_cache_destroy(ocfs2_em_ent_cachep); +out: + return ret; }