]> err.no Git - linux-2.6/blob - fs/ocfs2/alloc.c
ocfs2: sparse b-tree support
[linux-2.6] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
33
34 #include "ocfs2.h"
35
36 #include "alloc.h"
37 #include "dlmglue.h"
38 #include "extent_map.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "sysfile.h"
44 #include "file.h"
45 #include "super.h"
46 #include "uptodate.h"
47
48 #include "buffer_head_io.h"
49
50 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
51
52 /*
53  * Structures which describe a path through a btree, and functions to
54  * manipulate them.
55  *
56  * The idea here is to be as generic as possible with the tree
57  * manipulation code.
58  */
59 struct ocfs2_path_item {
60         struct buffer_head              *bh;
61         struct ocfs2_extent_list        *el;
62 };
63
64 #define OCFS2_MAX_PATH_DEPTH    5
65
66 struct ocfs2_path {
67         int                     p_tree_depth;
68         struct ocfs2_path_item  p_node[OCFS2_MAX_PATH_DEPTH];
69 };
70
71 #define path_root_bh(_path) ((_path)->p_node[0].bh)
72 #define path_root_el(_path) ((_path)->p_node[0].el)
73 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
74 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
75 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
76
77 /*
78  * Reset the actual path elements so that we can re-use the structure
79  * to build another path. Generally, this involves freeing the buffer
80  * heads.
81  */
82 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
83 {
84         int i, start = 0, depth = 0;
85         struct ocfs2_path_item *node;
86
87         if (keep_root)
88                 start = 1;
89
90         for(i = start; i < path_num_items(path); i++) {
91                 node = &path->p_node[i];
92
93                 brelse(node->bh);
94                 node->bh = NULL;
95                 node->el = NULL;
96         }
97
98         /*
99          * Tree depth may change during truncate, or insert. If we're
100          * keeping the root extent list, then make sure that our path
101          * structure reflects the proper depth.
102          */
103         if (keep_root)
104                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
105
106         path->p_tree_depth = depth;
107 }
108
109 static void ocfs2_free_path(struct ocfs2_path *path)
110 {
111         if (path) {
112                 ocfs2_reinit_path(path, 0);
113                 kfree(path);
114         }
115 }
116
117 /*
118  * Make the *dest path the same as src and re-initialize src path to
119  * have a root only.
120  */
121 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
122 {
123         int i;
124
125         BUG_ON(path_root_bh(dest) != path_root_bh(src));
126
127         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
128                 brelse(dest->p_node[i].bh);
129
130                 dest->p_node[i].bh = src->p_node[i].bh;
131                 dest->p_node[i].el = src->p_node[i].el;
132
133                 src->p_node[i].bh = NULL;
134                 src->p_node[i].el = NULL;
135         }
136 }
137
138 /*
139  * Insert an extent block at given index.
140  *
141  * This will not take an additional reference on eb_bh.
142  */
143 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
144                                         struct buffer_head *eb_bh)
145 {
146         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
147
148         /*
149          * Right now, no root bh is an extent block, so this helps
150          * catch code errors with dinode trees. The assertion can be
151          * safely removed if we ever need to insert extent block
152          * structures at the root.
153          */
154         BUG_ON(index == 0);
155
156         path->p_node[index].bh = eb_bh;
157         path->p_node[index].el = &eb->h_list;
158 }
159
160 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
161                                          struct ocfs2_extent_list *root_el)
162 {
163         struct ocfs2_path *path;
164
165         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
166
167         path = kzalloc(sizeof(*path), GFP_NOFS);
168         if (path) {
169                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
170                 get_bh(root_bh);
171                 path_root_bh(path) = root_bh;
172                 path_root_el(path) = root_el;
173         }
174
175         return path;
176 }
177
178 /*
179  * Allocate and initialize a new path based on a disk inode tree.
180  */
181 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
182 {
183         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
184         struct ocfs2_extent_list *el = &di->id2.i_list;
185
186         return ocfs2_new_path(di_bh, el);
187 }
188
189 /*
190  * Convenience function to journal all components in a path.
191  */
192 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
193                                      struct ocfs2_path *path)
194 {
195         int i, ret = 0;
196
197         if (!path)
198                 goto out;
199
200         for(i = 0; i < path_num_items(path); i++) {
201                 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
202                                            OCFS2_JOURNAL_ACCESS_WRITE);
203                 if (ret < 0) {
204                         mlog_errno(ret);
205                         goto out;
206                 }
207         }
208
209 out:
210         return ret;
211 }
212
213 enum ocfs2_contig_type {
214         CONTIG_NONE = 0,
215         CONTIG_LEFT,
216         CONTIG_RIGHT
217 };
218
219 static int ocfs2_block_extent_contig(struct super_block *sb,
220                                      struct ocfs2_extent_rec *ext,
221                                      u64 blkno)
222 {
223         return blkno == (le64_to_cpu(ext->e_blkno) +
224                          ocfs2_clusters_to_blocks(sb,
225                                                   le32_to_cpu(ext->e_clusters)));
226 }
227
228 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
229                                   struct ocfs2_extent_rec *right)
230 {
231         return (le32_to_cpu(left->e_cpos) + le32_to_cpu(left->e_clusters) ==
232                 le32_to_cpu(right->e_cpos));
233 }
234
235 static enum ocfs2_contig_type
236         ocfs2_extent_contig(struct inode *inode,
237                             struct ocfs2_extent_rec *ext,
238                             struct ocfs2_extent_rec *insert_rec)
239 {
240         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
241
242         if (ocfs2_extents_adjacent(ext, insert_rec) &&
243             ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
244                         return CONTIG_RIGHT;
245
246         blkno = le64_to_cpu(ext->e_blkno);
247         if (ocfs2_extents_adjacent(insert_rec, ext) &&
248             ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
249                 return CONTIG_LEFT;
250
251         return CONTIG_NONE;
252 }
253
254 /*
255  * NOTE: We can have pretty much any combination of contiguousness and
256  * appending.
257  *
258  * The usefulness of APPEND_TAIL is more in that it lets us know that
259  * we'll have to update the path to that leaf.
260  */
261 enum ocfs2_append_type {
262         APPEND_NONE = 0,
263         APPEND_TAIL,
264 };
265
266 struct ocfs2_insert_type {
267         enum ocfs2_append_type  ins_appending;
268         enum ocfs2_contig_type  ins_contig;
269         int                     ins_contig_index;
270         int                     ins_free_records;
271         int                     ins_tree_depth;
272 };
273
274 /*
275  * How many free extents have we got before we need more meta data?
276  */
277 int ocfs2_num_free_extents(struct ocfs2_super *osb,
278                            struct inode *inode,
279                            struct ocfs2_dinode *fe)
280 {
281         int retval;
282         struct ocfs2_extent_list *el;
283         struct ocfs2_extent_block *eb;
284         struct buffer_head *eb_bh = NULL;
285
286         mlog_entry_void();
287
288         if (!OCFS2_IS_VALID_DINODE(fe)) {
289                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
290                 retval = -EIO;
291                 goto bail;
292         }
293
294         if (fe->i_last_eb_blk) {
295                 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
296                                           &eb_bh, OCFS2_BH_CACHED, inode);
297                 if (retval < 0) {
298                         mlog_errno(retval);
299                         goto bail;
300                 }
301                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
302                 el = &eb->h_list;
303         } else
304                 el = &fe->id2.i_list;
305
306         BUG_ON(el->l_tree_depth != 0);
307
308         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
309 bail:
310         if (eb_bh)
311                 brelse(eb_bh);
312
313         mlog_exit(retval);
314         return retval;
315 }
316
317 /* expects array to already be allocated
318  *
319  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
320  * l_count for you
321  */
322 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
323                                      handle_t *handle,
324                                      struct inode *inode,
325                                      int wanted,
326                                      struct ocfs2_alloc_context *meta_ac,
327                                      struct buffer_head *bhs[])
328 {
329         int count, status, i;
330         u16 suballoc_bit_start;
331         u32 num_got;
332         u64 first_blkno;
333         struct ocfs2_extent_block *eb;
334
335         mlog_entry_void();
336
337         count = 0;
338         while (count < wanted) {
339                 status = ocfs2_claim_metadata(osb,
340                                               handle,
341                                               meta_ac,
342                                               wanted - count,
343                                               &suballoc_bit_start,
344                                               &num_got,
345                                               &first_blkno);
346                 if (status < 0) {
347                         mlog_errno(status);
348                         goto bail;
349                 }
350
351                 for(i = count;  i < (num_got + count); i++) {
352                         bhs[i] = sb_getblk(osb->sb, first_blkno);
353                         if (bhs[i] == NULL) {
354                                 status = -EIO;
355                                 mlog_errno(status);
356                                 goto bail;
357                         }
358                         ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
359
360                         status = ocfs2_journal_access(handle, inode, bhs[i],
361                                                       OCFS2_JOURNAL_ACCESS_CREATE);
362                         if (status < 0) {
363                                 mlog_errno(status);
364                                 goto bail;
365                         }
366
367                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
368                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
369                         /* Ok, setup the minimal stuff here. */
370                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
371                         eb->h_blkno = cpu_to_le64(first_blkno);
372                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
373
374 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
375                         /* we always use slot zero's suballocator */
376                         eb->h_suballoc_slot = 0;
377 #else
378                         eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
379 #endif
380                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
381                         eb->h_list.l_count =
382                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
383
384                         suballoc_bit_start++;
385                         first_blkno++;
386
387                         /* We'll also be dirtied by the caller, so
388                          * this isn't absolutely necessary. */
389                         status = ocfs2_journal_dirty(handle, bhs[i]);
390                         if (status < 0) {
391                                 mlog_errno(status);
392                                 goto bail;
393                         }
394                 }
395
396                 count += num_got;
397         }
398
399         status = 0;
400 bail:
401         if (status < 0) {
402                 for(i = 0; i < wanted; i++) {
403                         if (bhs[i])
404                                 brelse(bhs[i]);
405                         bhs[i] = NULL;
406                 }
407         }
408         mlog_exit(status);
409         return status;
410 }
411
412 /*
413  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
414  *
415  * Returns the sum of the rightmost extent rec logical offset and
416  * cluster count.
417  *
418  * ocfs2_add_branch() uses this to determine what logical cluster
419  * value should be populated into the leftmost new branch records.
420  *
421  * ocfs2_shift_tree_depth() uses this to determine the # clusters
422  * value for the new topmost tree record.
423  */
424 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
425 {
426         int i;
427
428         i = le16_to_cpu(el->l_next_free_rec) - 1;
429
430         return le32_to_cpu(el->l_recs[i].e_cpos) +
431                 le32_to_cpu(el->l_recs[i].e_clusters);
432 }
433
434 /*
435  * Add an entire tree branch to our inode. eb_bh is the extent block
436  * to start at, if we don't want to start the branch at the dinode
437  * structure.
438  *
439  * last_eb_bh is required as we have to update it's next_leaf pointer
440  * for the new last extent block.
441  *
442  * the new branch will be 'empty' in the sense that every block will
443  * contain a single record with e_clusters == 0.
444  */
445 static int ocfs2_add_branch(struct ocfs2_super *osb,
446                             handle_t *handle,
447                             struct inode *inode,
448                             struct buffer_head *fe_bh,
449                             struct buffer_head *eb_bh,
450                             struct buffer_head *last_eb_bh,
451                             struct ocfs2_alloc_context *meta_ac)
452 {
453         int status, new_blocks, i;
454         u64 next_blkno, new_last_eb_blk;
455         struct buffer_head *bh;
456         struct buffer_head **new_eb_bhs = NULL;
457         struct ocfs2_dinode *fe;
458         struct ocfs2_extent_block *eb;
459         struct ocfs2_extent_list  *eb_el;
460         struct ocfs2_extent_list  *el;
461         u32 new_cpos;
462
463         mlog_entry_void();
464
465         BUG_ON(!last_eb_bh);
466
467         fe = (struct ocfs2_dinode *) fe_bh->b_data;
468
469         if (eb_bh) {
470                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
471                 el = &eb->h_list;
472         } else
473                 el = &fe->id2.i_list;
474
475         /* we never add a branch to a leaf. */
476         BUG_ON(!el->l_tree_depth);
477
478         new_blocks = le16_to_cpu(el->l_tree_depth);
479
480         /* allocate the number of new eb blocks we need */
481         new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
482                              GFP_KERNEL);
483         if (!new_eb_bhs) {
484                 status = -ENOMEM;
485                 mlog_errno(status);
486                 goto bail;
487         }
488
489         status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
490                                            meta_ac, new_eb_bhs);
491         if (status < 0) {
492                 mlog_errno(status);
493                 goto bail;
494         }
495
496         eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
497         new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
498
499         /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
500          * linked with the rest of the tree.
501          * conversly, new_eb_bhs[0] is the new bottommost leaf.
502          *
503          * when we leave the loop, new_last_eb_blk will point to the
504          * newest leaf, and next_blkno will point to the topmost extent
505          * block. */
506         next_blkno = new_last_eb_blk = 0;
507         for(i = 0; i < new_blocks; i++) {
508                 bh = new_eb_bhs[i];
509                 eb = (struct ocfs2_extent_block *) bh->b_data;
510                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
511                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
512                         status = -EIO;
513                         goto bail;
514                 }
515                 eb_el = &eb->h_list;
516
517                 status = ocfs2_journal_access(handle, inode, bh,
518                                               OCFS2_JOURNAL_ACCESS_CREATE);
519                 if (status < 0) {
520                         mlog_errno(status);
521                         goto bail;
522                 }
523
524                 eb->h_next_leaf_blk = 0;
525                 eb_el->l_tree_depth = cpu_to_le16(i);
526                 eb_el->l_next_free_rec = cpu_to_le16(1);
527                 /*
528                  * This actually counts as an empty extent as
529                  * c_clusters == 0
530                  */
531                 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
532                 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
533                 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
534                 if (!eb_el->l_tree_depth)
535                         new_last_eb_blk = le64_to_cpu(eb->h_blkno);
536
537                 status = ocfs2_journal_dirty(handle, bh);
538                 if (status < 0) {
539                         mlog_errno(status);
540                         goto bail;
541                 }
542
543                 next_blkno = le64_to_cpu(eb->h_blkno);
544         }
545
546         /* This is a bit hairy. We want to update up to three blocks
547          * here without leaving any of them in an inconsistent state
548          * in case of error. We don't have to worry about
549          * journal_dirty erroring as it won't unless we've aborted the
550          * handle (in which case we would never be here) so reserving
551          * the write with journal_access is all we need to do. */
552         status = ocfs2_journal_access(handle, inode, last_eb_bh,
553                                       OCFS2_JOURNAL_ACCESS_WRITE);
554         if (status < 0) {
555                 mlog_errno(status);
556                 goto bail;
557         }
558         status = ocfs2_journal_access(handle, inode, fe_bh,
559                                       OCFS2_JOURNAL_ACCESS_WRITE);
560         if (status < 0) {
561                 mlog_errno(status);
562                 goto bail;
563         }
564         if (eb_bh) {
565                 status = ocfs2_journal_access(handle, inode, eb_bh,
566                                               OCFS2_JOURNAL_ACCESS_WRITE);
567                 if (status < 0) {
568                         mlog_errno(status);
569                         goto bail;
570                 }
571         }
572
573         /* Link the new branch into the rest of the tree (el will
574          * either be on the fe, or the extent block passed in. */
575         i = le16_to_cpu(el->l_next_free_rec);
576         el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
577         el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
578         el->l_recs[i].e_clusters = 0;
579         le16_add_cpu(&el->l_next_free_rec, 1);
580
581         /* fe needs a new last extent block pointer, as does the
582          * next_leaf on the previously last-extent-block. */
583         fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
584
585         eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
586         eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
587
588         status = ocfs2_journal_dirty(handle, last_eb_bh);
589         if (status < 0)
590                 mlog_errno(status);
591         status = ocfs2_journal_dirty(handle, fe_bh);
592         if (status < 0)
593                 mlog_errno(status);
594         if (eb_bh) {
595                 status = ocfs2_journal_dirty(handle, eb_bh);
596                 if (status < 0)
597                         mlog_errno(status);
598         }
599
600         status = 0;
601 bail:
602         if (new_eb_bhs) {
603                 for (i = 0; i < new_blocks; i++)
604                         if (new_eb_bhs[i])
605                                 brelse(new_eb_bhs[i]);
606                 kfree(new_eb_bhs);
607         }
608
609         mlog_exit(status);
610         return status;
611 }
612
613 /*
614  * adds another level to the allocation tree.
615  * returns back the new extent block so you can add a branch to it
616  * after this call.
617  */
618 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
619                                   handle_t *handle,
620                                   struct inode *inode,
621                                   struct buffer_head *fe_bh,
622                                   struct ocfs2_alloc_context *meta_ac,
623                                   struct buffer_head **ret_new_eb_bh)
624 {
625         int status, i;
626         u32 new_clusters;
627         struct buffer_head *new_eb_bh = NULL;
628         struct ocfs2_dinode *fe;
629         struct ocfs2_extent_block *eb;
630         struct ocfs2_extent_list  *fe_el;
631         struct ocfs2_extent_list  *eb_el;
632
633         mlog_entry_void();
634
635         status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
636                                            &new_eb_bh);
637         if (status < 0) {
638                 mlog_errno(status);
639                 goto bail;
640         }
641
642         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
643         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
644                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
645                 status = -EIO;
646                 goto bail;
647         }
648
649         eb_el = &eb->h_list;
650         fe = (struct ocfs2_dinode *) fe_bh->b_data;
651         fe_el = &fe->id2.i_list;
652
653         status = ocfs2_journal_access(handle, inode, new_eb_bh,
654                                       OCFS2_JOURNAL_ACCESS_CREATE);
655         if (status < 0) {
656                 mlog_errno(status);
657                 goto bail;
658         }
659
660         /* copy the fe data into the new extent block */
661         eb_el->l_tree_depth = fe_el->l_tree_depth;
662         eb_el->l_next_free_rec = fe_el->l_next_free_rec;
663         for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
664                 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
665                 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
666                 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
667         }
668
669         status = ocfs2_journal_dirty(handle, new_eb_bh);
670         if (status < 0) {
671                 mlog_errno(status);
672                 goto bail;
673         }
674
675         status = ocfs2_journal_access(handle, inode, fe_bh,
676                                       OCFS2_JOURNAL_ACCESS_WRITE);
677         if (status < 0) {
678                 mlog_errno(status);
679                 goto bail;
680         }
681
682         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
683
684         /* update fe now */
685         le16_add_cpu(&fe_el->l_tree_depth, 1);
686         fe_el->l_recs[0].e_cpos = 0;
687         fe_el->l_recs[0].e_blkno = eb->h_blkno;
688         fe_el->l_recs[0].e_clusters = cpu_to_le32(new_clusters);
689         for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
690                 fe_el->l_recs[i].e_cpos = 0;
691                 fe_el->l_recs[i].e_clusters = 0;
692                 fe_el->l_recs[i].e_blkno = 0;
693         }
694         fe_el->l_next_free_rec = cpu_to_le16(1);
695
696         /* If this is our 1st tree depth shift, then last_eb_blk
697          * becomes the allocated extent block */
698         if (fe_el->l_tree_depth == cpu_to_le16(1))
699                 fe->i_last_eb_blk = eb->h_blkno;
700
701         status = ocfs2_journal_dirty(handle, fe_bh);
702         if (status < 0) {
703                 mlog_errno(status);
704                 goto bail;
705         }
706
707         *ret_new_eb_bh = new_eb_bh;
708         new_eb_bh = NULL;
709         status = 0;
710 bail:
711         if (new_eb_bh)
712                 brelse(new_eb_bh);
713
714         mlog_exit(status);
715         return status;
716 }
717
718 /*
719  * Should only be called when there is no space left in any of the
720  * leaf nodes. What we want to do is find the lowest tree depth
721  * non-leaf extent block with room for new records. There are three
722  * valid results of this search:
723  *
724  * 1) a lowest extent block is found, then we pass it back in
725  *    *lowest_eb_bh and return '0'
726  *
727  * 2) the search fails to find anything, but the dinode has room. We
728  *    pass NULL back in *lowest_eb_bh, but still return '0'
729  *
730  * 3) the search fails to find anything AND the dinode is full, in
731  *    which case we return > 0
732  *
733  * return status < 0 indicates an error.
734  */
735 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
736                                     struct inode *inode,
737                                     struct buffer_head *fe_bh,
738                                     struct buffer_head **target_bh)
739 {
740         int status = 0, i;
741         u64 blkno;
742         struct ocfs2_dinode *fe;
743         struct ocfs2_extent_block *eb;
744         struct ocfs2_extent_list  *el;
745         struct buffer_head *bh = NULL;
746         struct buffer_head *lowest_bh = NULL;
747
748         mlog_entry_void();
749
750         *target_bh = NULL;
751
752         fe = (struct ocfs2_dinode *) fe_bh->b_data;
753         el = &fe->id2.i_list;
754
755         while(le16_to_cpu(el->l_tree_depth) > 1) {
756                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
757                         ocfs2_error(inode->i_sb, "Dinode %llu has empty "
758                                     "extent list (next_free_rec == 0)",
759                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
760                         status = -EIO;
761                         goto bail;
762                 }
763                 i = le16_to_cpu(el->l_next_free_rec) - 1;
764                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
765                 if (!blkno) {
766                         ocfs2_error(inode->i_sb, "Dinode %llu has extent "
767                                     "list where extent # %d has no physical "
768                                     "block start",
769                                     (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
770                         status = -EIO;
771                         goto bail;
772                 }
773
774                 if (bh) {
775                         brelse(bh);
776                         bh = NULL;
777                 }
778
779                 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
780                                           inode);
781                 if (status < 0) {
782                         mlog_errno(status);
783                         goto bail;
784                 }
785
786                 eb = (struct ocfs2_extent_block *) bh->b_data;
787                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
788                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
789                         status = -EIO;
790                         goto bail;
791                 }
792                 el = &eb->h_list;
793
794                 if (le16_to_cpu(el->l_next_free_rec) <
795                     le16_to_cpu(el->l_count)) {
796                         if (lowest_bh)
797                                 brelse(lowest_bh);
798                         lowest_bh = bh;
799                         get_bh(lowest_bh);
800                 }
801         }
802
803         /* If we didn't find one and the fe doesn't have any room,
804          * then return '1' */
805         if (!lowest_bh
806             && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
807                 status = 1;
808
809         *target_bh = lowest_bh;
810 bail:
811         if (bh)
812                 brelse(bh);
813
814         mlog_exit(status);
815         return status;
816 }
817
818 static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
819 {
820         return !rec->e_clusters;
821 }
822
823 /*
824  * This function will discard the rightmost extent record.
825  */
826 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
827 {
828         int next_free = le16_to_cpu(el->l_next_free_rec);
829         int count = le16_to_cpu(el->l_count);
830         unsigned int num_bytes;
831
832         BUG_ON(!next_free);
833         /* This will cause us to go off the end of our extent list. */
834         BUG_ON(next_free >= count);
835
836         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
837
838         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
839 }
840
841 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
842                               struct ocfs2_extent_rec *insert_rec)
843 {
844         int i, insert_index, next_free, has_empty, num_bytes;
845         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
846         struct ocfs2_extent_rec *rec;
847
848         next_free = le16_to_cpu(el->l_next_free_rec);
849         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
850
851         BUG_ON(!next_free);
852
853         /* The tree code before us didn't allow enough room in the leaf. */
854         if (el->l_next_free_rec == el->l_count && !has_empty)
855                 BUG();
856
857         /*
858          * The easiest way to approach this is to just remove the
859          * empty extent and temporarily decrement next_free.
860          */
861         if (has_empty) {
862                 /*
863                  * If next_free was 1 (only an empty extent), this
864                  * loop won't execute, which is fine. We still want
865                  * the decrement above to happen.
866                  */
867                 for(i = 0; i < (next_free - 1); i++)
868                         el->l_recs[i] = el->l_recs[i+1];
869
870                 next_free--;
871         }
872
873         /*
874          * Figure out what the new record index should be.
875          */
876         for(i = 0; i < next_free; i++) {
877                 rec = &el->l_recs[i];
878
879                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
880                         break;
881         }
882         insert_index = i;
883
884         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
885              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
886
887         BUG_ON(insert_index < 0);
888         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
889         BUG_ON(insert_index > next_free);
890
891         /*
892          * No need to memmove if we're just adding to the tail.
893          */
894         if (insert_index != next_free) {
895                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
896
897                 num_bytes = next_free - insert_index;
898                 num_bytes *= sizeof(struct ocfs2_extent_rec);
899                 memmove(&el->l_recs[insert_index + 1],
900                         &el->l_recs[insert_index],
901                         num_bytes);
902         }
903
904         /*
905          * Either we had an empty extent, and need to re-increment or
906          * there was no empty extent on a non full rightmost leaf node,
907          * in which case we still need to increment.
908          */
909         next_free++;
910         el->l_next_free_rec = cpu_to_le16(next_free);
911         /*
912          * Make sure none of the math above just messed up our tree.
913          */
914         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
915
916         el->l_recs[insert_index] = *insert_rec;
917
918 }
919
920 /*
921  * Create an empty extent record .
922  *
923  * l_next_free_rec may be updated.
924  *
925  * If an empty extent already exists do nothing.
926  */
927 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
928 {
929         int next_free = le16_to_cpu(el->l_next_free_rec);
930
931         if (next_free == 0)
932                 goto set_and_inc;
933
934         if (ocfs2_is_empty_extent(&el->l_recs[0]))
935                 return;
936
937         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
938                         "Asked to create an empty extent in a full list:\n"
939                         "count = %u, tree depth = %u",
940                         le16_to_cpu(el->l_count),
941                         le16_to_cpu(el->l_tree_depth));
942
943         ocfs2_shift_records_right(el);
944
945 set_and_inc:
946         le16_add_cpu(&el->l_next_free_rec, 1);
947         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
948 }
949
950 /*
951  * For a rotation which involves two leaf nodes, the "root node" is
952  * the lowest level tree node which contains a path to both leafs. This
953  * resulting set of information can be used to form a complete "subtree"
954  *
955  * This function is passed two full paths from the dinode down to a
956  * pair of adjacent leaves. It's task is to figure out which path
957  * index contains the subtree root - this can be the root index itself
958  * in a worst-case rotation.
959  *
960  * The array index of the subtree root is passed back.
961  */
962 static int ocfs2_find_subtree_root(struct inode *inode,
963                                    struct ocfs2_path *left,
964                                    struct ocfs2_path *right)
965 {
966         int i = 0;
967
968         /*
969          * Check that the caller passed in two paths from the same tree.
970          */
971         BUG_ON(path_root_bh(left) != path_root_bh(right));
972
973         do {
974                 i++;
975
976                 /*
977                  * The caller didn't pass two adjacent paths.
978                  */
979                 mlog_bug_on_msg(i > left->p_tree_depth,
980                                 "Inode %lu, left depth %u, right depth %u\n"
981                                 "left leaf blk %llu, right leaf blk %llu\n",
982                                 inode->i_ino, left->p_tree_depth,
983                                 right->p_tree_depth,
984                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
985                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
986         } while (left->p_node[i].bh->b_blocknr ==
987                  right->p_node[i].bh->b_blocknr);
988
989         return i - 1;
990 }
991
992 typedef void (path_insert_t)(void *, struct buffer_head *);
993
994 /*
995  * Traverse a btree path in search of cpos, starting at root_el.
996  *
997  * This code can be called with a cpos larger than the tree, in which
998  * case it will return the rightmost path.
999  */
1000 static int __ocfs2_find_path(struct inode *inode,
1001                              struct ocfs2_extent_list *root_el, u32 cpos,
1002                              path_insert_t *func, void *data)
1003 {
1004         int i, ret = 0;
1005         u32 range;
1006         u64 blkno;
1007         struct buffer_head *bh = NULL;
1008         struct ocfs2_extent_block *eb;
1009         struct ocfs2_extent_list *el;
1010         struct ocfs2_extent_rec *rec;
1011         struct ocfs2_inode_info *oi = OCFS2_I(inode);
1012
1013         el = root_el;
1014         while (el->l_tree_depth) {
1015                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1016                         ocfs2_error(inode->i_sb,
1017                                     "Inode %llu has empty extent list at "
1018                                     "depth %u\n",
1019                                     (unsigned long long)oi->ip_blkno,
1020                                     le16_to_cpu(el->l_tree_depth));
1021                         ret = -EROFS;
1022                         goto out;
1023
1024                 }
1025
1026                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1027                         rec = &el->l_recs[i];
1028
1029                         /*
1030                          * In the case that cpos is off the allocation
1031                          * tree, this should just wind up returning the
1032                          * rightmost record.
1033                          */
1034                         range = le32_to_cpu(rec->e_cpos) +
1035                                 le32_to_cpu(rec->e_clusters);
1036                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1037                             break;
1038                 }
1039
1040                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1041                 if (blkno == 0) {
1042                         ocfs2_error(inode->i_sb,
1043                                     "Inode %llu has bad blkno in extent list "
1044                                     "at depth %u (index %d)\n",
1045                                     (unsigned long long)oi->ip_blkno,
1046                                     le16_to_cpu(el->l_tree_depth), i);
1047                         ret = -EROFS;
1048                         goto out;
1049                 }
1050
1051                 brelse(bh);
1052                 bh = NULL;
1053                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
1054                                        &bh, OCFS2_BH_CACHED, inode);
1055                 if (ret) {
1056                         mlog_errno(ret);
1057                         goto out;
1058                 }
1059
1060                 eb = (struct ocfs2_extent_block *) bh->b_data;
1061                 el = &eb->h_list;
1062                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1063                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1064                         ret = -EIO;
1065                         goto out;
1066                 }
1067
1068                 if (le16_to_cpu(el->l_next_free_rec) >
1069                     le16_to_cpu(el->l_count)) {
1070                         ocfs2_error(inode->i_sb,
1071                                     "Inode %llu has bad count in extent list "
1072                                     "at block %llu (next free=%u, count=%u)\n",
1073                                     (unsigned long long)oi->ip_blkno,
1074                                     (unsigned long long)bh->b_blocknr,
1075                                     le16_to_cpu(el->l_next_free_rec),
1076                                     le16_to_cpu(el->l_count));
1077                         ret = -EROFS;
1078                         goto out;
1079                 }
1080
1081                 if (func)
1082                         func(data, bh);
1083         }
1084
1085 out:
1086         /*
1087          * Catch any trailing bh that the loop didn't handle.
1088          */
1089         brelse(bh);
1090
1091         return ret;
1092 }
1093
1094 /*
1095  * Given an initialized path (that is, it has a valid root extent
1096  * list), this function will traverse the btree in search of the path
1097  * which would contain cpos.
1098  *
1099  * The path traveled is recorded in the path structure.
1100  *
1101  * Note that this will not do any comparisons on leaf node extent
1102  * records, so it will work fine in the case that we just added a tree
1103  * branch.
1104  */
1105 struct find_path_data {
1106         int index;
1107         struct ocfs2_path *path;
1108 };
1109 static void find_path_ins(void *data, struct buffer_head *bh)
1110 {
1111         struct find_path_data *fp = data;
1112
1113         get_bh(bh);
1114         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1115         fp->index++;
1116 }
1117 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1118                            u32 cpos)
1119 {
1120         struct find_path_data data;
1121
1122         data.index = 1;
1123         data.path = path;
1124         return __ocfs2_find_path(inode, path_root_el(path), cpos,
1125                                  find_path_ins, &data);
1126 }
1127
1128 static void find_leaf_ins(void *data, struct buffer_head *bh)
1129 {
1130         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1131         struct ocfs2_extent_list *el = &eb->h_list;
1132         struct buffer_head **ret = data;
1133
1134         /* We want to retain only the leaf block. */
1135         if (le16_to_cpu(el->l_tree_depth) == 0) {
1136                 get_bh(bh);
1137                 *ret = bh;
1138         }
1139 }
1140 /*
1141  * Find the leaf block in the tree which would contain cpos. No
1142  * checking of the actual leaf is done.
1143  *
1144  * Some paths want to call this instead of allocating a path structure
1145  * and calling ocfs2_find_path().
1146  *
1147  * This function doesn't handle non btree extent lists.
1148  */
1149 static int ocfs2_find_leaf(struct inode *inode,
1150                            struct ocfs2_extent_list *root_el, u32 cpos,
1151                            struct buffer_head **leaf_bh)
1152 {
1153         int ret;
1154         struct buffer_head *bh = NULL;
1155
1156         ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1157         if (ret) {
1158                 mlog_errno(ret);
1159                 goto out;
1160         }
1161
1162         *leaf_bh = bh;
1163 out:
1164         return ret;
1165 }
1166
1167 /*
1168  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1169  *
1170  * Basically, we've moved stuff around at the bottom of the tree and
1171  * we need to fix up the extent records above the changes to reflect
1172  * the new changes.
1173  *
1174  * left_rec: the record on the left.
1175  * left_child_el: is the child list pointed to by left_rec
1176  * right_rec: the record to the right of left_rec
1177  * right_child_el: is the child list pointed to by right_rec
1178  *
1179  * By definition, this only works on interior nodes.
1180  */
1181 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1182                                   struct ocfs2_extent_list *left_child_el,
1183                                   struct ocfs2_extent_rec *right_rec,
1184                                   struct ocfs2_extent_list *right_child_el)
1185 {
1186         u32 left_clusters, right_end;
1187
1188         /*
1189          * Interior nodes never have holes. Their cpos is the cpos of
1190          * the leftmost record in their child list. Their cluster
1191          * count covers the full theoretical range of their child list
1192          * - the range between their cpos and the cpos of the record
1193          * immediately to their right.
1194          */
1195         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1196         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1197         left_rec->e_clusters = cpu_to_le32(left_clusters);
1198
1199         /*
1200          * Calculate the rightmost cluster count boundary before
1201          * moving cpos - we will need to adjust e_clusters after
1202          * updating e_cpos to keep the same highest cluster count.
1203          */
1204         right_end = le32_to_cpu(right_rec->e_cpos);
1205         right_end += le32_to_cpu(right_rec->e_clusters);
1206
1207         right_rec->e_cpos = left_rec->e_cpos;
1208         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1209
1210         right_end -= le32_to_cpu(right_rec->e_cpos);
1211         right_rec->e_clusters = cpu_to_le32(right_end);
1212 }
1213
1214 /*
1215  * Adjust the adjacent root node records involved in a
1216  * rotation. left_el_blkno is passed in as a key so that we can easily
1217  * find it's index in the root list.
1218  */
1219 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1220                                       struct ocfs2_extent_list *left_el,
1221                                       struct ocfs2_extent_list *right_el,
1222                                       u64 left_el_blkno)
1223 {
1224         int i;
1225
1226         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1227                le16_to_cpu(left_el->l_tree_depth));
1228
1229         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1230                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1231                         break;
1232         }
1233
1234         /*
1235          * The path walking code should have never returned a root and
1236          * two paths which are not adjacent.
1237          */
1238         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1239
1240         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1241                                       &root_el->l_recs[i + 1], right_el);
1242 }
1243
1244 /*
1245  * We've changed a leaf block (in right_path) and need to reflect that
1246  * change back up the subtree.
1247  *
1248  * This happens in multiple places:
1249  *   - When we've moved an extent record from the left path leaf to the right
1250  *     path leaf to make room for an empty extent in the left path leaf.
1251  *   - When our insert into the right path leaf is at the leftmost edge
1252  *     and requires an update of the path immediately to it's left. This
1253  *     can occur at the end of some types of rotation and appending inserts.
1254  */
1255 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1256                                        struct ocfs2_path *left_path,
1257                                        struct ocfs2_path *right_path,
1258                                        int subtree_index)
1259 {
1260         int ret, i, idx;
1261         struct ocfs2_extent_list *el, *left_el, *right_el;
1262         struct ocfs2_extent_rec *left_rec, *right_rec;
1263         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1264
1265         /*
1266          * Update the counts and position values within all the
1267          * interior nodes to reflect the leaf rotation we just did.
1268          *
1269          * The root node is handled below the loop.
1270          *
1271          * We begin the loop with right_el and left_el pointing to the
1272          * leaf lists and work our way up.
1273          *
1274          * NOTE: within this loop, left_el and right_el always refer
1275          * to the *child* lists.
1276          */
1277         left_el = path_leaf_el(left_path);
1278         right_el = path_leaf_el(right_path);
1279         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1280                 mlog(0, "Adjust records at index %u\n", i);
1281
1282                 /*
1283                  * One nice property of knowing that all of these
1284                  * nodes are below the root is that we only deal with
1285                  * the leftmost right node record and the rightmost
1286                  * left node record.
1287                  */
1288                 el = left_path->p_node[i].el;
1289                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1290                 left_rec = &el->l_recs[idx];
1291
1292                 el = right_path->p_node[i].el;
1293                 right_rec = &el->l_recs[0];
1294
1295                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1296                                               right_el);
1297
1298                 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1299                 if (ret)
1300                         mlog_errno(ret);
1301
1302                 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1303                 if (ret)
1304                         mlog_errno(ret);
1305
1306                 /*
1307                  * Setup our list pointers now so that the current
1308                  * parents become children in the next iteration.
1309                  */
1310                 left_el = left_path->p_node[i].el;
1311                 right_el = right_path->p_node[i].el;
1312         }
1313
1314         /*
1315          * At the root node, adjust the two adjacent records which
1316          * begin our path to the leaves.
1317          */
1318
1319         el = left_path->p_node[subtree_index].el;
1320         left_el = left_path->p_node[subtree_index + 1].el;
1321         right_el = right_path->p_node[subtree_index + 1].el;
1322
1323         ocfs2_adjust_root_records(el, left_el, right_el,
1324                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
1325
1326         root_bh = left_path->p_node[subtree_index].bh;
1327
1328         ret = ocfs2_journal_dirty(handle, root_bh);
1329         if (ret)
1330                 mlog_errno(ret);
1331 }
1332
1333 static int ocfs2_rotate_subtree_right(struct inode *inode,
1334                                       handle_t *handle,
1335                                       struct ocfs2_path *left_path,
1336                                       struct ocfs2_path *right_path,
1337                                       int subtree_index)
1338 {
1339         int ret, i;
1340         struct buffer_head *right_leaf_bh;
1341         struct buffer_head *left_leaf_bh = NULL;
1342         struct buffer_head *root_bh;
1343         struct ocfs2_extent_list *right_el, *left_el;
1344         struct ocfs2_extent_rec move_rec;
1345
1346         left_leaf_bh = path_leaf_bh(left_path);
1347         left_el = path_leaf_el(left_path);
1348
1349         if (left_el->l_next_free_rec != left_el->l_count) {
1350                 ocfs2_error(inode->i_sb,
1351                             "Inode %llu has non-full interior leaf node %llu"
1352                             "(next free = %u)",
1353                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
1354                             (unsigned long long)left_leaf_bh->b_blocknr,
1355                             le16_to_cpu(left_el->l_next_free_rec));
1356                 return -EROFS;
1357         }
1358
1359         /*
1360          * This extent block may already have an empty record, so we
1361          * return early if so.
1362          */
1363         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1364                 return 0;
1365
1366         root_bh = left_path->p_node[subtree_index].bh;
1367         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1368
1369         ret = ocfs2_journal_access(handle, inode, root_bh,
1370                                    OCFS2_JOURNAL_ACCESS_WRITE);
1371         if (ret) {
1372                 mlog_errno(ret);
1373                 goto out;
1374         }
1375
1376         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1377                 ret = ocfs2_journal_access(handle, inode,
1378                                            right_path->p_node[i].bh,
1379                                            OCFS2_JOURNAL_ACCESS_WRITE);
1380                 if (ret) {
1381                         mlog_errno(ret);
1382                         goto out;
1383                 }
1384
1385                 ret = ocfs2_journal_access(handle, inode,
1386                                            left_path->p_node[i].bh,
1387                                            OCFS2_JOURNAL_ACCESS_WRITE);
1388                 if (ret) {
1389                         mlog_errno(ret);
1390                         goto out;
1391                 }
1392         }
1393
1394         right_leaf_bh = path_leaf_bh(right_path);
1395         right_el = path_leaf_el(right_path);
1396
1397         /* This is a code error, not a disk corruption. */
1398         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1399                         "because rightmost leaf block %llu is empty\n",
1400                         (unsigned long long)OCFS2_I(inode)->ip_blkno,
1401                         (unsigned long long)right_leaf_bh->b_blocknr);
1402
1403         ocfs2_create_empty_extent(right_el);
1404
1405         ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1406         if (ret) {
1407                 mlog_errno(ret);
1408                 goto out;
1409         }
1410
1411         /* Do the copy now. */
1412         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1413         move_rec = left_el->l_recs[i];
1414         right_el->l_recs[0] = move_rec;
1415
1416         /*
1417          * Clear out the record we just copied and shift everything
1418          * over, leaving an empty extent in the left leaf.
1419          *
1420          * We temporarily subtract from next_free_rec so that the
1421          * shift will lose the tail record (which is now defunct).
1422          */
1423         le16_add_cpu(&left_el->l_next_free_rec, -1);
1424         ocfs2_shift_records_right(left_el);
1425         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1426         le16_add_cpu(&left_el->l_next_free_rec, 1);
1427
1428         ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1429         if (ret) {
1430                 mlog_errno(ret);
1431                 goto out;
1432         }
1433
1434         ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1435                                 subtree_index);
1436
1437 out:
1438         return ret;
1439 }
1440
1441 /*
1442  * Given a full path, determine what cpos value would return us a path
1443  * containing the leaf immediately to the left of the current one.
1444  *
1445  * Will return zero if the path passed in is already the leftmost path.
1446  */
1447 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
1448                                          struct ocfs2_path *path, u32 *cpos)
1449 {
1450         int i, j, ret = 0;
1451         u64 blkno;
1452         struct ocfs2_extent_list *el;
1453
1454         *cpos = 0;
1455
1456         blkno = path_leaf_bh(path)->b_blocknr;
1457
1458         /* Start at the tree node just above the leaf and work our way up. */
1459         i = path->p_tree_depth - 1;
1460         while (i >= 0) {
1461                 el = path->p_node[i].el;
1462
1463                 /*
1464                  * Find the extent record just before the one in our
1465                  * path.
1466                  */
1467                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
1468                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
1469                                 if (j == 0) {
1470                                         if (i == 0) {
1471                                                 /*
1472                                                  * We've determined that the
1473                                                  * path specified is already
1474                                                  * the leftmost one - return a
1475                                                  * cpos of zero.
1476                                                  */
1477                                                 goto out;
1478                                         }
1479                                         /*
1480                                          * The leftmost record points to our
1481                                          * leaf - we need to travel up the
1482                                          * tree one level.
1483                                          */
1484                                         goto next_node;
1485                                 }
1486
1487                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
1488                                 *cpos = *cpos + le32_to_cpu(el->l_recs[j - 1].e_clusters) - 1;
1489                                 goto out;
1490                         }
1491                 }
1492
1493                 /*
1494                  * If we got here, we never found a valid node where
1495                  * the tree indicated one should be.
1496                  */
1497                 ocfs2_error(sb,
1498                             "Invalid extent tree at extent block %llu\n",
1499                             (unsigned long long)blkno);
1500                 ret = -EROFS;
1501                 goto out;
1502
1503 next_node:
1504                 blkno = path->p_node[i].bh->b_blocknr;
1505                 i--;
1506         }
1507
1508 out:
1509         return ret;
1510 }
1511
1512 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1513                                            struct ocfs2_path *path)
1514 {
1515         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
1516
1517         if (handle->h_buffer_credits < credits)
1518                 return ocfs2_extend_trans(handle, credits);
1519
1520         return 0;
1521 }
1522
1523 /*
1524  * Trap the case where we're inserting into the theoretical range past
1525  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1526  * whose cpos is less than ours into the right leaf.
1527  *
1528  * It's only necessary to look at the rightmost record of the left
1529  * leaf because the logic that calls us should ensure that the
1530  * theoretical ranges in the path components above the leaves are
1531  * correct.
1532  */
1533 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1534                                                  u32 insert_cpos)
1535 {
1536         struct ocfs2_extent_list *left_el;
1537         struct ocfs2_extent_rec *rec;
1538         int next_free;
1539
1540         left_el = path_leaf_el(left_path);
1541         next_free = le16_to_cpu(left_el->l_next_free_rec);
1542         rec = &left_el->l_recs[next_free - 1];
1543
1544         if (insert_cpos > le32_to_cpu(rec->e_cpos))
1545                 return 1;
1546         return 0;
1547 }
1548
1549 /*
1550  * Rotate all the records in a btree right one record, starting at insert_cpos.
1551  *
1552  * The path to the rightmost leaf should be passed in.
1553  *
1554  * The array is assumed to be large enough to hold an entire path (tree depth).
1555  *
1556  * Upon succesful return from this function:
1557  *
1558  * - The 'right_path' array will contain a path to the leaf block
1559  *   whose range contains e_cpos.
1560  * - That leaf block will have a single empty extent in list index 0.
1561  * - In the case that the rotation requires a post-insert update,
1562  *   *ret_left_path will contain a valid path which can be passed to
1563  *   ocfs2_insert_path().
1564  */
1565 static int ocfs2_rotate_tree_right(struct inode *inode,
1566                                    handle_t *handle,
1567                                    u32 insert_cpos,
1568                                    struct ocfs2_path *right_path,
1569                                    struct ocfs2_path **ret_left_path)
1570 {
1571         int ret, start;
1572         u32 cpos;
1573         struct ocfs2_path *left_path = NULL;
1574
1575         *ret_left_path = NULL;
1576
1577         left_path = ocfs2_new_path(path_root_bh(right_path),
1578                                    path_root_el(right_path));
1579         if (!left_path) {
1580                 ret = -ENOMEM;
1581                 mlog_errno(ret);
1582                 goto out;
1583         }
1584
1585         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
1586         if (ret) {
1587                 mlog_errno(ret);
1588                 goto out;
1589         }
1590
1591         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
1592
1593         /*
1594          * What we want to do here is:
1595          *
1596          * 1) Start with the rightmost path.
1597          *
1598          * 2) Determine a path to the leaf block directly to the left
1599          *    of that leaf.
1600          *
1601          * 3) Determine the 'subtree root' - the lowest level tree node
1602          *    which contains a path to both leaves.
1603          *
1604          * 4) Rotate the subtree.
1605          *
1606          * 5) Find the next subtree by considering the left path to be
1607          *    the new right path.
1608          *
1609          * The check at the top of this while loop also accepts
1610          * insert_cpos == cpos because cpos is only a _theoretical_
1611          * value to get us the left path - insert_cpos might very well
1612          * be filling that hole.
1613          *
1614          * Stop at a cpos of '0' because we either started at the
1615          * leftmost branch (i.e., a tree with one branch and a
1616          * rotation inside of it), or we've gone as far as we can in
1617          * rotating subtrees.
1618          */
1619         while (cpos && insert_cpos <= cpos) {
1620                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
1621                      insert_cpos, cpos);
1622
1623                 ret = ocfs2_find_path(inode, left_path, cpos);
1624                 if (ret) {
1625                         mlog_errno(ret);
1626                         goto out;
1627                 }
1628
1629                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
1630                                 path_leaf_bh(right_path),
1631                                 "Inode %lu: error during insert of %u "
1632                                 "(left path cpos %u) results in two identical "
1633                                 "paths ending at %llu\n",
1634                                 inode->i_ino, insert_cpos, cpos,
1635                                 (unsigned long long)
1636                                 path_leaf_bh(left_path)->b_blocknr);
1637
1638                 if (ocfs2_rotate_requires_path_adjustment(left_path,
1639                                                           insert_cpos)) {
1640                         mlog(0, "Path adjustment required\n");
1641
1642                         /*
1643                          * We've rotated the tree as much as we
1644                          * should. The rest is up to
1645                          * ocfs2_insert_path() to complete, after the
1646                          * record insertion. We indicate this
1647                          * situation by returning the left path.
1648                          *
1649                          * The reason we don't adjust the records here
1650                          * before the record insert is that an error
1651                          * later might break the rule where a parent
1652                          * record e_cpos will reflect the actual
1653                          * e_cpos of the 1st nonempty record of the
1654                          * child list.
1655                          */
1656                         *ret_left_path = left_path;
1657                         goto out_ret_path;
1658                 }
1659
1660                 start = ocfs2_find_subtree_root(inode, left_path, right_path);
1661
1662                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
1663                      start,
1664                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
1665                      right_path->p_tree_depth);
1666
1667                 ret = ocfs2_extend_rotate_transaction(handle, start,
1668                                                       right_path);
1669                 if (ret) {
1670                         mlog_errno(ret);
1671                         goto out;
1672                 }
1673
1674                 ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
1675                                                  right_path, start);
1676                 if (ret) {
1677                         mlog_errno(ret);
1678                         goto out;
1679                 }
1680
1681                 /*
1682                  * There is no need to re-read the next right path
1683                  * as we know that it'll be our current left
1684                  * path. Optimize by copying values instead.
1685                  */
1686                 ocfs2_mv_path(right_path, left_path);
1687
1688                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1689                                                     &cpos);
1690                 if (ret) {
1691                         mlog_errno(ret);
1692                         goto out;
1693                 }
1694         }
1695
1696 out:
1697         ocfs2_free_path(left_path);
1698
1699 out_ret_path:
1700         return ret;
1701 }
1702
1703 /*
1704  * Do the final bits of extent record insertion at the target leaf
1705  * list. If this leaf is part of an allocation tree, it is assumed
1706  * that the tree above has been prepared.
1707  */
1708 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
1709                                  struct ocfs2_extent_list *el,
1710                                  struct ocfs2_insert_type *insert,
1711                                  struct inode *inode)
1712 {
1713         int i = insert->ins_contig_index;
1714         unsigned int range;
1715         struct ocfs2_extent_rec *rec;
1716
1717         BUG_ON(el->l_tree_depth);
1718
1719         /*
1720          * Contiguous insert - either left or right.
1721          */
1722         if (insert->ins_contig != CONTIG_NONE) {
1723                 rec = &el->l_recs[i];
1724                 if (insert->ins_contig == CONTIG_LEFT) {
1725                         rec->e_blkno = insert_rec->e_blkno;
1726                         rec->e_cpos = insert_rec->e_cpos;
1727                 }
1728                 le32_add_cpu(&rec->e_clusters,
1729                              le32_to_cpu(insert_rec->e_clusters));
1730                 return;
1731         }
1732
1733         /*
1734          * Handle insert into an empty leaf.
1735          */
1736         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
1737             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
1738              ocfs2_is_empty_extent(&el->l_recs[0]))) {
1739                 el->l_recs[0] = *insert_rec;
1740                 el->l_next_free_rec = cpu_to_le16(1);
1741                 return;
1742         }
1743
1744         /*
1745          * Appending insert.
1746          */
1747         if (insert->ins_appending == APPEND_TAIL) {
1748                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1749                 rec = &el->l_recs[i];
1750                 range = le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters);
1751                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
1752
1753                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
1754                                 le16_to_cpu(el->l_count),
1755                                 "inode %lu, depth %u, count %u, next free %u, "
1756                                 "rec.cpos %u, rec.clusters %u, "
1757                                 "insert.cpos %u, insert.clusters %u\n",
1758                                 inode->i_ino,
1759                                 le16_to_cpu(el->l_tree_depth),
1760                                 le16_to_cpu(el->l_count),
1761                                 le16_to_cpu(el->l_next_free_rec),
1762                                 le32_to_cpu(el->l_recs[i].e_cpos),
1763                                 le32_to_cpu(el->l_recs[i].e_clusters),
1764                                 le32_to_cpu(insert_rec->e_cpos),
1765                                 le32_to_cpu(insert_rec->e_clusters));
1766                 i++;
1767                 el->l_recs[i] = *insert_rec;
1768                 le16_add_cpu(&el->l_next_free_rec, 1);
1769                 return;
1770         }
1771
1772         /*
1773          * Ok, we have to rotate.
1774          *
1775          * At this point, it is safe to assume that inserting into an
1776          * empty leaf and appending to a leaf have both been handled
1777          * above.
1778          *
1779          * This leaf needs to have space, either by the empty 1st
1780          * extent record, or by virtue of an l_next_rec < l_count.
1781          */
1782         ocfs2_rotate_leaf(el, insert_rec);
1783 }
1784
1785 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
1786                                                 struct ocfs2_dinode *di,
1787                                                 u32 clusters)
1788 {
1789         le32_add_cpu(&di->i_clusters, clusters);
1790         spin_lock(&OCFS2_I(inode)->ip_lock);
1791         OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
1792         spin_unlock(&OCFS2_I(inode)->ip_lock);
1793 }
1794
1795 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
1796                                     struct ocfs2_extent_rec *insert_rec,
1797                                     struct ocfs2_path *right_path,
1798                                     struct ocfs2_path **ret_left_path)
1799 {
1800         int ret, i, next_free;
1801         struct buffer_head *bh;
1802         struct ocfs2_extent_list *el;
1803         struct ocfs2_path *left_path = NULL;
1804
1805         *ret_left_path = NULL;
1806
1807         /*
1808          * If our appending insert is at the leftmost edge of a leaf,
1809          * then we might need to update the rightmost records of the
1810          * neighboring path.
1811          */
1812         el = path_leaf_el(right_path);
1813         next_free = le16_to_cpu(el->l_next_free_rec);
1814         if (next_free == 0 ||
1815             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
1816                 u32 left_cpos;
1817
1818                 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
1819                                                     &left_cpos);
1820                 if (ret) {
1821                         mlog_errno(ret);
1822                         goto out;
1823                 }
1824
1825                 mlog(0, "Append may need a left path update. cpos: %u, "
1826                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
1827                      left_cpos);
1828
1829                 /*
1830                  * No need to worry if the append is already in the
1831                  * leftmost leaf.
1832                  */
1833                 if (left_cpos) {
1834                         left_path = ocfs2_new_path(path_root_bh(right_path),
1835                                                    path_root_el(right_path));
1836                         if (!left_path) {
1837                                 ret = -ENOMEM;
1838                                 mlog_errno(ret);
1839                                 goto out;
1840                         }
1841
1842                         ret = ocfs2_find_path(inode, left_path, left_cpos);
1843                         if (ret) {
1844                                 mlog_errno(ret);
1845                                 goto out;
1846                         }
1847
1848                         /*
1849                          * ocfs2_insert_path() will pass the left_path to the
1850                          * journal for us.
1851                          */
1852                 }
1853         }
1854
1855         ret = ocfs2_journal_access_path(inode, handle, right_path);
1856         if (ret) {
1857                 mlog_errno(ret);
1858                 goto out;
1859         }
1860
1861         el = path_root_el(right_path);
1862         bh = path_root_bh(right_path);
1863         i = 0;
1864         while (1) {
1865                 next_free = le16_to_cpu(el->l_next_free_rec);
1866                 if (next_free == 0) {
1867                         ocfs2_error(inode->i_sb,
1868                                     "Dinode %llu has a bad extent list",
1869                                     (unsigned long long)OCFS2_I(inode)->ip_blkno);
1870                         ret = -EIO;
1871                         goto out;
1872                 }
1873
1874                 el->l_recs[next_free - 1].e_clusters = insert_rec->e_cpos;
1875                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1876                              le32_to_cpu(insert_rec->e_clusters));
1877                 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
1878                             -le32_to_cpu(el->l_recs[next_free - 1].e_cpos));
1879
1880                 ret = ocfs2_journal_dirty(handle, bh);
1881                 if (ret)
1882                         mlog_errno(ret);
1883
1884                 if (++i >= right_path->p_tree_depth)
1885                         break;
1886
1887                 bh = right_path->p_node[i].bh;
1888                 el = right_path->p_node[i].el;
1889         }
1890
1891         *ret_left_path = left_path;
1892         ret = 0;
1893 out:
1894         if (ret != 0)
1895                 ocfs2_free_path(left_path);
1896
1897         return ret;
1898 }
1899
1900 /*
1901  * This function only does inserts on an allocation b-tree. For dinode
1902  * lists, ocfs2_insert_at_leaf() is called directly.
1903  *
1904  * right_path is the path we want to do the actual insert
1905  * in. left_path should only be passed in if we need to update that
1906  * portion of the tree after an edge insert.
1907  */
1908 static int ocfs2_insert_path(struct inode *inode,
1909                              handle_t *handle,
1910                              struct ocfs2_path *left_path,
1911                              struct ocfs2_path *right_path,
1912                              struct ocfs2_extent_rec *insert_rec,
1913                              struct ocfs2_insert_type *insert)
1914 {
1915         int ret, subtree_index;
1916         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
1917         struct ocfs2_extent_list *el;
1918
1919         /*
1920          * Pass both paths to the journal. The majority of inserts
1921          * will be touching all components anyway.
1922          */
1923         ret = ocfs2_journal_access_path(inode, handle, right_path);
1924         if (ret < 0) {
1925                 mlog_errno(ret);
1926                 goto out;
1927         }
1928
1929         if (left_path) {
1930                 int credits = handle->h_buffer_credits;
1931
1932                 /*
1933                  * There's a chance that left_path got passed back to
1934                  * us without being accounted for in the
1935                  * journal. Extend our transaction here to be sure we
1936                  * can change those blocks.
1937                  */
1938                 credits += left_path->p_tree_depth;
1939
1940                 ret = ocfs2_extend_trans(handle, credits);
1941                 if (ret < 0) {
1942                         mlog_errno(ret);
1943                         goto out;
1944                 }
1945
1946                 ret = ocfs2_journal_access_path(inode, handle, left_path);
1947                 if (ret < 0) {
1948                         mlog_errno(ret);
1949                         goto out;
1950                 }
1951         }
1952
1953         el = path_leaf_el(right_path);
1954
1955         ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
1956         ret = ocfs2_journal_dirty(handle, leaf_bh);
1957         if (ret)
1958                 mlog_errno(ret);
1959
1960         if (left_path) {
1961                 /*
1962                  * The rotate code has indicated that we need to fix
1963                  * up portions of the tree after the insert.
1964                  *
1965                  * XXX: Should we extend the transaction here?
1966                  */
1967                 subtree_index = ocfs2_find_subtree_root(inode, left_path,
1968                                                         right_path);
1969                 ocfs2_complete_edge_insert(inode, handle, left_path,
1970                                            right_path, subtree_index);
1971         }
1972
1973         ret = 0;
1974 out:
1975         return ret;
1976 }
1977
1978 static int ocfs2_do_insert_extent(struct inode *inode,
1979                                   handle_t *handle,
1980                                   struct buffer_head *di_bh,
1981                                   struct ocfs2_extent_rec *insert_rec,
1982                                   struct ocfs2_insert_type *type)
1983 {
1984         int ret, rotate = 0;
1985         u32 cpos;
1986         struct ocfs2_path *right_path = NULL;
1987         struct ocfs2_path *left_path = NULL;
1988         struct ocfs2_dinode *di;
1989         struct ocfs2_extent_list *el;
1990
1991         di = (struct ocfs2_dinode *) di_bh->b_data;
1992         el = &di->id2.i_list;
1993
1994         ret = ocfs2_journal_access(handle, inode, di_bh,
1995                                    OCFS2_JOURNAL_ACCESS_WRITE);
1996         if (ret) {
1997                 mlog_errno(ret);
1998                 goto out;
1999         }
2000
2001         if (le16_to_cpu(el->l_tree_depth) == 0) {
2002                 ocfs2_insert_at_leaf(insert_rec, el, type, inode);
2003                 goto out_update_clusters;
2004         }
2005
2006         right_path = ocfs2_new_inode_path(di_bh);
2007         if (!right_path) {
2008                 ret = -ENOMEM;
2009                 mlog_errno(ret);
2010                 goto out;
2011         }
2012
2013         /*
2014          * Determine the path to start with. Rotations need the
2015          * rightmost path, everything else can go directly to the
2016          * target leaf.
2017          */
2018         cpos = le32_to_cpu(insert_rec->e_cpos);
2019         if (type->ins_appending == APPEND_NONE &&
2020             type->ins_contig == CONTIG_NONE) {
2021                 rotate = 1;
2022                 cpos = UINT_MAX;
2023         }
2024
2025         ret = ocfs2_find_path(inode, right_path, cpos);
2026         if (ret) {
2027                 mlog_errno(ret);
2028                 goto out;
2029         }
2030
2031         /*
2032          * Rotations and appends need special treatment - they modify
2033          * parts of the tree's above them.
2034          *
2035          * Both might pass back a path immediate to the left of the
2036          * one being inserted to. This will be cause
2037          * ocfs2_insert_path() to modify the rightmost records of
2038          * left_path to account for an edge insert.
2039          *
2040          * XXX: When modifying this code, keep in mind that an insert
2041          * can wind up skipping both of these two special cases...
2042          */
2043         if (rotate) {
2044                 ret = ocfs2_rotate_tree_right(inode, handle,
2045                                               le32_to_cpu(insert_rec->e_cpos),
2046                                               right_path, &left_path);
2047                 if (ret) {
2048                         mlog_errno(ret);
2049                         goto out;
2050                 }
2051         } else if (type->ins_appending == APPEND_TAIL
2052                    && type->ins_contig != CONTIG_LEFT) {
2053                 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
2054                                                right_path, &left_path);
2055                 if (ret) {
2056                         mlog_errno(ret);
2057                         goto out;
2058                 }
2059         }
2060
2061         ret = ocfs2_insert_path(inode, handle, left_path, right_path,
2062                                 insert_rec, type);
2063         if (ret) {
2064                 mlog_errno(ret);
2065                 goto out;
2066         }
2067
2068 out_update_clusters:
2069         ocfs2_update_dinode_clusters(inode, di,
2070                                      le32_to_cpu(insert_rec->e_clusters));
2071
2072         ret = ocfs2_journal_dirty(handle, di_bh);
2073         if (ret)
2074                 mlog_errno(ret);
2075
2076 out:
2077         ocfs2_free_path(left_path);
2078         ocfs2_free_path(right_path);
2079
2080         return ret;
2081 }
2082
2083 static void ocfs2_figure_contig_type(struct inode *inode,
2084                                      struct ocfs2_insert_type *insert,
2085                                      struct ocfs2_extent_list *el,
2086                                      struct ocfs2_extent_rec *insert_rec)
2087 {
2088         int i;
2089         enum ocfs2_contig_type contig_type = CONTIG_NONE;
2090
2091         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2092                 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
2093                                                   insert_rec);
2094                 if (contig_type != CONTIG_NONE) {
2095                         insert->ins_contig_index = i;
2096                         break;
2097                 }
2098         }
2099         insert->ins_contig = contig_type;
2100 }
2101
2102 /*
2103  * This should only be called against the righmost leaf extent list.
2104  *
2105  * ocfs2_figure_appending_type() will figure out whether we'll have to
2106  * insert at the tail of the rightmost leaf.
2107  *
2108  * This should also work against the dinode list for tree's with 0
2109  * depth. If we consider the dinode list to be the rightmost leaf node
2110  * then the logic here makes sense.
2111  */
2112 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
2113                                         struct ocfs2_extent_list *el,
2114                                         struct ocfs2_extent_rec *insert_rec)
2115 {
2116         int i;
2117         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
2118         struct ocfs2_extent_rec *rec;
2119
2120         insert->ins_appending = APPEND_NONE;
2121
2122         BUG_ON(el->l_tree_depth);
2123
2124         if (!el->l_next_free_rec)
2125                 goto set_tail_append;
2126
2127         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2128                 /* Were all records empty? */
2129                 if (le16_to_cpu(el->l_next_free_rec) == 1)
2130                         goto set_tail_append;
2131         }
2132
2133         i = le16_to_cpu(el->l_next_free_rec) - 1;
2134         rec = &el->l_recs[i];
2135
2136         if (cpos >= (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)))
2137                 goto set_tail_append;
2138
2139         return;
2140
2141 set_tail_append:
2142         insert->ins_appending = APPEND_TAIL;
2143 }
2144
2145 /*
2146  * Helper function called at the begining of an insert.
2147  *
2148  * This computes a few things that are commonly used in the process of
2149  * inserting into the btree:
2150  *   - Whether the new extent is contiguous with an existing one.
2151  *   - The current tree depth.
2152  *   - Whether the insert is an appending one.
2153  *   - The total # of free records in the tree.
2154  *
2155  * All of the information is stored on the ocfs2_insert_type
2156  * structure.
2157  */
2158 static int ocfs2_figure_insert_type(struct inode *inode,
2159                                     struct buffer_head *di_bh,
2160                                     struct buffer_head **last_eb_bh,
2161                                     struct ocfs2_extent_rec *insert_rec,
2162                                     struct ocfs2_insert_type *insert)
2163 {
2164         int ret;
2165         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2166         struct ocfs2_extent_block *eb;
2167         struct ocfs2_extent_list *el;
2168         struct ocfs2_path *path = NULL;
2169         struct buffer_head *bh = NULL;
2170
2171         el = &di->id2.i_list;
2172         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
2173
2174         if (el->l_tree_depth) {
2175                 /*
2176                  * If we have tree depth, we read in the
2177                  * rightmost extent block ahead of time as
2178                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
2179                  * may want it later.
2180                  */
2181                 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
2182                                        le64_to_cpu(di->i_last_eb_blk), &bh,
2183                                        OCFS2_BH_CACHED, inode);
2184                 if (ret) {
2185                         mlog_exit(ret);
2186                         goto out;
2187                 }
2188                 eb = (struct ocfs2_extent_block *) bh->b_data;
2189                 el = &eb->h_list;
2190         }
2191
2192         /*
2193          * Unless we have a contiguous insert, we'll need to know if
2194          * there is room left in our allocation tree for another
2195          * extent record.
2196          *
2197          * XXX: This test is simplistic, we can search for empty
2198          * extent records too.
2199          */
2200         insert->ins_free_records = le16_to_cpu(el->l_count) -
2201                 le16_to_cpu(el->l_next_free_rec);
2202
2203         if (!insert->ins_tree_depth) {
2204                 ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2205                 ocfs2_figure_appending_type(insert, el, insert_rec);
2206                 return 0;
2207         }
2208
2209         path = ocfs2_new_inode_path(di_bh);
2210         if (!path) {
2211                 ret = -ENOMEM;
2212                 mlog_errno(ret);
2213                 goto out;
2214         }
2215
2216         /*
2217          * In the case that we're inserting past what the tree
2218          * currently accounts for, ocfs2_find_path() will return for
2219          * us the rightmost tree path. This is accounted for below in
2220          * the appending code.
2221          */
2222         ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
2223         if (ret) {
2224                 mlog_errno(ret);
2225                 goto out;
2226         }
2227
2228         el = path_leaf_el(path);
2229
2230         /*
2231          * Now that we have the path, there's two things we want to determine:
2232          * 1) Contiguousness (also set contig_index if this is so)
2233          *
2234          * 2) Are we doing an append? We can trivially break this up
2235          *     into two types of appends: simple record append, or a
2236          *     rotate inside the tail leaf.
2237          */
2238         ocfs2_figure_contig_type(inode, insert, el, insert_rec);
2239
2240         /*
2241          * The insert code isn't quite ready to deal with all cases of
2242          * left contiguousness. Specifically, if it's an insert into
2243          * the 1st record in a leaf, it will require the adjustment of
2244          * e_clusters on the last record of the path directly to it's
2245          * left. For now, just catch that case and fool the layers
2246          * above us. This works just fine for tree_depth == 0, which
2247          * is why we allow that above.
2248          */
2249         if (insert->ins_contig == CONTIG_LEFT &&
2250             insert->ins_contig_index == 0)
2251                 insert->ins_contig = CONTIG_NONE;
2252
2253         /*
2254          * Ok, so we can simply compare against last_eb to figure out
2255          * whether the path doesn't exist. This will only happen in
2256          * the case that we're doing a tail append, so maybe we can
2257          * take advantage of that information somehow.
2258          */
2259         if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
2260                 /*
2261                  * Ok, ocfs2_find_path() returned us the rightmost
2262                  * tree path. This might be an appending insert. There are
2263                  * two cases:
2264                  *    1) We're doing a true append at the tail:
2265                  *      -This might even be off the end of the leaf
2266                  *    2) We're "appending" by rotating in the tail
2267                  */
2268                 ocfs2_figure_appending_type(insert, el, insert_rec);
2269         }
2270
2271 out:
2272         ocfs2_free_path(path);
2273
2274         if (ret == 0)
2275                 *last_eb_bh = bh;
2276         else
2277                 brelse(bh);
2278         return ret;
2279 }
2280
2281 /*
2282  * Insert an extent into an inode btree.
2283  *
2284  * The caller needs to update fe->i_clusters
2285  */
2286 int ocfs2_insert_extent(struct ocfs2_super *osb,
2287                         handle_t *handle,
2288                         struct inode *inode,
2289                         struct buffer_head *fe_bh,
2290                         u32 cpos,
2291                         u64 start_blk,
2292                         u32 new_clusters,
2293                         struct ocfs2_alloc_context *meta_ac)
2294 {
2295         int status, shift;
2296         struct buffer_head *last_eb_bh = NULL;
2297         struct buffer_head *bh = NULL;
2298         struct ocfs2_insert_type insert = {0, };
2299         struct ocfs2_extent_rec rec;
2300
2301         mlog(0, "add %u clusters at position %u to inode %llu\n",
2302              new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
2303
2304         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
2305                         (OCFS2_I(inode)->ip_clusters != cpos),
2306                         "Device %s, asking for sparse allocation: inode %llu, "
2307                         "cpos %u, clusters %u\n",
2308                         osb->dev_str,
2309                         (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
2310                         OCFS2_I(inode)->ip_clusters);
2311
2312         rec.e_cpos = cpu_to_le32(cpos);
2313         rec.e_blkno = cpu_to_le64(start_blk);
2314         rec.e_clusters = cpu_to_le32(new_clusters);
2315
2316         status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
2317                                           &insert);
2318         if (status < 0) {
2319                 mlog_errno(status);
2320                 goto bail;
2321         }
2322
2323         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
2324              "Insert.contig_index: %d, Insert.free_records: %d, "
2325              "Insert.tree_depth: %d\n",
2326              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
2327              insert.ins_free_records, insert.ins_tree_depth);
2328
2329         /*
2330          * Avoid growing the tree unless we're out of records and the
2331          * insert type requres one.
2332          */
2333         if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
2334                 goto out_add;
2335
2336         shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
2337         if (shift < 0) {
2338                 status = shift;
2339                 mlog_errno(status);
2340                 goto bail;
2341         }
2342
2343         /* We traveled all the way to the bottom of the allocation tree
2344          * and didn't find room for any more extents - we need to add
2345          * another tree level */
2346         if (shift) {
2347                 BUG_ON(bh);
2348                 mlog(0, "need to shift tree depth "
2349                      "(current = %d)\n", insert.ins_tree_depth);
2350
2351                 /* ocfs2_shift_tree_depth will return us a buffer with
2352                  * the new extent block (so we can pass that to
2353                  * ocfs2_add_branch). */
2354                 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
2355                                                 meta_ac, &bh);
2356                 if (status < 0) {
2357                         mlog_errno(status);
2358                         goto bail;
2359                 }
2360                 insert.ins_tree_depth++;
2361                 /* Special case: we have room now if we shifted from
2362                  * tree_depth 0 */
2363                 if (insert.ins_tree_depth == 1)
2364                         goto out_add;
2365         }
2366
2367         /* call ocfs2_add_branch to add the final part of the tree with
2368          * the new data. */
2369         mlog(0, "add branch. bh = %p\n", bh);
2370         status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
2371                                   meta_ac);
2372         if (status < 0) {
2373                 mlog_errno(status);
2374                 goto bail;
2375         }
2376
2377 out_add:
2378         /* Finally, we can add clusters. This might rotate the tree for us. */
2379         status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
2380         if (status < 0)
2381                 mlog_errno(status);
2382
2383 bail:
2384         if (bh)
2385                 brelse(bh);
2386
2387         if (last_eb_bh)
2388                 brelse(last_eb_bh);
2389
2390         mlog_exit(status);
2391         return status;
2392 }
2393
2394 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
2395 {
2396         struct buffer_head *tl_bh = osb->osb_tl_bh;
2397         struct ocfs2_dinode *di;
2398         struct ocfs2_truncate_log *tl;
2399
2400         di = (struct ocfs2_dinode *) tl_bh->b_data;
2401         tl = &di->id2.i_dealloc;
2402
2403         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
2404                         "slot %d, invalid truncate log parameters: used = "
2405                         "%u, count = %u\n", osb->slot_num,
2406                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
2407         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
2408 }
2409
2410 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
2411                                            unsigned int new_start)
2412 {
2413         unsigned int tail_index;
2414         unsigned int current_tail;
2415
2416         /* No records, nothing to coalesce */
2417         if (!le16_to_cpu(tl->tl_used))
2418                 return 0;
2419
2420         tail_index = le16_to_cpu(tl->tl_used) - 1;
2421         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
2422         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
2423
2424         return current_tail == new_start;
2425 }
2426
2427 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
2428                                      handle_t *handle,
2429                                      u64 start_blk,
2430                                      unsigned int num_clusters)
2431 {
2432         int status, index;
2433         unsigned int start_cluster, tl_count;
2434         struct inode *tl_inode = osb->osb_tl_inode;
2435         struct buffer_head *tl_bh = osb->osb_tl_bh;
2436         struct ocfs2_dinode *di;
2437         struct ocfs2_truncate_log *tl;
2438
2439         mlog_entry("start_blk = %llu, num_clusters = %u\n",
2440                    (unsigned long long)start_blk, num_clusters);
2441
2442         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2443
2444         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
2445
2446         di = (struct ocfs2_dinode *) tl_bh->b_data;
2447         tl = &di->id2.i_dealloc;
2448         if (!OCFS2_IS_VALID_DINODE(di)) {
2449                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2450                 status = -EIO;
2451                 goto bail;
2452         }
2453
2454         tl_count = le16_to_cpu(tl->tl_count);
2455         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
2456                         tl_count == 0,
2457                         "Truncate record count on #%llu invalid "
2458                         "wanted %u, actual %u\n",
2459                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
2460                         ocfs2_truncate_recs_per_inode(osb->sb),
2461                         le16_to_cpu(tl->tl_count));
2462
2463         /* Caller should have known to flush before calling us. */
2464         index = le16_to_cpu(tl->tl_used);
2465         if (index >= tl_count) {
2466                 status = -ENOSPC;
2467                 mlog_errno(status);
2468                 goto bail;
2469         }
2470
2471         status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2472                                       OCFS2_JOURNAL_ACCESS_WRITE);
2473         if (status < 0) {
2474                 mlog_errno(status);
2475                 goto bail;
2476         }
2477
2478         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
2479              "%llu (index = %d)\n", num_clusters, start_cluster,
2480              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
2481
2482         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
2483                 /*
2484                  * Move index back to the record we are coalescing with.
2485                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
2486                  */
2487                 index--;
2488
2489                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
2490                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
2491                      index, le32_to_cpu(tl->tl_recs[index].t_start),
2492                      num_clusters);
2493         } else {
2494                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
2495                 tl->tl_used = cpu_to_le16(index + 1);
2496         }
2497         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
2498
2499         status = ocfs2_journal_dirty(handle, tl_bh);
2500         if (status < 0) {
2501                 mlog_errno(status);
2502                 goto bail;
2503         }
2504
2505 bail:
2506         mlog_exit(status);
2507         return status;
2508 }
2509
2510 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
2511                                          handle_t *handle,
2512                                          struct inode *data_alloc_inode,
2513                                          struct buffer_head *data_alloc_bh)
2514 {
2515         int status = 0;
2516         int i;
2517         unsigned int num_clusters;
2518         u64 start_blk;
2519         struct ocfs2_truncate_rec rec;
2520         struct ocfs2_dinode *di;
2521         struct ocfs2_truncate_log *tl;
2522         struct inode *tl_inode = osb->osb_tl_inode;
2523         struct buffer_head *tl_bh = osb->osb_tl_bh;
2524
2525         mlog_entry_void();
2526
2527         di = (struct ocfs2_dinode *) tl_bh->b_data;
2528         tl = &di->id2.i_dealloc;
2529         i = le16_to_cpu(tl->tl_used) - 1;
2530         while (i >= 0) {
2531                 /* Caller has given us at least enough credits to
2532                  * update the truncate log dinode */
2533                 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
2534                                               OCFS2_JOURNAL_ACCESS_WRITE);
2535                 if (status < 0) {
2536                         mlog_errno(status);
2537                         goto bail;
2538                 }
2539
2540                 tl->tl_used = cpu_to_le16(i);
2541
2542                 status = ocfs2_journal_dirty(handle, tl_bh);
2543                 if (status < 0) {
2544                         mlog_errno(status);
2545                         goto bail;
2546                 }
2547
2548                 /* TODO: Perhaps we can calculate the bulk of the
2549                  * credits up front rather than extending like
2550                  * this. */
2551                 status = ocfs2_extend_trans(handle,
2552                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
2553                 if (status < 0) {
2554                         mlog_errno(status);
2555                         goto bail;
2556                 }
2557
2558                 rec = tl->tl_recs[i];
2559                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
2560                                                     le32_to_cpu(rec.t_start));
2561                 num_clusters = le32_to_cpu(rec.t_clusters);
2562
2563                 /* if start_blk is not set, we ignore the record as
2564                  * invalid. */
2565                 if (start_blk) {
2566                         mlog(0, "free record %d, start = %u, clusters = %u\n",
2567                              i, le32_to_cpu(rec.t_start), num_clusters);
2568
2569                         status = ocfs2_free_clusters(handle, data_alloc_inode,
2570                                                      data_alloc_bh, start_blk,
2571                                                      num_clusters);
2572                         if (status < 0) {
2573                                 mlog_errno(status);
2574                                 goto bail;
2575                         }
2576                 }
2577                 i--;
2578         }
2579
2580 bail:
2581         mlog_exit(status);
2582         return status;
2583 }
2584
2585 /* Expects you to already be holding tl_inode->i_mutex */
2586 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2587 {
2588         int status;
2589         unsigned int num_to_flush;
2590         handle_t *handle;
2591         struct inode *tl_inode = osb->osb_tl_inode;
2592         struct inode *data_alloc_inode = NULL;
2593         struct buffer_head *tl_bh = osb->osb_tl_bh;
2594         struct buffer_head *data_alloc_bh = NULL;
2595         struct ocfs2_dinode *di;
2596         struct ocfs2_truncate_log *tl;
2597
2598         mlog_entry_void();
2599
2600         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
2601
2602         di = (struct ocfs2_dinode *) tl_bh->b_data;
2603         tl = &di->id2.i_dealloc;
2604         if (!OCFS2_IS_VALID_DINODE(di)) {
2605                 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
2606                 status = -EIO;
2607                 goto out;
2608         }
2609
2610         num_to_flush = le16_to_cpu(tl->tl_used);
2611         mlog(0, "Flush %u records from truncate log #%llu\n",
2612              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
2613         if (!num_to_flush) {
2614                 status = 0;
2615                 goto out;
2616         }
2617
2618         data_alloc_inode = ocfs2_get_system_file_inode(osb,
2619                                                        GLOBAL_BITMAP_SYSTEM_INODE,
2620                                                        OCFS2_INVALID_SLOT);
2621         if (!data_alloc_inode) {
2622                 status = -EINVAL;
2623                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
2624                 goto out;
2625         }
2626
2627         mutex_lock(&data_alloc_inode->i_mutex);
2628
2629         status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1);
2630         if (status < 0) {
2631                 mlog_errno(status);
2632                 goto out_mutex;
2633         }
2634
2635         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2636         if (IS_ERR(handle)) {
2637                 status = PTR_ERR(handle);
2638                 mlog_errno(status);
2639                 goto out_unlock;
2640         }
2641
2642         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
2643                                                data_alloc_bh);
2644         if (status < 0)
2645                 mlog_errno(status);
2646
2647         ocfs2_commit_trans(osb, handle);
2648
2649 out_unlock:
2650         brelse(data_alloc_bh);
2651         ocfs2_meta_unlock(data_alloc_inode, 1);
2652
2653 out_mutex:
2654         mutex_unlock(&data_alloc_inode->i_mutex);
2655         iput(data_alloc_inode);
2656
2657 out:
2658         mlog_exit(status);
2659         return status;
2660 }
2661
2662 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
2663 {
2664         int status;
2665         struct inode *tl_inode = osb->osb_tl_inode;
2666
2667         mutex_lock(&tl_inode->i_mutex);
2668         status = __ocfs2_flush_truncate_log(osb);
2669         mutex_unlock(&tl_inode->i_mutex);
2670
2671         return status;
2672 }
2673
2674 static void ocfs2_truncate_log_worker(struct work_struct *work)
2675 {
2676         int status;
2677         struct ocfs2_super *osb =
2678                 container_of(work, struct ocfs2_super,
2679                              osb_truncate_log_wq.work);
2680
2681         mlog_entry_void();
2682
2683         status = ocfs2_flush_truncate_log(osb);
2684         if (status < 0)
2685                 mlog_errno(status);
2686
2687         mlog_exit(status);
2688 }
2689
2690 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
2691 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
2692                                        int cancel)
2693 {
2694         if (osb->osb_tl_inode) {
2695                 /* We want to push off log flushes while truncates are
2696                  * still running. */
2697                 if (cancel)
2698                         cancel_delayed_work(&osb->osb_truncate_log_wq);
2699
2700                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
2701                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
2702         }
2703 }
2704
2705 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
2706                                        int slot_num,
2707                                        struct inode **tl_inode,
2708                                        struct buffer_head **tl_bh)
2709 {
2710         int status;
2711         struct inode *inode = NULL;
2712         struct buffer_head *bh = NULL;
2713
2714         inode = ocfs2_get_system_file_inode(osb,
2715                                            TRUNCATE_LOG_SYSTEM_INODE,
2716                                            slot_num);
2717         if (!inode) {
2718                 status = -EINVAL;
2719                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
2720                 goto bail;
2721         }
2722
2723         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
2724                                   OCFS2_BH_CACHED, inode);
2725         if (status < 0) {
2726                 iput(inode);
2727                 mlog_errno(status);
2728                 goto bail;
2729         }
2730
2731         *tl_inode = inode;
2732         *tl_bh    = bh;
2733 bail:
2734         mlog_exit(status);
2735         return status;
2736 }
2737
2738 /* called during the 1st stage of node recovery. we stamp a clean
2739  * truncate log and pass back a copy for processing later. if the
2740  * truncate log does not require processing, a *tl_copy is set to
2741  * NULL. */
2742 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
2743                                       int slot_num,
2744                                       struct ocfs2_dinode **tl_copy)
2745 {
2746         int status;
2747         struct inode *tl_inode = NULL;
2748         struct buffer_head *tl_bh = NULL;
2749         struct ocfs2_dinode *di;
2750         struct ocfs2_truncate_log *tl;
2751
2752         *tl_copy = NULL;
2753
2754         mlog(0, "recover truncate log from slot %d\n", slot_num);
2755
2756         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
2757         if (status < 0) {
2758                 mlog_errno(status);
2759                 goto bail;
2760         }
2761
2762         di = (struct ocfs2_dinode *) tl_bh->b_data;
2763         tl = &di->id2.i_dealloc;
2764         if (!OCFS2_IS_VALID_DINODE(di)) {
2765                 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
2766                 status = -EIO;
2767                 goto bail;
2768         }
2769
2770         if (le16_to_cpu(tl->tl_used)) {
2771                 mlog(0, "We'll have %u logs to recover\n",
2772                      le16_to_cpu(tl->tl_used));
2773
2774                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
2775                 if (!(*tl_copy)) {
2776                         status = -ENOMEM;
2777                         mlog_errno(status);
2778                         goto bail;
2779                 }
2780
2781                 /* Assuming the write-out below goes well, this copy
2782                  * will be passed back to recovery for processing. */
2783                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
2784
2785                 /* All we need to do to clear the truncate log is set
2786                  * tl_used. */
2787                 tl->tl_used = 0;
2788
2789                 status = ocfs2_write_block(osb, tl_bh, tl_inode);
2790                 if (status < 0) {
2791                         mlog_errno(status);
2792                         goto bail;
2793                 }
2794         }
2795
2796 bail:
2797         if (tl_inode)
2798                 iput(tl_inode);
2799         if (tl_bh)
2800                 brelse(tl_bh);
2801
2802         if (status < 0 && (*tl_copy)) {
2803                 kfree(*tl_copy);
2804                 *tl_copy = NULL;
2805         }
2806
2807         mlog_exit(status);
2808         return status;
2809 }
2810
2811 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
2812                                          struct ocfs2_dinode *tl_copy)
2813 {
2814         int status = 0;
2815         int i;
2816         unsigned int clusters, num_recs, start_cluster;
2817         u64 start_blk;
2818         handle_t *handle;
2819         struct inode *tl_inode = osb->osb_tl_inode;
2820         struct ocfs2_truncate_log *tl;
2821
2822         mlog_entry_void();
2823
2824         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
2825                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
2826                 return -EINVAL;
2827         }
2828
2829         tl = &tl_copy->id2.i_dealloc;
2830         num_recs = le16_to_cpu(tl->tl_used);
2831         mlog(0, "cleanup %u records from %llu\n", num_recs,
2832              (unsigned long long)tl_copy->i_blkno);
2833
2834         mutex_lock(&tl_inode->i_mutex);
2835         for(i = 0; i < num_recs; i++) {
2836                 if (ocfs2_truncate_log_needs_flush(osb)) {
2837                         status = __ocfs2_flush_truncate_log(osb);
2838                         if (status < 0) {
2839                                 mlog_errno(status);
2840                                 goto bail_up;
2841                         }
2842                 }
2843
2844                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
2845                 if (IS_ERR(handle)) {
2846                         status = PTR_ERR(handle);
2847                         mlog_errno(status);
2848                         goto bail_up;
2849                 }
2850
2851                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
2852                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
2853                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
2854
2855                 status = ocfs2_truncate_log_append(osb, handle,
2856                                                    start_blk, clusters);
2857                 ocfs2_commit_trans(osb, handle);
2858                 if (status < 0) {
2859                         mlog_errno(status);
2860                         goto bail_up;
2861                 }
2862         }
2863
2864 bail_up:
2865         mutex_unlock(&tl_inode->i_mutex);
2866
2867         mlog_exit(status);
2868         return status;
2869 }
2870
2871 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
2872 {
2873         int status;
2874         struct inode *tl_inode = osb->osb_tl_inode;
2875
2876         mlog_entry_void();
2877
2878         if (tl_inode) {
2879                 cancel_delayed_work(&osb->osb_truncate_log_wq);
2880                 flush_workqueue(ocfs2_wq);
2881
2882                 status = ocfs2_flush_truncate_log(osb);
2883                 if (status < 0)
2884                         mlog_errno(status);
2885
2886                 brelse(osb->osb_tl_bh);
2887                 iput(osb->osb_tl_inode);
2888         }
2889
2890         mlog_exit_void();
2891 }
2892
2893 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
2894 {
2895         int status;
2896         struct inode *tl_inode = NULL;
2897         struct buffer_head *tl_bh = NULL;
2898
2899         mlog_entry_void();
2900
2901         status = ocfs2_get_truncate_log_info(osb,
2902                                              osb->slot_num,
2903                                              &tl_inode,
2904                                              &tl_bh);
2905         if (status < 0)
2906                 mlog_errno(status);
2907
2908         /* ocfs2_truncate_log_shutdown keys on the existence of
2909          * osb->osb_tl_inode so we don't set any of the osb variables
2910          * until we're sure all is well. */
2911         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
2912                           ocfs2_truncate_log_worker);
2913         osb->osb_tl_bh    = tl_bh;
2914         osb->osb_tl_inode = tl_inode;
2915
2916         mlog_exit(status);
2917         return status;
2918 }
2919
2920 /* This function will figure out whether the currently last extent
2921  * block will be deleted, and if it will, what the new last extent
2922  * block will be so we can update his h_next_leaf_blk field, as well
2923  * as the dinodes i_last_eb_blk */
2924 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
2925                                        u32 new_i_clusters,
2926                                        struct ocfs2_path *path,
2927                                        struct buffer_head **new_last_eb)
2928 {
2929         int ret = 0;
2930         u32 cpos;
2931         struct ocfs2_extent_block *eb;
2932         struct ocfs2_extent_list *el;
2933         struct buffer_head *bh = NULL;
2934
2935         *new_last_eb = NULL;
2936
2937         /* we have no tree, so of course, no last_eb. */
2938         if (!path->p_tree_depth)
2939                 goto out;
2940
2941         /* trunc to zero special case - this makes tree_depth = 0
2942          * regardless of what it is.  */
2943         if (!new_i_clusters)
2944                 goto out;
2945
2946         el = path_leaf_el(path);
2947         BUG_ON(!el->l_next_free_rec);
2948
2949         /* Make sure that this guy will actually be empty after we
2950          * clear away the data. */
2951         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
2952                 if (le16_to_cpu(el->l_next_free_rec) > 1 &&
2953                     le32_to_cpu(el->l_recs[1].e_cpos) < new_i_clusters)
2954                         goto out;
2955         } else if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
2956                 goto out;
2957
2958         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2959         if (ret) {
2960                 mlog_errno(ret);
2961                 goto out;
2962         }
2963
2964         ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
2965         if (ret) {
2966                 mlog_errno(ret);
2967                 goto out;
2968         }
2969
2970         eb = (struct ocfs2_extent_block *) bh->b_data;
2971         el = &eb->h_list;
2972         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
2973                 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
2974                 ret = -EROFS;
2975                 goto out;
2976         }
2977
2978         *new_last_eb = bh;
2979         get_bh(*new_last_eb);
2980         mlog(0, "returning block %llu, (cpos: %u)\n",
2981              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
2982 out:
2983         brelse(bh);
2984
2985         return ret;
2986 }
2987
2988 static int ocfs2_do_truncate(struct ocfs2_super *osb,
2989                              unsigned int clusters_to_del,
2990                              struct inode *inode,
2991                              struct buffer_head *fe_bh,
2992                              handle_t *handle,
2993                              struct ocfs2_truncate_context *tc,
2994                              struct ocfs2_path *path)
2995 {
2996         int status, i, index;
2997         struct ocfs2_dinode *fe;
2998         struct ocfs2_extent_block *eb;
2999         struct ocfs2_extent_block *last_eb = NULL;
3000         struct ocfs2_extent_list *el;
3001         struct buffer_head *eb_bh = NULL;
3002         struct buffer_head *last_eb_bh = NULL;
3003         u64 delete_blk = 0;
3004
3005         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3006
3007         status = ocfs2_find_new_last_ext_blk(inode,
3008                                              le32_to_cpu(fe->i_clusters) -
3009                                              clusters_to_del,
3010                                              path, &last_eb_bh);
3011         if (status < 0) {
3012                 mlog_errno(status);
3013                 goto bail;
3014         }
3015
3016         /*
3017          * Each component will be touched, so we might as well journal
3018          * here to avoid having to handle errors later.
3019          */
3020         for (i = 0; i < path_num_items(path); i++) {
3021                 status = ocfs2_journal_access(handle, inode,
3022                                               path->p_node[i].bh,
3023                                               OCFS2_JOURNAL_ACCESS_WRITE);
3024                 if (status < 0) {
3025                         mlog_errno(status);
3026                         goto bail;
3027                 }
3028         }
3029
3030         if (last_eb_bh) {
3031                 status = ocfs2_journal_access(handle, inode, last_eb_bh,
3032                                               OCFS2_JOURNAL_ACCESS_WRITE);
3033                 if (status < 0) {
3034                         mlog_errno(status);
3035                         goto bail;
3036                 }
3037
3038                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3039         }
3040
3041         el = &(fe->id2.i_list);
3042
3043         /*
3044          * Lower levels depend on this never happening, but it's best
3045          * to check it up here before changing the tree.
3046          */
3047         if (el->l_tree_depth && ocfs2_is_empty_extent(&el->l_recs[0])) {
3048                 ocfs2_error(inode->i_sb,
3049                             "Inode %lu has an empty extent record, depth %u\n",
3050                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
3051                 goto bail;
3052         }
3053
3054         spin_lock(&OCFS2_I(inode)->ip_lock);
3055         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
3056                                       clusters_to_del;
3057         spin_unlock(&OCFS2_I(inode)->ip_lock);
3058         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
3059
3060         i = le16_to_cpu(el->l_next_free_rec) - 1;
3061
3062         BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
3063         le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
3064         /* tree depth zero, we can just delete the clusters, otherwise
3065          * we need to record the offset of the next level extent block
3066          * as we may overwrite it. */
3067         if (!el->l_tree_depth) {
3068                 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
3069                         + ocfs2_clusters_to_blocks(osb->sb,
3070                                         le32_to_cpu(el->l_recs[i].e_clusters));
3071
3072                 if (!el->l_recs[i].e_clusters) {
3073                         /* if we deleted the whole extent record, then clear
3074                          * out the other fields and update the extent
3075                          * list.
3076                          */
3077                         el->l_recs[i].e_cpos = 0;
3078                         el->l_recs[i].e_blkno = 0;
3079                         BUG_ON(!el->l_next_free_rec);
3080                         le16_add_cpu(&el->l_next_free_rec, -1);
3081
3082                         /*
3083                          * The leftmost record might be an empty extent -
3084                          * delete it here too.
3085                          */
3086                         if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3087                                 el->l_recs[0].e_cpos = 0;
3088                                 el->l_recs[0].e_blkno = 0;
3089                                 el->l_next_free_rec = 0;
3090                         }
3091                 }
3092         }
3093
3094         if (le32_to_cpu(fe->i_clusters) == 0) {
3095                 /* trunc to zero is a special case. */
3096                 el->l_tree_depth = 0;
3097                 fe->i_last_eb_blk = 0;
3098         } else if (last_eb)
3099                 fe->i_last_eb_blk = last_eb->h_blkno;
3100
3101         status = ocfs2_journal_dirty(handle, fe_bh);
3102         if (status < 0) {
3103                 mlog_errno(status);
3104                 goto bail;
3105         }
3106
3107         if (last_eb) {
3108                 /* If there will be a new last extent block, then by
3109                  * definition, there cannot be any leaves to the right of
3110                  * him. */
3111                 last_eb->h_next_leaf_blk = 0;
3112                 status = ocfs2_journal_dirty(handle, last_eb_bh);
3113                 if (status < 0) {
3114                         mlog_errno(status);
3115                         goto bail;
3116                 }
3117         }
3118
3119         index = 1;
3120         /* if our tree depth > 0, update all the tree blocks below us. */
3121         while (index <= path->p_tree_depth) {
3122                 eb_bh = path->p_node[index].bh;
3123                 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
3124                 el = path->p_node[index].el;
3125
3126                 mlog(0, "traveling tree (index = %d, extent block: %llu)\n",
3127                      index,  (unsigned long long)eb_bh->b_blocknr);
3128
3129                 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
3130                 if (index !=
3131                     (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
3132                         ocfs2_error(inode->i_sb,
3133                                     "Inode %lu has invalid ext. block %llu\n",
3134                                     inode->i_ino,
3135                                     (unsigned long long)eb_bh->b_blocknr);
3136                         goto bail;
3137                 }
3138
3139                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3140
3141                 mlog(0, "extent block %llu, before: record %d: "
3142                      "(%u, %u, %llu), next = %u\n",
3143                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
3144                      le32_to_cpu(el->l_recs[i].e_cpos),
3145                      le32_to_cpu(el->l_recs[i].e_clusters),
3146                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
3147                      le16_to_cpu(el->l_next_free_rec));
3148
3149                 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
3150                 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
3151
3152                 /* bottom-most block requires us to delete data.*/
3153                 if (!el->l_tree_depth)
3154                         delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
3155                                 + ocfs2_clusters_to_blocks(osb->sb,
3156                                         le32_to_cpu(el->l_recs[i].e_clusters));
3157                 if (!el->l_recs[i].e_clusters) {
3158                         el->l_recs[i].e_cpos = 0;
3159                         el->l_recs[i].e_blkno = 0;
3160                         BUG_ON(!el->l_next_free_rec);
3161                         le16_add_cpu(&el->l_next_free_rec, -1);
3162                 }
3163                 if (i == 1 && ocfs2_is_empty_extent(&el->l_recs[0])) {
3164                         el->l_recs[0].e_cpos = 0;
3165                         el->l_recs[0].e_blkno = 0;
3166                         el->l_next_free_rec = 0;
3167                 }
3168
3169                 mlog(0, "extent block %llu, after: record %d: "
3170                      "(%u, %u, %llu), next = %u\n",
3171                      (unsigned long long)le64_to_cpu(eb->h_blkno), i,
3172                      le32_to_cpu(el->l_recs[i].e_cpos),
3173                      le32_to_cpu(el->l_recs[i].e_clusters),
3174                      (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
3175                      le16_to_cpu(el->l_next_free_rec));
3176
3177                 status = ocfs2_journal_dirty(handle, eb_bh);
3178                 if (status < 0) {
3179                         mlog_errno(status);
3180                         goto bail;
3181                 }
3182
3183                 if (!el->l_next_free_rec) {
3184                         mlog(0, "deleting this extent block.\n");
3185
3186                         ocfs2_remove_from_cache(inode, eb_bh);
3187
3188                         BUG_ON(el->l_recs[0].e_clusters);
3189                         BUG_ON(el->l_recs[0].e_cpos);
3190                         BUG_ON(el->l_recs[0].e_blkno);
3191
3192                         /*
3193                          * We need to remove this extent block from
3194                          * the list above it.
3195                          *
3196                          * Since we've passed it already in this loop,
3197                          * no need to worry about journaling.
3198                          */
3199                         el = path->p_node[index - 1].el;
3200                         i = le16_to_cpu(el->l_next_free_rec) - 1;
3201                         BUG_ON(i < 0);
3202                         el->l_recs[i].e_cpos = 0;
3203                         el->l_recs[i].e_clusters = 0;
3204                         el->l_recs[i].e_blkno = 0;
3205                         le16_add_cpu(&el->l_next_free_rec, -1);
3206
3207                         if (eb->h_suballoc_slot == 0) {
3208                                 /*
3209                                  * This code only understands how to
3210                                  * lock the suballocator in slot 0,
3211                                  * which is fine because allocation is
3212                                  * only ever done out of that
3213                                  * suballocator too. A future version
3214                                  * might change that however, so avoid
3215                                  * a free if we don't know how to
3216                                  * handle it. This way an fs incompat
3217                                  * bit will not be necessary.
3218                                  */
3219                                 status = ocfs2_free_extent_block(handle,
3220                                                                  tc->tc_ext_alloc_inode,
3221                                                                  tc->tc_ext_alloc_bh,
3222                                                                  eb);
3223                                 if (status < 0) {
3224                                         mlog_errno(status);
3225                                         goto bail;
3226                                 }
3227                         }
3228                 }
3229                 index++;
3230         }
3231
3232         BUG_ON(!delete_blk);
3233         status = ocfs2_truncate_log_append(osb, handle, delete_blk,
3234                                            clusters_to_del);
3235         if (status < 0) {
3236                 mlog_errno(status);
3237                 goto bail;
3238         }
3239         status = 0;
3240 bail:
3241
3242         mlog_exit(status);
3243         return status;
3244 }
3245
3246 /*
3247  * It is expected, that by the time you call this function,
3248  * inode->i_size and fe->i_size have been adjusted.
3249  *
3250  * WARNING: This will kfree the truncate context
3251  */
3252 int ocfs2_commit_truncate(struct ocfs2_super *osb,
3253                           struct inode *inode,
3254                           struct buffer_head *fe_bh,
3255                           struct ocfs2_truncate_context *tc)
3256 {
3257         int status, i, credits, tl_sem = 0;
3258         u32 clusters_to_del, new_highest_cpos, range;
3259         struct ocfs2_extent_list *el;
3260         handle_t *handle = NULL;
3261         struct inode *tl_inode = osb->osb_tl_inode;
3262         struct ocfs2_path *path = NULL;
3263
3264         mlog_entry_void();
3265
3266         down_write(&OCFS2_I(inode)->ip_alloc_sem);
3267
3268         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
3269                                                      i_size_read(inode));
3270
3271         path = ocfs2_new_inode_path(fe_bh);
3272         if (!path) {
3273                 status = -ENOMEM;
3274                 mlog_errno(status);
3275                 goto bail;
3276         }
3277 start:
3278         /*
3279          * Truncate always works against the rightmost tree branch.
3280          */
3281         status = ocfs2_find_path(inode, path, UINT_MAX);
3282         if (status) {
3283                 mlog_errno(status);
3284                 goto bail;
3285         }
3286
3287         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
3288              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
3289
3290         /*
3291          * By now, el will point to the extent list on the bottom most
3292          * portion of this tree. Only the tail record is considered in
3293          * each pass.
3294          *
3295          * We handle the following cases, in order:
3296          * - empty extent: delete the remaining branch
3297          * - remove the entire record
3298          * - remove a partial record
3299          * - no record needs to be removed (truncate has completed)
3300          */
3301         el = path_leaf_el(path);
3302         i = le16_to_cpu(el->l_next_free_rec) - 1;
3303         range = le32_to_cpu(el->l_recs[i].e_cpos) +
3304                 le32_to_cpu(el->l_recs[i].e_clusters);
3305         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
3306                 clusters_to_del = 0;
3307         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
3308                 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
3309         } else if (range > new_highest_cpos) {
3310                 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
3311                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
3312                                   new_highest_cpos;
3313         } else {
3314                 status = 0;
3315                 goto bail;
3316         }
3317
3318         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
3319              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
3320
3321         BUG_ON(clusters_to_del == 0);
3322
3323         mutex_lock(&tl_inode->i_mutex);
3324         tl_sem = 1;
3325         /* ocfs2_truncate_log_needs_flush guarantees us at least one
3326          * record is free for use. If there isn't any, we flush to get
3327          * an empty truncate log.  */
3328         if (ocfs2_truncate_log_needs_flush(osb)) {
3329                 status = __ocfs2_flush_truncate_log(osb);
3330                 if (status < 0) {
3331                         mlog_errno(status);
3332                         goto bail;
3333                 }
3334         }
3335
3336         credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
3337                                                 (struct ocfs2_dinode *)fe_bh->b_data,
3338                                                 el);
3339         handle = ocfs2_start_trans(osb, credits);
3340         if (IS_ERR(handle)) {
3341                 status = PTR_ERR(handle);
3342                 handle = NULL;
3343                 mlog_errno(status);
3344                 goto bail;
3345         }
3346
3347         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
3348                                    tc, path);
3349         if (status < 0) {
3350                 mlog_errno(status);
3351                 goto bail;
3352         }
3353
3354         mutex_unlock(&tl_inode->i_mutex);
3355         tl_sem = 0;
3356
3357         ocfs2_commit_trans(osb, handle);
3358         handle = NULL;
3359
3360         ocfs2_reinit_path(path, 1);
3361
3362         /*
3363          * Only loop if we still have allocation.
3364          */
3365         if (OCFS2_I(inode)->ip_clusters)
3366                 goto start;
3367 bail:
3368         up_write(&OCFS2_I(inode)->ip_alloc_sem);
3369
3370         ocfs2_schedule_truncate_log_flush(osb, 1);
3371
3372         if (tl_sem)
3373                 mutex_unlock(&tl_inode->i_mutex);
3374
3375         if (handle)
3376                 ocfs2_commit_trans(osb, handle);
3377
3378         ocfs2_free_path(path);
3379
3380         /* This will drop the ext_alloc cluster lock for us */
3381         ocfs2_free_truncate_context(tc);
3382
3383         mlog_exit(status);
3384         return status;
3385 }
3386
3387 /*
3388  * Expects the inode to already be locked. This will figure out which
3389  * inodes need to be locked and will put them on the returned truncate
3390  * context.
3391  */
3392 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
3393                            struct inode *inode,
3394                            struct buffer_head *fe_bh,
3395                            struct ocfs2_truncate_context **tc)
3396 {
3397         int status, metadata_delete, i;
3398         unsigned int new_i_clusters;
3399         struct ocfs2_dinode *fe;
3400         struct ocfs2_extent_block *eb;
3401         struct ocfs2_extent_list *el;
3402         struct buffer_head *last_eb_bh = NULL;
3403         struct inode *ext_alloc_inode = NULL;
3404         struct buffer_head *ext_alloc_bh = NULL;
3405
3406         mlog_entry_void();
3407
3408         *tc = NULL;
3409
3410         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
3411                                                   i_size_read(inode));
3412         fe = (struct ocfs2_dinode *) fe_bh->b_data;
3413
3414         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
3415              "%llu\n", fe->i_clusters, new_i_clusters,
3416              (unsigned long long)fe->i_size);
3417
3418         if (!ocfs2_sparse_alloc(osb) &&
3419             le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
3420                 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
3421                             "%u and size %llu whereas struct inode has "
3422                             "cluster count %u and size %llu which caused an "
3423                             "invalid truncate to %u clusters.",
3424                             (unsigned long long)le64_to_cpu(fe->i_blkno),
3425                             le32_to_cpu(fe->i_clusters),
3426                             (unsigned long long)le64_to_cpu(fe->i_size),
3427                             OCFS2_I(inode)->ip_clusters, i_size_read(inode),
3428                             new_i_clusters);
3429                 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
3430                 status = -EIO;
3431                 goto bail;
3432         }
3433
3434         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
3435         if (!(*tc)) {
3436                 status = -ENOMEM;
3437                 mlog_errno(status);
3438                 goto bail;
3439         }
3440
3441         metadata_delete = 0;
3442         if (fe->id2.i_list.l_tree_depth) {
3443                 /* If we have a tree, then the truncate may result in
3444                  * metadata deletes. Figure this out from the
3445                  * rightmost leaf block.*/
3446                 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
3447                                           &last_eb_bh, OCFS2_BH_CACHED, inode);
3448                 if (status < 0) {
3449                         mlog_errno(status);
3450                         goto bail;
3451                 }
3452                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
3453                 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
3454                         OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
3455
3456                         brelse(last_eb_bh);
3457                         status = -EIO;
3458                         goto bail;
3459                 }
3460                 el = &(eb->h_list);
3461
3462                 i = 0;
3463                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3464                         i = 1;
3465                 /*
3466                  * XXX: Should we check that next_free_rec contains
3467                  * the extent?
3468                  */
3469                 if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
3470                         metadata_delete = 1;
3471         }
3472
3473         (*tc)->tc_last_eb_bh = last_eb_bh;
3474
3475         if (metadata_delete) {
3476                 mlog(0, "Will have to delete metadata for this trunc. "
3477                      "locking allocator.\n");
3478                 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
3479                 if (!ext_alloc_inode) {
3480                         status = -ENOMEM;
3481                         mlog_errno(status);
3482                         goto bail;
3483                 }
3484
3485                 mutex_lock(&ext_alloc_inode->i_mutex);
3486                 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
3487
3488                 status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
3489                 if (status < 0) {
3490                         mlog_errno(status);
3491                         goto bail;
3492                 }
3493                 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
3494                 (*tc)->tc_ext_alloc_locked = 1;
3495         }
3496
3497         status = 0;
3498 bail:
3499         if (status < 0) {
3500                 if (*tc)
3501                         ocfs2_free_truncate_context(*tc);
3502                 *tc = NULL;
3503         }
3504         mlog_exit_void();
3505         return status;
3506 }
3507
3508 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
3509 {
3510         if (tc->tc_ext_alloc_inode) {
3511                 if (tc->tc_ext_alloc_locked)
3512                         ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
3513
3514                 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
3515                 iput(tc->tc_ext_alloc_inode);
3516         }
3517
3518         if (tc->tc_ext_alloc_bh)
3519                 brelse(tc->tc_ext_alloc_bh);
3520
3521         if (tc->tc_last_eb_bh)
3522                 brelse(tc->tc_last_eb_bh);
3523
3524         kfree(tc);
3525 }