1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
6 * Extent allocs and frees
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
31 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
32 #include <cluster/masklog.h>
38 #include "extent_map.h"
41 #include "localalloc.h"
48 #include "buffer_head_io.h"
/*
 * Prototypes for the static helpers of this file, declared up front so
 * that they can be referenced independently of definition order.
 */
50 static int ocfs2_extent_contig(struct inode *inode,
51 struct ocfs2_extent_rec *ext,
54 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
55 struct ocfs2_journal_handle *handle,
58 struct ocfs2_alloc_context *meta_ac,
59 struct buffer_head *bhs[]);
61 static int ocfs2_add_branch(struct ocfs2_super *osb,
62 struct ocfs2_journal_handle *handle,
64 struct buffer_head *fe_bh,
65 struct buffer_head *eb_bh,
66 struct buffer_head *last_eb_bh,
67 struct ocfs2_alloc_context *meta_ac);
69 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
70 struct ocfs2_journal_handle *handle,
72 struct buffer_head *fe_bh,
73 struct ocfs2_alloc_context *meta_ac,
74 struct buffer_head **ret_new_eb_bh);
76 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
77 struct ocfs2_journal_handle *handle,
79 struct buffer_head *fe_bh,
83 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
85 struct buffer_head *fe_bh,
86 struct buffer_head **target_bh);
88 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
90 struct ocfs2_dinode *fe,
91 unsigned int new_i_clusters,
92 struct buffer_head *old_last_eb,
93 struct buffer_head **new_last_eb);
95 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
/*
 * Contiguity test for an extent record: the result is true when blkno
 * is exactly the first block past the end of ext, i.e. ext->e_blkno plus
 * the number of blocks covered by ext->e_clusters. Both fields are
 * converted from their little-endian form before the comparison.
 */
97 static int ocfs2_extent_contig(struct inode *inode,
98 struct ocfs2_extent_rec *ext,
101 return blkno == (le64_to_cpu(ext->e_blkno) +
102 ocfs2_clusters_to_blocks(inode->i_sb,
103 le32_to_cpu(ext->e_clusters)));
107 * How many free extents have we got before we need more meta data?
/*
 * Reports how many extent records are still unused in the extent list
 * el: l_count minus l_next_free_rec.
 *
 * The dinode is validated first (OCFS2_RO_ON_INVALID_DINODE on failure).
 * When fe->i_last_eb_blk is set, that block is read through
 * ocfs2_read_block() with OCFS2_BH_CACHED.
 * NOTE(review): presumably el then refers to that block's list rather
 * than the dinode's - confirm against the complete function.
 */
109 int ocfs2_num_free_extents(struct ocfs2_super *osb,
111 struct ocfs2_dinode *fe)
114 struct ocfs2_extent_list *el;
115 struct ocfs2_extent_block *eb;
116 struct buffer_head *eb_bh = NULL;
120 if (!OCFS2_IS_VALID_DINODE(fe)) {
121 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
126 if (fe->i_last_eb_blk) {
127 retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
128 &eb_bh, OCFS2_BH_CACHED, inode);
133 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
136 el = &fe->id2.i_list;
/* The list being counted must be a leaf list (tree depth 0). */
138 BUG_ON(el->l_tree_depth != 0);
140 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
149 /* expects array to already be allocated
151 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
/*
 * Fills bhs[] with freshly initialised extent blocks until `wanted`
 * entries have been set up.
 *
 * Metadata is claimed through ocfs2_claim_metadata(). Each claimed block
 * is obtained with sb_getblk(), marked up to date for this inode, opened
 * in the journal with OCFS2_JOURNAL_ACCESS_CREATE, zeroed, and stamped
 * with the extent block signature, its block number, the filesystem
 * generation, the suballocator slot and the suballocator bit before
 * being journal-dirtied.
 *
 * The suballocator slot is 0 unless OCFS2_USE_ALL_METADATA_SUBALLOCATORS
 * is defined, in which case osb->slot_num is stored.
 *
 * NOTE(review): the closing loop over `wanted` looks like error cleanup
 * of bhs[]; its body is not visible here - confirm.
 */
154 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
155 struct ocfs2_journal_handle *handle,
158 struct ocfs2_alloc_context *meta_ac,
159 struct buffer_head *bhs[])
161 int count, status, i;
162 u16 suballoc_bit_start;
165 struct ocfs2_extent_block *eb;
170 while (count < wanted) {
171 status = ocfs2_claim_metadata(osb,
183 for(i = count; i < (num_got + count); i++) {
184 bhs[i] = sb_getblk(osb->sb, first_blkno);
185 if (bhs[i] == NULL) {
190 ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
192 status = ocfs2_journal_access(handle, inode, bhs[i],
193 OCFS2_JOURNAL_ACCESS_CREATE);
199 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
200 eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
201 /* Ok, setup the minimal stuff here. */
202 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
203 eb->h_blkno = cpu_to_le64(first_blkno);
204 eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
206 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
207 /* we always use slot zero's suballocator */
208 eb->h_suballoc_slot = 0;
210 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
212 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
214 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
/* The next block of this claim uses the following suballocator bit. */
216 suballoc_bit_start++;
219 /* We'll also be dirtied by the caller, so
220 * this isn't absolutely necessary. */
221 status = ocfs2_journal_dirty(handle, bhs[i]);
234 for(i = 0; i < wanted; i++) {
245 * Add an entire tree branch to our inode. eb_bh is the extent block
246 * to start at, if we don't want to start the branch at the dinode
249 * last_eb_bh is required as we have to update its next_leaf pointer
250 * for the new last extent block.
252 * the new branch will be 'empty' in the sense that every block will
253 * contain a single record with e_clusters == 0.
/*
 * Builds a chain of new extent blocks and links it into the tree.
 *
 * One block is created per tree level: new_blocks is the l_tree_depth of
 * the target list el (BUG_ON if that depth is 0). Block i of the chain
 * gets l_tree_depth = i and a single record (l_next_free_rec = 1) with
 * e_cpos = fe->i_clusters, e_clusters = 0 and e_blkno set to the block
 * initialised just before it (0 for the first one). Index 0 is thus the
 * new leaf and the last index is the top of the new branch.
 *
 * Journal write access to last_eb_bh, fe_bh and eb_bh is reserved before
 * any of them is modified. Then a record pointing at the top of the new
 * branch is appended to el, fe->i_last_eb_blk and the old last block's
 * h_next_leaf_blk are set to the new leaf, and the buffers are dirtied.
 * The new_eb_bhs[] entries are released with brelse() on the way out.
 *
 * Per the in-function comment, el is either the dinode's list or the
 * list of the extent block passed in as eb_bh.
 */
255 static int ocfs2_add_branch(struct ocfs2_super *osb,
256 struct ocfs2_journal_handle *handle,
258 struct buffer_head *fe_bh,
259 struct buffer_head *eb_bh,
260 struct buffer_head *last_eb_bh,
261 struct ocfs2_alloc_context *meta_ac)
263 int status, new_blocks, i;
264 u64 next_blkno, new_last_eb_blk;
265 struct buffer_head *bh;
266 struct buffer_head **new_eb_bhs = NULL;
267 struct ocfs2_dinode *fe;
268 struct ocfs2_extent_block *eb;
269 struct ocfs2_extent_list *eb_el;
270 struct ocfs2_extent_list *el;
276 fe = (struct ocfs2_dinode *) fe_bh->b_data;
279 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
282 el = &fe->id2.i_list;
284 /* we never add a branch to a leaf. */
285 BUG_ON(!el->l_tree_depth);
287 new_blocks = le16_to_cpu(el->l_tree_depth);
289 /* allocate the number of new eb blocks we need */
290 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
298 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
299 meta_ac, new_eb_bhs);
305 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
306 * linked with the rest of the tree.
307 * conversely, new_eb_bhs[0] is the new bottommost leaf.
309 * when we leave the loop, new_last_eb_blk will point to the
310 * newest leaf, and next_blkno will point to the topmost extent
312 next_blkno = new_last_eb_blk = 0;
313 for(i = 0; i < new_blocks; i++) {
315 eb = (struct ocfs2_extent_block *) bh->b_data;
316 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
317 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
323 status = ocfs2_journal_access(handle, inode, bh,
324 OCFS2_JOURNAL_ACCESS_CREATE);
330 eb->h_next_leaf_blk = 0;
331 eb_el->l_tree_depth = cpu_to_le16(i);
332 eb_el->l_next_free_rec = cpu_to_le16(1);
333 eb_el->l_recs[0].e_cpos = fe->i_clusters;
334 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
335 eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
336 if (!eb_el->l_tree_depth)
337 new_last_eb_blk = le64_to_cpu(eb->h_blkno);
339 status = ocfs2_journal_dirty(handle, bh);
345 next_blkno = le64_to_cpu(eb->h_blkno);
348 /* This is a bit hairy. We want to update up to three blocks
349 * here without leaving any of them in an inconsistent state
350 * in case of error. We don't have to worry about
351 * journal_dirty erroring as it won't unless we've aborted the
352 * handle (in which case we would never be here) so reserving
353 * the write with journal_access is all we need to do. */
354 status = ocfs2_journal_access(handle, inode, last_eb_bh,
355 OCFS2_JOURNAL_ACCESS_WRITE);
360 status = ocfs2_journal_access(handle, inode, fe_bh,
361 OCFS2_JOURNAL_ACCESS_WRITE);
367 status = ocfs2_journal_access(handle, inode, eb_bh,
368 OCFS2_JOURNAL_ACCESS_WRITE);
375 /* Link the new branch into the rest of the tree (el will
376 * either be on the fe, or the extent block passed in. */
377 i = le16_to_cpu(el->l_next_free_rec);
378 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
379 el->l_recs[i].e_cpos = fe->i_clusters;
380 el->l_recs[i].e_clusters = 0;
381 le16_add_cpu(&el->l_next_free_rec, 1);
383 /* fe needs a new last extent block pointer, as does the
384 * next_leaf on the previously last-extent-block. */
385 fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
387 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
388 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
390 status = ocfs2_journal_dirty(handle, last_eb_bh);
393 status = ocfs2_journal_dirty(handle, fe_bh);
397 status = ocfs2_journal_dirty(handle, eb_bh);
405 for (i = 0; i < new_blocks; i++)
407 brelse(new_eb_bhs[i]);
416 * adds another level to the allocation tree.
417 * returns back the new extent block so you can add a branch to it
/*
 * Grows the extent tree by one level.
 *
 * A single new extent block is created and the dinode's extent list
 * (tree depth, next free record and every used record) is copied into
 * it. The dinode list is then rewritten so that its depth is one
 * greater and it holds exactly one record: e_cpos 0, e_blkno of the new
 * block, and e_clusters equal to fe->i_clusters. Records from index 1 up
 * to the old l_next_free_rec are zeroed.
 *
 * If the resulting depth is 1, fe->i_last_eb_blk is pointed at the new
 * block. The new buffer is handed back through *ret_new_eb_bh.
 */
420 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
421 struct ocfs2_journal_handle *handle,
423 struct buffer_head *fe_bh,
424 struct ocfs2_alloc_context *meta_ac,
425 struct buffer_head **ret_new_eb_bh)
428 struct buffer_head *new_eb_bh = NULL;
429 struct ocfs2_dinode *fe;
430 struct ocfs2_extent_block *eb;
431 struct ocfs2_extent_list *fe_el;
432 struct ocfs2_extent_list *eb_el;
436 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
443 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
444 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
445 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
451 fe = (struct ocfs2_dinode *) fe_bh->b_data;
452 fe_el = &fe->id2.i_list;
454 status = ocfs2_journal_access(handle, inode, new_eb_bh,
455 OCFS2_JOURNAL_ACCESS_CREATE);
461 /* copy the fe data into the new extent block */
462 eb_el->l_tree_depth = fe_el->l_tree_depth;
463 eb_el->l_next_free_rec = fe_el->l_next_free_rec;
464 for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
465 eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
466 eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
467 eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
470 status = ocfs2_journal_dirty(handle, new_eb_bh);
476 status = ocfs2_journal_access(handle, inode, fe_bh,
477 OCFS2_JOURNAL_ACCESS_WRITE);
/* The dinode list now has a single record covering the whole file. */
484 le16_add_cpu(&fe_el->l_tree_depth, 1);
485 fe_el->l_recs[0].e_cpos = 0;
486 fe_el->l_recs[0].e_blkno = eb->h_blkno;
487 fe_el->l_recs[0].e_clusters = fe->i_clusters;
488 for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
489 fe_el->l_recs[i].e_cpos = 0;
490 fe_el->l_recs[i].e_clusters = 0;
491 fe_el->l_recs[i].e_blkno = 0;
493 fe_el->l_next_free_rec = cpu_to_le16(1);
495 /* If this is our 1st tree depth shift, then last_eb_blk
496 * becomes the allocated extent block */
497 if (fe_el->l_tree_depth == cpu_to_le16(1))
498 fe->i_last_eb_blk = eb->h_blkno;
500 status = ocfs2_journal_dirty(handle, fe_bh);
506 *ret_new_eb_bh = new_eb_bh;
518 * Expects the tree to already have room in the rightmost leaf for the
519 * extent. Updates all the extent blocks (and the dinode) on the way
/*
 * Records new_clusters starting at start_blk in the rightmost leaf and
 * accounts for them along the rightmost path of the tree.
 *
 * Steps visible here:
 *  - reserve journal write access to the dinode;
 *  - if the tree has depth, read the extent block referenced by the
 *    last used record at each level into eb_bhs[], validating each one
 *    and reserving journal write access to it;
 *  - bump e_clusters of the last used record in the dinode list and in
 *    each non-leaf block of eb_bhs[], dirtying those blocks
 *    (NOTE(review): the addend is not visible, presumably new_clusters);
 *  - in the leaf list either extend the last record when it is
 *    contiguous with start_blk, reuse a trailing record whose
 *    e_clusters is 0, or append a new record (BUG_ON if the list is
 *    already full);
 *  - append the record to the extent map, then dirty the dinode and,
 *    when the tree has depth, the leaf block.
 *
 * A list with l_next_free_rec == 0, or a trailing empty record whose
 * e_cpos differs from fe->i_clusters, is reported via ocfs2_error().
 * NOTE(review): ocfs2_extent_map_drop() is presumably only reached when
 * the extent map append fails - confirm.
 */
522 static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
523 struct ocfs2_journal_handle *handle,
525 struct buffer_head *fe_bh,
529 int status, i, num_bhs = 0;
532 struct buffer_head **eb_bhs = NULL;
533 struct ocfs2_dinode *fe;
534 struct ocfs2_extent_block *eb;
535 struct ocfs2_extent_list *el;
539 status = ocfs2_journal_access(handle, inode, fe_bh,
540 OCFS2_JOURNAL_ACCESS_WRITE);
546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
547 el = &fe->id2.i_list;
548 if (el->l_tree_depth) {
549 /* This is another operation where we want to be
550 * careful about our tree updates. An error here means
551 * none of the previous changes we made should roll
552 * forward. As a result, we have to record the buffers
553 * for this part of the tree in an array and reserve a
554 * journal write to them before making any changes. */
555 num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
556 eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
565 while(el->l_tree_depth) {
566 next_free = le16_to_cpu(el->l_next_free_rec);
567 if (next_free == 0) {
568 ocfs2_error(inode->i_sb,
569 "Dinode %llu has a bad extent list",
570 (unsigned long long)OCFS2_I(inode)->ip_blkno);
574 next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
576 BUG_ON(i >= num_bhs);
577 status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
578 OCFS2_BH_CACHED, inode);
583 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
584 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
585 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
591 status = ocfs2_journal_access(handle, inode, eb_bhs[i],
592 OCFS2_JOURNAL_ACCESS_WRITE);
600 /* When we leave this loop, eb_bhs[num_bhs - 1] will
601 * hold the bottom-most leaf extent block. */
603 BUG_ON(el->l_tree_depth);
605 el = &fe->id2.i_list;
606 /* If we have tree depth, then the fe update is
607 * trivial, and we want to switch el out for the
608 * bottom-most leaf in order to update it with the
609 * actual extent data below. */
610 next_free = le16_to_cpu(el->l_next_free_rec);
611 if (next_free == 0) {
612 ocfs2_error(inode->i_sb,
613 "Dinode %llu has a bad extent list",
614 (unsigned long long)OCFS2_I(inode)->ip_blkno);
618 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
620 /* (num_bhs - 1) to avoid the leaf */
621 for(i = 0; i < (num_bhs - 1); i++) {
622 eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
625 /* finally, make our actual change to the
626 * intermediate extent blocks. */
627 next_free = le16_to_cpu(el->l_next_free_rec);
628 le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
631 status = ocfs2_journal_dirty(handle, eb_bhs[i]);
635 BUG_ON(i != (num_bhs - 1));
636 /* note that the leaf block wasn't touched in
638 eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
640 BUG_ON(el->l_tree_depth);
643 /* yay, we can finally add the actual extent now! */
644 i = le16_to_cpu(el->l_next_free_rec) - 1;
645 if (le16_to_cpu(el->l_next_free_rec) &&
646 ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
647 le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
648 } else if (le16_to_cpu(el->l_next_free_rec) &&
649 (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
650 /* having an empty extent at eof is legal. */
651 if (el->l_recs[i].e_cpos != fe->i_clusters) {
652 ocfs2_error(inode->i_sb,
653 "Dinode %llu trailing extent is bad: "
654 "cpos (%u) != number of clusters (%u)",
655 (unsigned long long)OCFS2_I(inode)->ip_blkno,
656 le32_to_cpu(el->l_recs[i].e_cpos),
657 le32_to_cpu(fe->i_clusters));
661 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
662 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
664 /* No contiguous record, or no empty record at eof, so
665 * we add a new one. */
667 BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
668 le16_to_cpu(el->l_count));
669 i = le16_to_cpu(el->l_next_free_rec);
671 el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
672 el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
673 el->l_recs[i].e_cpos = fe->i_clusters;
674 le16_add_cpu(&el->l_next_free_rec, 1);
678 * extent_map errors are not fatal, so they are ignored outside
679 * of flushing the thing.
681 status = ocfs2_extent_map_append(inode, &el->l_recs[i],
685 ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
688 status = ocfs2_journal_dirty(handle, fe_bh);
691 if (fe->id2.i_list.l_tree_depth) {
692 status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
700 for (i = 0; i < num_bhs; i++)
711 * Should only be called when there is no space left in any of the
712 * leaf nodes. What we want to do is find the lowest tree depth
713 * non-leaf extent block with room for new records. There are three
714 * valid results of this search:
716 * 1) a lowest extent block is found, then we pass it back in
717 * *lowest_eb_bh and return '0'
719 * 2) the search fails to find anything, but the dinode has room. We
720 * pass NULL back in *lowest_eb_bh, but still return '0'
722 * 3) the search fails to find anything AND the dinode is full, in
723 * which case we return > 0
725 * return status < 0 indicates an error.
/*
 * Walks down the rightmost path of the extent tree for as long as the
 * current list has l_tree_depth > 1, reading the block referenced by
 * the last used record at each level and validating it as an extent
 * block. The buffer handed back through *target_bh is lowest_bh.
 *
 * A list with l_next_free_rec == 0 is reported through ocfs2_error(),
 * and so is the record condition behind the "has no physical" message.
 *
 * NOTE(review): lowest_bh presumably tracks the deepest block whose
 * list still has l_next_free_rec < l_count; the assignment is not
 * visible here - confirm.
 */
727 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
729 struct buffer_head *fe_bh,
730 struct buffer_head **target_bh)
734 struct ocfs2_dinode *fe;
735 struct ocfs2_extent_block *eb;
736 struct ocfs2_extent_list *el;
737 struct buffer_head *bh = NULL;
738 struct buffer_head *lowest_bh = NULL;
744 fe = (struct ocfs2_dinode *) fe_bh->b_data;
745 el = &fe->id2.i_list;
747 while(le16_to_cpu(el->l_tree_depth) > 1) {
748 if (le16_to_cpu(el->l_next_free_rec) == 0) {
749 ocfs2_error(inode->i_sb, "Dinode %llu has empty "
750 "extent list (next_free_rec == 0)",
751 (unsigned long long)OCFS2_I(inode)->ip_blkno);
755 i = le16_to_cpu(el->l_next_free_rec) - 1;
756 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
758 ocfs2_error(inode->i_sb, "Dinode %llu has extent "
759 "list where extent # %d has no physical "
761 (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
771 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
778 eb = (struct ocfs2_extent_block *) bh->b_data;
779 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
780 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
786 if (le16_to_cpu(el->l_next_free_rec) <
787 le16_to_cpu(el->l_count)) {
795 /* If we didn't find one and the fe doesn't have any room,
798 && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
801 *target_bh = lowest_bh;
/*
 * Entry point for adding new_clusters at start_blk to an inode's extent
 * tree.
 *
 *  - With tree depth, the last extent block (fe->i_last_eb_blk) is read.
 *  - No tree change is needed when the list is empty, has a free record,
 *    ends in a record with e_clusters == 0, or ends in a record that is
 *    contiguous with start_blk.
 *  - Otherwise ocfs2_find_branch_target() looks for a place to attach a
 *    branch; ocfs2_shift_tree_depth() adds a level when no room was
 *    found (see the in-function comment) and ocfs2_add_branch() adds
 *    the new branch.
 *  - Finally ocfs2_do_insert_extent() records the clusters.
 */
810 /* the caller needs to update fe->i_clusters */
811 int ocfs2_insert_extent(struct ocfs2_super *osb,
812 struct ocfs2_journal_handle *handle,
814 struct buffer_head *fe_bh,
817 struct ocfs2_alloc_context *meta_ac)
819 int status, i, shift;
820 struct buffer_head *last_eb_bh = NULL;
821 struct buffer_head *bh = NULL;
822 struct ocfs2_dinode *fe;
823 struct ocfs2_extent_block *eb;
824 struct ocfs2_extent_list *el;
828 mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
829 new_clusters, (unsigned long long)start_blk,
830 (unsigned long long)OCFS2_I(inode)->ip_blkno);
832 fe = (struct ocfs2_dinode *) fe_bh->b_data;
833 el = &fe->id2.i_list;
835 if (el->l_tree_depth) {
836 /* jump to end of tree */
837 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
838 &last_eb_bh, OCFS2_BH_CACHED, inode);
843 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
847 /* Can we allocate without adding/shifting tree bits? */
848 i = le16_to_cpu(el->l_next_free_rec) - 1;
849 if (le16_to_cpu(el->l_next_free_rec) == 0
850 || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
851 || le32_to_cpu(el->l_recs[i].e_clusters) == 0
852 || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
855 mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
858 shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
865 /* We traveled all the way to the bottom of the allocation tree
866 * and didn't find room for any more extents - we need to add
867 * another tree level */
869 /* if we hit a leaf, we'd better be empty :) */
870 BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
871 le16_to_cpu(el->l_count));
873 mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
875 le16_to_cpu(fe->id2.i_list.l_tree_depth));
877 /* ocfs2_shift_tree_depth will return us a buffer with
878 * the new extent block (so we can pass that to
879 * ocfs2_add_branch). */
880 status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
886 /* Special case: we have room now if we shifted from
888 if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
892 /* call ocfs2_add_branch to add the final part of the tree with
894 mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
895 status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
903 /* Finally, we can add clusters. */
904 status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
905 start_blk, new_clusters);
/*
 * Tells whether the truncate log held in osb->osb_tl_bh is full: the
 * result is true when tl_used equals tl_count. A tl_used larger than
 * tl_count trips mlog_bug_on_msg().
 */
920 static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
922 struct buffer_head *tl_bh = osb->osb_tl_bh;
923 struct ocfs2_dinode *di;
924 struct ocfs2_truncate_log *tl;
926 di = (struct ocfs2_dinode *) tl_bh->b_data;
927 tl = &di->id2.i_dealloc;
929 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
930 "slot %d, invalid truncate log parameters: used = "
931 "%u, count = %u\n", osb->slot_num,
932 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
933 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
/*
 * Checks whether a run starting at cluster new_start directly follows
 * the last used truncate record, i.e. whether t_start + t_clusters of
 * record tl_used - 1 equals new_start. The tl_used == 0 case is handled
 * separately before any record is read.
 */
936 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
937 unsigned int new_start)
939 unsigned int tail_index;
940 unsigned int current_tail;
942 /* No records, nothing to coalesce */
943 if (!le16_to_cpu(tl->tl_used))
946 tail_index = le16_to_cpu(tl->tl_used) - 1;
/* current_tail ends up as the first cluster past the last record. */
947 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
948 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
950 return current_tail == new_start;
/*
 * Adds num_clusters starting at start_blk to the truncate log held in
 * osb->osb_tl_bh.
 *
 * Expects tl_inode->i_mutex to be held already (BUG_ON if the trylock
 * succeeds). The log dinode is validated, tl_count is sanity checked,
 * and journal write access to tl_bh is reserved before the change.
 * When the new range starts where the last record ends, the clusters
 * are folded into that record's t_clusters; otherwise a record is
 * written at index tl_used and tl_used is advanced by one. tl_bh is
 * then journal-dirtied.
 *
 * The caller is expected to have flushed a full log beforehand; see the
 * index >= tl_count check.
 */
953 static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
954 struct ocfs2_journal_handle *handle,
956 unsigned int num_clusters)
959 unsigned int start_cluster, tl_count;
960 struct inode *tl_inode = osb->osb_tl_inode;
961 struct buffer_head *tl_bh = osb->osb_tl_bh;
962 struct ocfs2_dinode *di;
963 struct ocfs2_truncate_log *tl;
965 mlog_entry("start_blk = %llu, num_clusters = %u\n",
966 (unsigned long long)start_blk, num_clusters);
968 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
970 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
972 di = (struct ocfs2_dinode *) tl_bh->b_data;
973 tl = &di->id2.i_dealloc;
974 if (!OCFS2_IS_VALID_DINODE(di)) {
975 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
980 tl_count = le16_to_cpu(tl->tl_count);
981 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
983 "Truncate record count on #%llu invalid "
984 "wanted %u, actual %u\n",
985 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
986 ocfs2_truncate_recs_per_inode(osb->sb),
987 le16_to_cpu(tl->tl_count));
989 /* Caller should have known to flush before calling us. */
990 index = le16_to_cpu(tl->tl_used);
991 if (index >= tl_count) {
997 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
998 OCFS2_JOURNAL_ACCESS_WRITE);
1004 mlog(0, "Log truncate of %u clusters starting at cluster %u to "
1005 "%llu (index = %d)\n", num_clusters, start_cluster,
1006 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
1008 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
1010 * Move index back to the record we are coalescing with.
1011 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
1015 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
1016 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
1017 index, le32_to_cpu(tl->tl_recs[index].t_start),
1020 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
1021 tl->tl_used = cpu_to_le16(index + 1);
1023 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
1025 status = ocfs2_journal_dirty(handle, tl_bh);
/*
 * Frees the clusters described by truncate log records, starting from
 * index tl_used - 1.
 *
 * For the record being processed: journal write access to tl_bh is
 * reserved, tl_used is set to that record's index and tl_bh is dirtied,
 * the transaction is extended by OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC, and
 * the record's range is handed to ocfs2_free_clusters() together with
 * data_alloc_inode and data_alloc_bh.
 *
 * NOTE(review): the loop construct is not visible here; presumably the
 * steps repeat down to index 0 - confirm.
 */
1036 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
1037 struct ocfs2_journal_handle *handle,
1038 struct inode *data_alloc_inode,
1039 struct buffer_head *data_alloc_bh)
1043 unsigned int num_clusters;
1045 struct ocfs2_truncate_rec rec;
1046 struct ocfs2_dinode *di;
1047 struct ocfs2_truncate_log *tl;
1048 struct inode *tl_inode = osb->osb_tl_inode;
1049 struct buffer_head *tl_bh = osb->osb_tl_bh;
1053 di = (struct ocfs2_dinode *) tl_bh->b_data;
1054 tl = &di->id2.i_dealloc;
1055 i = le16_to_cpu(tl->tl_used) - 1;
1057 /* Caller has given us at least enough credits to
1058 * update the truncate log dinode */
1059 status = ocfs2_journal_access(handle, tl_inode, tl_bh,
1060 OCFS2_JOURNAL_ACCESS_WRITE);
1066 tl->tl_used = cpu_to_le16(i);
1068 status = ocfs2_journal_dirty(handle, tl_bh);
1074 /* TODO: Perhaps we can calculate the bulk of the
1075 * credits up front rather than extending like
1077 status = ocfs2_extend_trans(handle,
1078 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
1084 rec = tl->tl_recs[i];
1085 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
1086 le32_to_cpu(rec.t_start));
1087 num_clusters = le32_to_cpu(rec.t_clusters);
1089 /* if start_blk is not set, we ignore the record as
1092 mlog(0, "free record %d, start = %u, clusters = %u\n",
1093 i, le32_to_cpu(rec.t_start), num_clusters);
1095 status = ocfs2_free_clusters(handle, data_alloc_inode,
1096 data_alloc_bh, start_blk,
/*
 * Flushes the truncate log by replaying its records against the global
 * bitmap inode.
 *
 * After validating the log dinode and reading tl_used, the function
 * allocates a handle, looks up GLOBAL_BITMAP_SYSTEM_INODE, takes its
 * meta lock, starts a transaction with OCFS2_TRUNCATE_LOG_UPDATE and
 * calls ocfs2_replay_truncate_records(). On the way out the transaction
 * is committed, and the bitmap inode and buffer are released.
 *
 * NOTE(review): with tl_used == 0 the function presumably bails out
 * early; that branch body is not visible here - confirm.
 */
1111 /* Expects you to already be holding tl_inode->i_mutex */
1112 static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1115 unsigned int num_to_flush;
1116 struct ocfs2_journal_handle *handle = NULL;
1117 struct inode *tl_inode = osb->osb_tl_inode;
1118 struct inode *data_alloc_inode = NULL;
1119 struct buffer_head *tl_bh = osb->osb_tl_bh;
1120 struct buffer_head *data_alloc_bh = NULL;
1121 struct ocfs2_dinode *di;
1122 struct ocfs2_truncate_log *tl;
1126 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
1128 di = (struct ocfs2_dinode *) tl_bh->b_data;
1129 tl = &di->id2.i_dealloc;
1130 if (!OCFS2_IS_VALID_DINODE(di)) {
1131 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
1136 num_to_flush = le16_to_cpu(tl->tl_used);
1137 mlog(0, "Flush %u records from truncate log #%llu\n",
1138 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
1139 if (!num_to_flush) {
1144 handle = ocfs2_alloc_handle(osb);
1151 data_alloc_inode = ocfs2_get_system_file_inode(osb,
1152 GLOBAL_BITMAP_SYSTEM_INODE,
1153 OCFS2_INVALID_SLOT);
1154 if (!data_alloc_inode) {
1156 mlog(ML_ERROR, "Could not get bitmap inode!\n");
1160 ocfs2_handle_add_inode(handle, data_alloc_inode);
1161 status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
1167 handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
1168 if (IS_ERR(handle)) {
1169 status = PTR_ERR(handle);
1175 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
1184 ocfs2_commit_trans(handle);
1186 if (data_alloc_inode)
1187 iput(data_alloc_inode);
1190 brelse(data_alloc_bh);
/*
 * Locked wrapper: runs __ocfs2_flush_truncate_log() with the truncate
 * log inode's i_mutex held and releases the mutex afterwards.
 */
1196 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
1199 struct inode *tl_inode = osb->osb_tl_inode;
1201 mutex_lock(&tl_inode->i_mutex);
1202 status = __ocfs2_flush_truncate_log(osb);
1203 mutex_unlock(&tl_inode->i_mutex);
/*
 * Delayed-work handler: recovers the ocfs2_super that embeds
 * osb_truncate_log_wq via container_of() and flushes its truncate log.
 */
1208 static void ocfs2_truncate_log_worker(struct work_struct *work)
1211 struct ocfs2_super *osb =
1212 container_of(work, struct ocfs2_super,
1213 osb_truncate_log_wq.work);
1217 status = ocfs2_flush_truncate_log(osb);
/* Delay, in jiffies, before a scheduled truncate log flush runs. */
1224 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
/*
 * Queues the delayed truncate log flush work on ocfs2_wq, to run after
 * OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL. Nothing is done when
 * osb->osb_tl_inode is not set.
 * NOTE(review): cancel_delayed_work() is presumably conditional on the
 * second argument, which is not visible here - confirm.
 */
1225 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
1228 if (osb->osb_tl_inode) {
1229 /* We want to push off log flushes while truncates are
1232 cancel_delayed_work(&osb->osb_truncate_log_wq);
1234 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
1235 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
/*
 * Looks up the TRUNCATE_LOG_SYSTEM_INODE system file and reads its
 * dinode block (OCFS2_I(inode)->ip_blkno) with OCFS2_BH_CACHED. A failed
 * lookup is logged with ML_ERROR.
 * NOTE(review): the results are presumably handed back through
 * *tl_inode and *tl_bh; the assignments are not visible here.
 */
1239 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
1241 struct inode **tl_inode,
1242 struct buffer_head **tl_bh)
1245 struct inode *inode = NULL;
1246 struct buffer_head *bh = NULL;
1248 inode = ocfs2_get_system_file_inode(osb,
1249 TRUNCATE_LOG_SYSTEM_INODE,
1253 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
1257 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
1258 OCFS2_BH_CACHED, inode);
1272 /* called during the 1st stage of node recovery. we stamp a clean
1273 * truncate log and pass back a copy for processing later. if the
1274 * truncate log does not require processing, a *tl_copy is set to
/*
 * First stage of truncate log recovery for slot slot_num.
 *
 * Loads that slot's truncate log through ocfs2_get_truncate_log_info()
 * and validates its dinode. When tl_used is non-zero, the whole block is
 * copied into a kmalloc'ed buffer returned in *tl_copy, and the log
 * block is written out with ocfs2_write_block().
 *
 * NOTE(review): the status < 0 path with *tl_copy set presumably frees
 * the copy again; that body is not visible here - confirm.
 */
1276 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
1278 struct ocfs2_dinode **tl_copy)
1281 struct inode *tl_inode = NULL;
1282 struct buffer_head *tl_bh = NULL;
1283 struct ocfs2_dinode *di;
1284 struct ocfs2_truncate_log *tl;
1288 mlog(0, "recover truncate log from slot %d\n", slot_num);
1290 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
1296 di = (struct ocfs2_dinode *) tl_bh->b_data;
1297 tl = &di->id2.i_dealloc;
1298 if (!OCFS2_IS_VALID_DINODE(di)) {
1299 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
1304 if (le16_to_cpu(tl->tl_used)) {
1305 mlog(0, "We'll have %u logs to recover\n",
1306 le16_to_cpu(tl->tl_used));
1308 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
1315 /* Assuming the write-out below goes well, this copy
1316 * will be passed back to recovery for processing. */
1317 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
1319 /* All we need to do to clear the truncate log is set
1323 status = ocfs2_write_block(osb, tl_bh, tl_inode);
1336 if (status < 0 && (*tl_copy)) {
/*
 * Second stage of truncate log recovery: re-logs every record of
 * tl_copy into the local truncate log (osb->osb_tl_inode).
 *
 * An error is logged when tl_copy's i_blkno matches our own truncate
 * log inode. With tl_inode->i_mutex held, each of the tl_used records
 * is appended via ocfs2_truncate_log_append() inside its own
 * transaction; the local log is flushed first whenever it is full.
 *
 * NOTE(review): the "cleanup %u records" message prints
 * tl_copy->i_blkno without le64_to_cpu(), unlike the comparison just
 * above it - worth checking.
 */
1345 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
1346 struct ocfs2_dinode *tl_copy)
1350 unsigned int clusters, num_recs, start_cluster;
1352 struct ocfs2_journal_handle *handle;
1353 struct inode *tl_inode = osb->osb_tl_inode;
1354 struct ocfs2_truncate_log *tl;
1358 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
1359 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
1363 tl = &tl_copy->id2.i_dealloc;
1364 num_recs = le16_to_cpu(tl->tl_used);
1365 mlog(0, "cleanup %u records from %llu\n", num_recs,
1366 (unsigned long long)tl_copy->i_blkno);
1368 mutex_lock(&tl_inode->i_mutex);
1369 for(i = 0; i < num_recs; i++) {
1370 if (ocfs2_truncate_log_needs_flush(osb)) {
1371 status = __ocfs2_flush_truncate_log(osb);
1378 handle = ocfs2_start_trans(osb, NULL,
1379 OCFS2_TRUNCATE_LOG_UPDATE);
1380 if (IS_ERR(handle)) {
1381 status = PTR_ERR(handle);
1386 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
1387 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
1388 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
1390 status = ocfs2_truncate_log_append(osb, handle,
1391 start_blk, clusters);
1392 ocfs2_commit_trans(handle);
1400 mutex_unlock(&tl_inode->i_mutex);
/*
 * Tears down truncate log state: cancels the delayed flush work,
 * flushes ocfs2_wq, performs a final ocfs2_flush_truncate_log() and
 * drops the references held in osb->osb_tl_bh and osb->osb_tl_inode.
 */
1406 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
1409 struct inode *tl_inode = osb->osb_tl_inode;
1414 cancel_delayed_work(&osb->osb_truncate_log_wq);
1415 flush_workqueue(ocfs2_wq);
1417 status = ocfs2_flush_truncate_log(osb);
1421 brelse(osb->osb_tl_bh);
1422 iput(osb->osb_tl_inode);
/*
 * Sets up truncate log state: obtains the truncate log inode and buffer
 * through ocfs2_get_truncate_log_info(), initialises the delayed work
 * item with ocfs2_truncate_log_worker as its handler, and stores the
 * buffer and inode in the osb.
 */
1428 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
1431 struct inode *tl_inode = NULL;
1432 struct buffer_head *tl_bh = NULL;
1436 status = ocfs2_get_truncate_log_info(osb,
1443 /* ocfs2_truncate_log_shutdown keys on the existence of
1444 * osb->osb_tl_inode so we don't set any of the osb variables
1445 * until we're sure all is well. */
1446 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
1447 ocfs2_truncate_log_worker);
1448 osb->osb_tl_bh = tl_bh;
1449 osb->osb_tl_inode = tl_inode;
1455 /* This function will figure out whether the currently last extent
1456 * block will be deleted, and if it will, what the new last extent
1457 * block will be so we can update its h_next_leaf_blk field, as well
1458 * as the dinode's i_last_eb_blk */
/*
 * *new_last_eb starts out as NULL. The dinode is validated, and the
 * cases "no tree", "truncate to zero clusters" and "first record of the
 * old last block starts below new_i_clusters" are each checked before
 * any tree walk happens. Otherwise the tree is descended from the dinode
 * list: at each level the records are scanned from the last used one
 * backwards by e_cpos, the chosen block is read and validated, and the
 * walk continues until a list with l_tree_depth == 0 is reached. A
 * reference is taken on the buffer returned in *new_last_eb (get_bh).
 *
 * NOTE(review): the right-hand side of the e_cpos comparison in the scan
 * is not visible here, presumably new_i_clusters - confirm.
 */
1459 static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
1460 struct inode *inode,
1461 struct ocfs2_dinode *fe,
1463 struct buffer_head *old_last_eb,
1464 struct buffer_head **new_last_eb)
1468 struct ocfs2_extent_block *eb;
1469 struct ocfs2_extent_list *el;
1470 struct buffer_head *bh = NULL;
1472 *new_last_eb = NULL;
1474 if (!OCFS2_IS_VALID_DINODE(fe)) {
1475 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1480 /* we have no tree, so of course, no last_eb. */
1481 if (!fe->id2.i_list.l_tree_depth)
1484 /* trunc to zero special case - this makes tree_depth = 0
1485 * regardless of what it is. */
1486 if (!new_i_clusters)
1489 eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
1491 BUG_ON(!el->l_next_free_rec);
1493 /* Make sure that this guy will actually be empty after we
1494 * clear away the data. */
1495 if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
1498 /* Ok, at this point, we know that last_eb will definitely
1499 * change, so let's traverse the tree and find the second to
1500 * last extent block. */
1501 el = &(fe->id2.i_list);
1502 /* go down the tree, */
1504 for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
1505 if (le32_to_cpu(el->l_recs[i].e_cpos) <
1507 block = le64_to_cpu(el->l_recs[i].e_blkno);
1518 status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
1524 eb = (struct ocfs2_extent_block *) bh->b_data;
1526 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1527 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1531 } while (el->l_tree_depth);
1534 get_bh(*new_last_eb);
1535 mlog(0, "returning block %llu\n",
1536 (unsigned long long)le64_to_cpu(eb->h_blkno));
/*
 * Remove clusters_to_del clusters from the tail of the extent tree of
 * an inode, using the journal handle supplied by the caller.
 *
 * The dinode is updated first: the in-memory ip_clusters (under
 * ip_lock), the on-disk i_clusters, i_mtime, and the last used record
 * of the root extent list. The same trimming is then applied to the
 * last used record of each extent block reached through next_eb. An
 * extent block left with no records is removed from the inode cache
 * and, when it belongs to suballocator slot 0, freed through the
 * allocator recorded in tc. The starting block of the removed data
 * (delete_blk) is handed to ocfs2_truncate_log_append(), and the cached
 * extent map is trimmed or dropped at the end.
 *
 * @osb:             superblock info; used for block reads, the
 *                   cluster-to-block conversion and the truncate log
 * @clusters_to_del: clusters to trim from the last record at every
 *                   level; BUG if a record holds fewer
 * @inode:           inode being truncated
 * @fe_bh:           buffer head holding the on-disk dinode
 * @old_last_eb_bh:  buffer head of the current last extent block
 *                   (presumably forwarded to
 *                   ocfs2_find_new_last_ext_blk() - TODO confirm)
 * @handle:          journal handle used for every access/dirty call
 * @tc:              truncate context supplying tc_ext_alloc_inode and
 *                   tc_ext_alloc_bh for freeing extent blocks
 */
1544 static int ocfs2_do_truncate(struct ocfs2_super *osb,
1545 unsigned int clusters_to_del,
1546 struct inode *inode,
1547 struct buffer_head *fe_bh,
1548 struct buffer_head *old_last_eb_bh,
1549 struct ocfs2_journal_handle *handle,
1550 struct ocfs2_truncate_context *tc)
1552 int status, i, depth;
1553 struct ocfs2_dinode *fe;
1554 struct ocfs2_extent_block *eb;
1555 struct ocfs2_extent_block *last_eb = NULL;
1556 struct ocfs2_extent_list *el;
1557 struct buffer_head *eb_bh = NULL;
1558 struct buffer_head *last_eb_bh = NULL;
1562 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1564 status = ocfs2_find_new_last_ext_blk(osb,
1567 le32_to_cpu(fe->i_clusters) -
1576 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
/* The dinode buffer must be declared to the journal before it is
 * modified. */
1578 status = ocfs2_journal_access(handle, inode, fe_bh,
1579 OCFS2_JOURNAL_ACCESS_WRITE);
1584 el = &(fe->id2.i_list);
/* Keep the in-memory cluster count in step with the on-disk one. */
1586 spin_lock(&OCFS2_I(inode)->ip_lock);
1587 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
1589 spin_unlock(&OCFS2_I(inode)->ip_lock);
1590 le32_add_cpu(&fe->i_clusters, -clusters_to_del);
1591 fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
1592 fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
/* Trim the last used record of the root extent list. */
1594 i = le16_to_cpu(el->l_next_free_rec) - 1;
1596 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1597 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1598 /* tree depth zero, we can just delete the clusters, otherwise
1599 * we need to record the offset of the next level extent block
1600 * as we may overwrite it. */
1601 if (!el->l_tree_depth)
1602 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1603 + ocfs2_clusters_to_blocks(osb->sb,
1604 le32_to_cpu(el->l_recs[i].e_clusters));
1606 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1608 if (!el->l_recs[i].e_clusters) {
1609 /* if we deleted the whole extent record, then clear
1610 * out the other fields and update the extent
1611 * list. For depth > 0 trees, we've already recorded
1612 * the extent block in 'next_eb' */
1613 el->l_recs[i].e_cpos = 0;
1614 el->l_recs[i].e_blkno = 0;
1615 BUG_ON(!el->l_next_free_rec);
1616 le16_add_cpu(&el->l_next_free_rec, -1);
/* Save the depth before it may be reset below; each extent block
 * visited later is checked against it. */
1619 depth = le16_to_cpu(el->l_tree_depth);
1620 if (!fe->i_clusters) {
1621 /* trunc to zero is a special case. */
1622 el->l_tree_depth = 0;
1623 fe->i_last_eb_blk = 0;
/* Both fields hold little-endian values, so the block number is copied
 * without conversion. */
1625 fe->i_last_eb_blk = last_eb->h_blkno;
1627 status = ocfs2_journal_dirty(handle, fe_bh);
1634 /* If there will be a new last extent block, then by
1635 * definition, there cannot be any leaves to the right of
1637 status = ocfs2_journal_access(handle, inode, last_eb_bh,
1638 OCFS2_JOURNAL_ACCESS_WRITE);
1643 last_eb->h_next_leaf_blk = 0;
1644 status = ocfs2_journal_dirty(handle, last_eb_bh);
1651 /* if our tree depth > 0, update all the tree blocks below us. */
1653 mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
1654 depth, (unsigned long long)next_eb);
1655 status = ocfs2_read_block(osb, next_eb, &eb_bh,
1656 OCFS2_BH_CACHED, inode);
1661 eb = (struct ocfs2_extent_block *)eb_bh->b_data;
1662 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1663 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1669 status = ocfs2_journal_access(handle, inode, eb_bh,
1670 OCFS2_JOURNAL_ACCESS_WRITE);
/* Sanity checks: the block must have a record in use and must sit
 * exactly one level below the previous one. */
1676 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
1677 BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
1679 i = le16_to_cpu(el->l_next_free_rec) - 1;
1681 mlog(0, "extent block %llu, before: record %d: "
1682 "(%u, %u, %llu), next = %u\n",
1683 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1684 le32_to_cpu(el->l_recs[i].e_cpos),
1685 le32_to_cpu(el->l_recs[i].e_clusters),
1686 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1687 le16_to_cpu(el->l_next_free_rec));
/* Same trimming as was applied to the root list above. */
1689 BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
1690 le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
1692 next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
1693 /* bottom-most block requires us to delete data.*/
1694 if (!el->l_tree_depth)
1695 delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
1696 + ocfs2_clusters_to_blocks(osb->sb,
1697 le32_to_cpu(el->l_recs[i].e_clusters));
1698 if (!el->l_recs[i].e_clusters) {
1699 el->l_recs[i].e_cpos = 0;
1700 el->l_recs[i].e_blkno = 0;
1701 BUG_ON(!el->l_next_free_rec);
1702 le16_add_cpu(&el->l_next_free_rec, -1);
1704 mlog(0, "extent block %llu, after: record %d: "
1705 "(%u, %u, %llu), next = %u\n",
1706 (unsigned long long)le64_to_cpu(eb->h_blkno), i,
1707 le32_to_cpu(el->l_recs[i].e_cpos),
1708 le32_to_cpu(el->l_recs[i].e_clusters),
1709 (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
1710 le16_to_cpu(el->l_next_free_rec));
1712 status = ocfs2_journal_dirty(handle, eb_bh);
/* An extent block left with no records in use is dropped from the
 * inode cache and, for slot 0, returned to the allocator. */
1718 if (!el->l_next_free_rec) {
1719 mlog(0, "deleting this extent block.\n");
1721 ocfs2_remove_from_cache(inode, eb_bh);
1723 BUG_ON(el->l_recs[0].e_clusters);
1724 BUG_ON(el->l_recs[0].e_cpos);
1725 BUG_ON(el->l_recs[0].e_blkno);
1726 if (eb->h_suballoc_slot == 0) {
1728 * This code only understands how to
1729 * lock the suballocator in slot 0,
1730 * which is fine because allocation is
1731 * only ever done out of that
1732 * suballocator too. A future version
1733 * might change that however, so avoid
1734 * a free if we don't know how to
1735 * handle it. This way an fs incompat
1736 * bit will not be necessary.
1738 status = ocfs2_free_extent_block(handle,
1739 tc->tc_ext_alloc_inode,
1740 tc->tc_ext_alloc_bh,
1753 BUG_ON(!delete_blk);
1754 status = ocfs2_truncate_log_append(osb, handle, delete_blk,
1763 ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
1765 ocfs2_extent_map_drop(inode, 0);
1771 /* It is expected that, by the time you call this function,
1772 * inode->i_size and fe->i_size have been adjusted.
1774 * WARNING: This will kfree the truncate context. */
/*
 * Trim the allocation of an inode down to the cluster count implied by
 * i_size_read(inode).
 *
 * Runs with ip_alloc_sem held for writing. A pass works out how many
 * clusters to remove from the last record of the bottom-most extent
 * list, takes the i_mutex of the truncate log inode, flushes the
 * truncate log when ocfs2_truncate_log_needs_flush() asks for it,
 * starts a transaction sized by ocfs2_calc_tree_trunc_credits(), marks
 * the inode dirty and calls ocfs2_do_truncate(). The comparison of
 * fe->i_clusters against the target near the end suggests that further
 * passes follow until the target is reached (TODO confirm).
 *
 * @osb:   superblock info; also supplies the truncate log inode
 * @inode: inode being truncated
 * @fe_bh: buffer head holding the on-disk dinode
 * @tc:    truncate context; released through
 *         ocfs2_free_truncate_context() before returning
 */
1776 int ocfs2_commit_truncate(struct ocfs2_super *osb,
1777 struct inode *inode,
1778 struct buffer_head *fe_bh,
1779 struct ocfs2_truncate_context *tc)
1781 int status, i, credits, tl_sem = 0;
1782 u32 clusters_to_del, target_i_clusters;
1784 struct ocfs2_dinode *fe;
1785 struct ocfs2_extent_block *eb;
1786 struct ocfs2_extent_list *el;
1787 struct buffer_head *last_eb_bh;
1788 struct ocfs2_journal_handle *handle = NULL;
1789 struct inode *tl_inode = osb->osb_tl_inode;
1793 down_write(&OCFS2_I(inode)->ip_alloc_sem);
1795 target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1796 i_size_read(inode));
/* Move the last extent block buffer out of the context so that
 * freeing the context does not release it as well. */
1798 last_eb_bh = tc->tc_last_eb_bh;
1799 tc->tc_last_eb_bh = NULL;
1801 fe = (struct ocfs2_dinode *) fe_bh->b_data;
1803 if (fe->id2.i_list.l_tree_depth) {
1804 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1807 el = &fe->id2.i_list;
1808 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1810 mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
1811 "last_eb = %llu, fe->i_last_eb_blk = %llu, "
1812 "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
1813 le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
1814 (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
1815 le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
/* The dinode now names a different last extent block than the one
 * recorded in last_eb. */
1817 if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
1818 mlog(0, "last_eb changed!\n");
1819 BUG_ON(!fe->id2.i_list.l_tree_depth);
1820 last_eb = le64_to_cpu(fe->i_last_eb_blk);
1821 /* i_last_eb_blk may have changed, read it if
1822 * necessary. We don't have to worry about the
1823 * truncate to zero case here (where there becomes no
1824 * last_eb) because we never loop back after our work
1831 status = ocfs2_read_block(osb, last_eb,
1832 &last_eb_bh, OCFS2_BH_CACHED,
1838 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1839 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1840 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1847 /* by now, el will point to the extent list on the bottom most
1848 * portion of this tree. */
/* A last record that starts at or beyond the target is removed
 * entirely; otherwise only its tail is removed. */
1849 i = le16_to_cpu(el->l_next_free_rec) - 1;
1850 if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
1851 clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
1853 clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
1854 le32_to_cpu(el->l_recs[i].e_cpos)) -
1857 mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
/* The truncate log is protected by the i_mutex of its inode. */
1859 mutex_lock(&tl_inode->i_mutex);
1861 /* ocfs2_truncate_log_needs_flush guarantees us at least one
1862 * record is free for use. If there isn't any, we flush to get
1863 * an empty truncate log. */
1864 if (ocfs2_truncate_log_needs_flush(osb)) {
1865 status = __ocfs2_flush_truncate_log(osb);
1872 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
1874 handle = ocfs2_start_trans(osb, NULL, credits);
1875 if (IS_ERR(handle)) {
1876 status = PTR_ERR(handle);
1882 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1883 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
1887 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
1888 last_eb_bh, handle, tc);
1894 mutex_unlock(&tl_inode->i_mutex);
1897 ocfs2_commit_trans(handle);
/* Ending up below the target cluster count is treated as a fatal
 * logic error. */
1900 BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
1901 if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
1904 up_write(&OCFS2_I(inode)->ip_alloc_sem);
1906 ocfs2_schedule_truncate_log_flush(osb, 1);
1909 mutex_unlock(&tl_inode->i_mutex);
1912 ocfs2_commit_trans(handle);
1917 /* This will drop the ext_alloc cluster lock for us */
1918 ocfs2_free_truncate_context(tc);
1926 /* Expects the inode to already be locked. This will figure out which
1927 * inodes need to be locked and will put them on the returned truncate context. */
/*
 * Build the truncate context needed to shrink an inode to the cluster
 * count implied by i_size_read(inode).
 *
 * The on-disk cluster count must be strictly greater than the new one;
 * otherwise the mismatch is reported through ocfs2_error() and the meta
 * LVB is logged. When the dinode has an extent tree, the block named by
 * i_last_eb_blk is read and validated; if its first record starts at or
 * beyond the new cluster count, the truncate will delete metadata, so
 * the extent allocator system inode for slot 0 is looked up, its
 * i_mutex is taken and its meta lock is acquired. Each of those steps
 * is recorded in the context so ocfs2_free_truncate_context() can undo
 * exactly what was done.
 *
 * @osb:   superblock info
 * @inode: inode being truncated; expected to be locked by the caller
 * @fe_bh: buffer head holding the on-disk dinode
 * @tc:    out parameter receiving the zero-initialised context
 */
1930 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
1931 struct inode *inode,
1932 struct buffer_head *fe_bh,
1933 struct ocfs2_truncate_context **tc)
1935 int status, metadata_delete;
1936 unsigned int new_i_clusters;
1937 struct ocfs2_dinode *fe;
1938 struct ocfs2_extent_block *eb;
1939 struct ocfs2_extent_list *el;
1940 struct buffer_head *last_eb_bh = NULL;
1941 struct inode *ext_alloc_inode = NULL;
1942 struct buffer_head *ext_alloc_bh = NULL;
1948 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
1949 i_size_read(inode));
1950 fe = (struct ocfs2_dinode *) fe_bh->b_data;
/* The dinode fields are little-endian on disk; convert them before
 * logging so the values are correct on big-endian hosts too. */
1952 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
1953 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
1954 (unsigned long long)le64_to_cpu(fe->i_size));
/* A truncate must shrink the allocation; anything else means the
 * dinode and the in-memory inode disagree. */
1956 if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
1957 ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
1958 "%u and size %llu whereas struct inode has "
1959 "cluster count %u and size %llu which caused an "
1960 "invalid truncate to %u clusters.",
1961 (unsigned long long)le64_to_cpu(fe->i_blkno),
1962 le32_to_cpu(fe->i_clusters),
1963 (unsigned long long)le64_to_cpu(fe->i_size),
1964 OCFS2_I(inode)->ip_clusters, i_size_read(inode),
1966 mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
/* Zeroed allocation: every tc_* field starts out NULL/0. */
1971 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
1978 metadata_delete = 0;
1979 if (fe->id2.i_list.l_tree_depth) {
1980 /* If we have a tree, then the truncate may result in
1981 * metadata deletes. Figure this out from the
1982 * rightmost leaf block.*/
1983 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
1984 &last_eb_bh, OCFS2_BH_CACHED, inode);
1989 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
1990 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1991 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
/* If even the first record starts at or past the new end, the whole
 * block goes away. */
1998 if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
1999 metadata_delete = 1;
/* Hand the buffer (NULL when there is no tree) to the context. */
2002 (*tc)->tc_last_eb_bh = last_eb_bh;
2004 if (metadata_delete) {
2005 mlog(0, "Will have to delete metadata for this trunc. "
2006 "locking allocator.\n");
2007 ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
2008 if (!ext_alloc_inode) {
/* Record the inode as soon as its mutex is held so the context
 * teardown can unlock and put it. */
2014 mutex_lock(&ext_alloc_inode->i_mutex);
2015 (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
2017 status = ocfs2_meta_lock(ext_alloc_inode,
2025 (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
2026 (*tc)->tc_ext_alloc_locked = 1;
/* NOTE(review): releases the partially built context; presumably
 * reached only on failure - confirm against the guarding condition. */
2033 ocfs2_free_truncate_context(*tc);
/*
 * Release the resources recorded in a truncate context.
 *
 * If an extent allocator inode was recorded, its meta lock is dropped
 * (only when tc_ext_alloc_locked is set), its i_mutex is unlocked and
 * the inode is put. This is the reverse of the order in which
 * ocfs2_prepare_truncate() looks up the inode, takes the mutex and then
 * takes the meta lock. The allocator buffer head and the last extent
 * block buffer head are released when present.
 *
 * @tc: truncate context to tear down
 */
2040 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
2042 if (tc->tc_ext_alloc_inode) {
/* The meta lock is dropped only if it was actually acquired. */
2043 if (tc->tc_ext_alloc_locked)
2044 ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
2046 mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
2047 iput(tc->tc_ext_alloc_inode);
2050 if (tc->tc_ext_alloc_bh)
2051 brelse(tc->tc_ext_alloc_bh);
2053 if (tc->tc_last_eb_bh)
2054 brelse(tc->tc_last_eb_bh);