X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Focfs2%2Fextent_map.c;h=843db64e9d4ae9d5b9a353a3e7b3d001c3456297;hb=1557aca7904ed6fadd22cdc3364754070bb3d3c3;hp=b6ba292e9544000444718395aee0be5f39bf6de0;hpb=ebdec83ba46c123fe3bfdcaacf62d0dfe8fe4187;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index b6ba292..843db64 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -3,8 +3,7 @@ * * extent_map.c * - * In-memory extent map for OCFS2. Man, this code was prettier in - * the library. + * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. * @@ -26,967 +25,862 @@ #include #include #include -#include -#include +#include #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include #include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" #include "extent_map.h" #include "inode.h" #include "super.h" #include "buffer_head_io.h" - /* - * SUCK SUCK SUCK - * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h + * The extent caching implementation is intentionally trivial. + * + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. */ -struct ocfs2_extent_map_entry { - struct rb_node e_node; - int e_tree_depth; - struct ocfs2_extent_rec e_rec; -}; - -struct ocfs2_em_insert_context { - int need_left; - int need_right; - struct ocfs2_extent_map_entry *new_ent; - struct ocfs2_extent_map_entry *old_ent; - struct ocfs2_extent_map_entry *left_ent; - struct ocfs2_extent_map_entry *right_ent; -}; - -static kmem_cache_t *ocfs2_em_ent_cachep = NULL; - - -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent); -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth); -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent); -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el); -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_map_entry **ret_ent); -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt); - -/* returns 1 only if the rec contains all the given clusters -- that is that - * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + - * clusters) is >= the argument's endpoint */ -static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, - u32 cpos, u32 clusters) +void ocfs2_extent_map_init(struct inode *inode) { - if (le32_to_cpu(rec->e_cpos) > cpos) - return 0; - if (cpos + clusters > le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) - return 0; - return 1; -} + struct ocfs2_inode_info *oi = OCFS2_I(inode); - -/* - * Find an entry in the tree that intersects the region passed in. - * Note that this will find straddled intervals, it is up to the - * callers to enforce any boundary conditions. - * - * Callers must hold ip_lock. This lookup is not guaranteed to return - * a tree_depth 0 match, and as such can race inserts if the lock - * were not held. - * - * The rb_node garbage lets insertion share the search. Trivial - * callers pass NULL. 
- */ -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent) -{ - struct rb_node **p = &em->em_extents.rb_node; - struct rb_node *parent = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - while (*p) - { - parent = *p; - ent = rb_entry(parent, struct ocfs2_extent_map_entry, - e_node); - if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { - p = &(*p)->rb_left; - ent = NULL; - } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters))) { - p = &(*p)->rb_right; - ent = NULL; - } else - break; - } - - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - return ent; + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); } -/* - * Find the leaf containing the interval we want. While we're on our - * way down the tree, fill in every record we see at any depth, because - * we might want it later. - * - * Note that this code is run without ip_lock. That's because it - * sleeps while reading. If someone is also filling the extent list at - * the same time we are, we might have to restart. - */ -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el) +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) { - int i, ret; - struct buffer_head *eb_bh = NULL; - u64 blkno; - u32 rec_end; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_rec *rec; - - /* - * The bh data containing the el cannot change here, because - * we hold alloc_sem. So we can do this without other - * locks. - */ - while (el->l_tree_depth) - { - blkno = 0; - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - rec_end = (le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - - ret = -EBADR; - if (rec_end > OCFS2_I(inode)->ip_clusters) { - mlog_errno(ret); - goto out_free; - } + unsigned int range; + struct ocfs2_extent_map_item *emi; - if (rec_end <= cpos) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } + *ret_emi = NULL; - /* - * We've found a record that matches our - * interval. We don't insert it because we're - * about to traverse it. - */ - - /* Check to see if we're stradling */ - ret = -ESRCH; - if (!ocfs2_extent_rec_contains_clusters(rec, - cpos, - clusters)) { - mlog_errno(ret); - goto out_free; - } - - /* - * If we've already found a record, the el has - * two records covering the same interval. - * EEEK! 
- */ - ret = -EBADR; - if (blkno) { - mlog_errno(ret); - goto out_free; - } - - blkno = le64_to_cpu(rec->e_blkno); - } + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; - /* - * We don't support holes, and we're still up - * in the branches, so we'd better have found someone - */ - ret = -EBADR; - if (!blkno) { - mlog_errno(ret); - goto out_free; - } + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); - if (eb_bh) { - brelse(eb_bh); - eb_bh = NULL; - } - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - blkno, &eb_bh, OCFS2_BH_CACHED, - inode); - if (ret) { - mlog_errno(ret); - goto out_free; - } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out_free; + *ret_emi = emi; + break; } - el = &eb->h_list; } +} - BUG_ON(el->l_tree_depth); - - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret) { - mlog_errno(ret); - goto out_free; - } +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; } - ret = 0; + spin_unlock(&oi->ip_lock); -out_free: - if (eb_bh) - brelse(eb_bh); + if (emi == NULL) + return -ENOENT; - return ret; + return 0; } /* - * This lookup actually will read from disk. It has one invariant: - * It will never re-traverse blocks. This means that all inserts should - * be new regions or more granular regions (both allowed by insert). + * Forget about all clusters equal to or greater than cpos. */ -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, - u32 clusters, - struct ocfs2_extent_map_entry **ret_ent) +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - int ret; - u64 blkno; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct buffer_head *bh = NULL; - struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; - struct ocfs2_extent_list *el; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (ent) { - if (!ent->e_tree_depth) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - *ret_ent = ent; - return 0; - } - blkno = le64_to_cpu(ent->e_rec.e_blkno); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + struct ocfs2_extent_map_item *emi, *n; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. 
*/ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; } - eb = (struct ocfs2_extent_block *)bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - brelse(bh); - return -EIO; - } - el = &eb->h_list; - } else { - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; - } - di = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - brelse(bh); - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); - return -EIO; + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; } - el = &di->id2.i_list; } + spin_unlock(&oi->ip_lock); - ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); - brelse(bh); - if (ret) { - mlog_errno(ret); - return ret; + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { + list_del(&emi->ei_list); + kfree(emi); } - - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (!ent) { - ret = -ESRCH; - mlog_errno(ret); - return ret; - } - - /* FIXME: Make sure this isn't a corruption */ - BUG_ON(ent->e_tree_depth); - - *ret_ent = ent; - - return 0; } /* - * Callers must hold ip_lock. This can insert pieces of the tree, - * thus racing lookup if the lock weren't held. + * Is any part of emi2 contained within emi1 */ -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent) +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) { - struct rb_node **p, *parent; - struct ocfs2_extent_map_entry *old_ent; + unsigned int range1, range2; - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), - le32_to_cpu(ent->e_rec.e_clusters), - &p, &parent); - if (old_ent) - return -EEXIST; + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; - rb_link_node(&ent->e_node, parent, p); - rb_insert_color(&ent->e_node, &em->em_extents); + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; return 0; } +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} /* - * Simple rule: on any return code other than -EAGAIN, anything left - * in the insert_context will be freed. + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. 
*/ -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt) +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *old_ent; - - ctxt->need_left = 0; - ctxt->need_right = 0; - ctxt->old_ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - if (!ret) { - ctxt->new_ent = NULL; - goto out_unlock; + /* + * Handle contiguousness + */ + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; } - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), - le32_to_cpu(rec->e_clusters), NULL, - NULL); - - BUG_ON(!old_ent); - - ret = -EEXIST; - if (old_ent->e_tree_depth < tree_depth) - goto out_unlock; + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; + } - if (old_ent->e_tree_depth == tree_depth) { - if (!memcmp(rec, &old_ent->e_rec, - sizeof(struct ocfs2_extent_rec))) - ret = 0; + /* No merge was possible. */ + return 0; +} - /* FIXME: Should this be ESRCH/EBADR??? */ - goto out_unlock; +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } } /* - * We do it in this order specifically so that no actual tree - * changes occur until we have all the pieces we need. We - * don't want malloc failures to leave an inconsistent tree. - * Whenever we drop the lock, another process could be - * inserting. Also note that, if another process just beat us - * to an insert, we might not need the same pieces we needed - * the first go round. In the end, the pieces we need will - * be used, and the pieces we don't will be freed. + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. 
*/ - ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > - le32_to_cpu(old_ent->e_rec.e_cpos)); - ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) > - (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); - ret = -EAGAIN; - if (ctxt->need_left) { - if (!ctxt->left_ent) - goto out_unlock; - *(ctxt->left_ent) = *old_ent; - ctxt->left_ent->e_rec.e_clusters = - cpu_to_le32(le32_to_cpu(rec->e_cpos) - - le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); - } - if (ctxt->need_right) { - if (!ctxt->right_ent) - goto out_unlock; - *(ctxt->right_ent) = *old_ent; - ctxt->right_ent->e_rec.e_cpos = - cpu_to_le32(le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - ctxt->right_ent->e_rec.e_clusters = - cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) - - le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); - } - - rb_erase(&old_ent->e_node, &em->em_extents); - /* Now that he's erased, set him up for deletion */ - ctxt->old_ent = old_ent; - - if (ctxt->need_left) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->left_ent); - if (ret) - goto out_unlock; - ctxt->left_ent = NULL; - } - if (ctxt->need_right) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->right_ent); - if (ret) - goto out_unlock; - ctxt->right_ent = NULL; - } + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; - if (!ret) - ctxt->new_ent = NULL; + goto search; + } -out_unlock: - spin_unlock(&OCFS2_I(inode)->ip_lock); + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } - return ret; -} + spin_unlock(&oi->ip_lock); +out: + if (new_emi) + kfree(new_emi); +} -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth) +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) { - int ret; - struct ocfs2_em_insert_context ctxt = {0, }; + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; + ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh); + if (ret) { mlog_errno(ret); - return ret; + goto out; } - /* Zero e_clusters means a truncated tail record. 
It better be EOF */ - if (!rec->e_clusters) { - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - return ret; - } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; - /* Ignore the truncated tail */ - return 0; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; } - ret = -ENOMEM; - ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); - if (!ctxt.new_ent) { - mlog_errno(ret); - return ret; - } - - ctxt.new_ent->e_rec = *rec; - ctxt.new_ent->e_tree_depth = tree_depth; - - do { - ret = -ENOMEM; - if (ctxt.need_left && !ctxt.left_ent) { - ctxt.left_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); - if (!ctxt.left_ent) - break; - } - if (ctxt.need_right && !ctxt.right_ent) { - ctxt.right_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); - if (!ctxt.right_ent) - break; - } - - ret = ocfs2_extent_map_try_insert(inode, rec, - tree_depth, &ctxt); - } while (ret == -EAGAIN); - - if (ret < 0) - mlog_errno(ret); + next_free = le16_to_cpu(el->l_next_free_rec); - if (ctxt.left_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); - if (ctxt.right_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); - if (ctxt.old_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); - if (ctxt.new_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; +out: + brelse(eb_bh); return ret; } /* - * Append this record to the tail of the extent map. It must be - * tree_depth 0. The record might be an extension of an existing - * record, and as such that needs to be handled. eg: - * - * Existing record in the extent map: - * - * cpos = 10, len = 10 - * |---------| - * - * New Record: - * - * cpos = 10, len = 20 - * |------------------| - * - * The passed record is the new on-disk record. The new_clusters value - * is how many clusters were added to the file. If the append is a - * contiguous append, the new_clusters has been added to - * rec->e_clusters. If the append is an entirely new extent, then - * rec->e_clusters is == new_clusters. + * Return the 1st index within el which contains an extent start + * larger than v_cluster. */ -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters) +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct ocfs2_extent_rec *old; + int i; + struct ocfs2_extent_rec *rec; - BUG_ON(!new_clusters); - BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { - /* - * Size changed underneath us on disk. 
Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } - - mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (em->em_clusters + new_clusters), - "Inode %"MLFu64":\n" - "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" - "em->em_clusters = %u + new_clusters = %u = %u\n", - OCFS2_I(inode)->ip_blkno, - le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), - le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), - em->em_clusters, new_clusters, - em->em_clusters + new_clusters); - - em->em_clusters += new_clusters; - - ret = -ENOENT; - if (le32_to_cpu(rec->e_clusters) > new_clusters) { - /* This is a contiguous append */ - ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, - NULL, NULL); - if (ent) { - old = &ent->e_rec; - BUG_ON((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (le32_to_cpu(old->e_cpos) + - le32_to_cpu(old->e_clusters) + - new_clusters)); - if (ent->e_tree_depth == 0) { - BUG_ON(le32_to_cpu(old->e_cpos) != - le32_to_cpu(rec->e_cpos)); - BUG_ON(le64_to_cpu(old->e_blkno) != - le64_to_cpu(rec->e_blkno)); - ret = 0; - } - /* - * Let non-leafs fall through as -ENOENT to - * force insertion of the new leaf. - */ - le32_add_cpu(&old->e_clusters, new_clusters); - } + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; } - if (ret == -ENOENT) - ret = ocfs2_extent_map_insert(inode, rec, 0); - if (ret < 0) - mlog_errno(ret); - return ret; + return i; } -#if 0 -/* Code here is included but defined out as it completes the extent - * map api and may be used in the future. */ - /* - * Look up the record containing this cluster offset. This record is - * part of the extent map. Do not free it. Any changes you make to - * it will reflect in the extent map. So, if your last extent - * is (cpos = 10, clusters = 10) and you truncate the file by 5 - * clusters, you can do: - * - * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); - * rec->e_clusters -= 5; + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. * - * The lookup does not read from disk. If the map isn't filled in for - * an entry, you won't find it. + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. * - * Also note that the returned record is valid until alloc_sem is - * dropped. After that, truncate and extend can happen. Caveat Emptor. + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. */ -int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, - struct ocfs2_extent_rec **rec, - int *tree_depth) +int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { - int ret = -ENOENT; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; - *rec = NULL; + i = ocfs2_search_for_hole_index(el, v_cluster); - if (cpos >= OCFS2_I(inode)->ip_clusters) - return -EINVAL; + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (cpos >= em->em_clusters) { /* - * Size changed underneath us on disk. 
Drop any - * straddling records and update our idea of - * i_clusters + * Check the next leaf for any extents. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters ; - } - ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, - NULL, NULL); + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; - if (ent) { - *rec = &ent->e_rec; - if (tree_depth) - *tree_depth = ent->e_tree_depth; - ret = 0; + ret = ocfs2_read_extent_block(ci, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; + el = &next_eb->h_list; + i = ocfs2_search_for_hole_index(el, v_cluster); } +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { + /* + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. + */ + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; + } + + ret = 0; +out: + brelse(next_eb_bh); return ret; } -int ocfs2_extent_map_get_clusters(struct inode *inode, - u32 v_cpos, int count, - u32 *p_cpos, int *ret_count) +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) { - int ret; - u32 coff, ccount; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent = NULL; + int i, ret, tree_height, len; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *uninitialized_var(eb); + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct buffer_head *eb_bh = NULL; + + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); + + if (tree_height > 0) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - *p_cpos = ccount = 0; + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; - if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) - return -EINVAL; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } - if ((v_cpos + count) > em->em_clusters) { + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a seperate + * field. 
*/ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + if (hole_len) { + ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode), + el, eb_bh, + v_cluster, &len); + if (ret) { + mlog_errno(ret); + goto out; + } + *hole_len = len; + } + goto out_hole; + } - ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); - if (ret) - return ret; + rec = &el->l_recs[i]; - if (ent) { - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, - v_cpos, - count)) - return -ESRCH; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); - coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); - *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(ent->e_rec.e_blkno)) + - coff; + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } - if (ret_count) - *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + *ret_rec = *rec; - return 0; + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. + * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } } - - return -ENOENT; +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; } -#endif /* 0 */ +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count) { - int ret; - u64 boff; - u32 cpos, clusters; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - struct ocfs2_extent_map_entry *ent = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el, + unsigned int *extent_flags) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; struct ocfs2_extent_rec *rec; + u32 coff; - *p_blkno = 0; + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster, + &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - clusters = ocfs2_blocks_to_clusters(inode->i_sb, - (u64)count + bpc - 1); - if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { - ret = -EINVAL; - mlog_errno(ret); - return ret; - } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = 
&eb->h_list; - if ((cpos + clusters) > em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } } - ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); - if (ret) { + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; mlog_errno(ret); - return ret; + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + + if (extent_flags) + *extent_flags = rec->e_flags; } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} - if (ent) - { - rec = &ent->e_rec; +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { - ret = -ESRCH; - mlog_errno(ret); - return ret; - } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } - boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - - le32_to_cpu(rec->e_cpos)); - boff += (v_blkno & (u64)(bpc - 1)); - *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; - if (ret_count) { - *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(rec->e_clusters)) - boff; - } + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - return 0; + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; } - return -ENOENT; -} + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; -int ocfs2_extent_map_init(struct inode *inode) -{ - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + ocfs2_extent_map_insert_rec(inode, &rec); + } - em->em_extents = RB_ROOT; - em->em_clusters = 0; + if (extent_flags) + *extent_flags = flags; - return 0; +out: + brelse(di_bh); + return ret; } -/* Needs the lock */ -static void __ocfs2_extent_map_drop(struct inode *inode, - u32 new_clusters, - struct rb_node **free_head, - struct ocfs2_extent_map_entry **tail_ent) +/* + * This expects alloc_sem to be held. 
The allocation cannot change at + * all while the map is in the process of being updated. + */ +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) { - struct rb_node *node, *next; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - *free_head = NULL; + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; - ent = NULL; - node = rb_last(&em->em_extents); - while (node) - { - next = rb_prev(node); + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) - break; + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; + } - rb_erase(&ent->e_node, &em->em_extents); + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } - node->rb_right = *free_head; - *free_head = node; + *p_blkno = boff; - ent = NULL; - node = next; + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); } - /* Do we have an entry straddling new_clusters? */ - if (tail_ent) { - if (ent && - ((le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) - *tail_ent = ent; - else - *tail_ent = NULL; - } +out: + return ret; } -static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) { - struct rb_node *node; - struct ocfs2_extent_map_entry *ent; + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + id_count = le16_to_cpu(di->id2.i_data.id_count); - while (free_head) { - node = free_head; - free_head = node->rb_right; + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - kmem_cache_free(ocfs2_em_ent_cachep, ent); + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; } + + return 0; } -/* - * Remove all entries past new_clusters, inclusive of an entry that - * contains new_clusters. This is effectively a cache forget. - * - * If you want to also clip the last extent by some number of clusters, - * you need to call ocfs2_extent_map_trunc(). - * This code does not check or modify ip_clusters. 
- */ -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) { - struct rb_node *free_head = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; - spin_lock(&OCFS2_I(inode)->ip_lock); + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + down_read(&OCFS2_I(inode)->ip_alloc_sem); - if (ent) { - rb_erase(&ent->e_node, &em->em_extents); - ent->e_node.rb_right = free_head; - free_head = &ent->e_node; + /* + * Handle inline-data separately. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; } - spin_unlock(&OCFS2_I(inode)->ip_lock); + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + mapping_end -= cpos; + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } -/* - * Remove all entries past new_clusters and also clip any extent - * straddling new_clusters, if there is one. 
This does not check - * or modify ip_clusters - */ -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map_entry *ent = NULL; + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } - spin_lock(&OCFS2_I(inode)->ip_lock); + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } - if (ent) - ent->e_rec.e_clusters = cpu_to_le32(new_clusters - - le32_to_cpu(ent->e_rec.e_cpos)); + if (ret > 0) + ret = 0; - OCFS2_I(inode)->ip_map.em_clusters = new_clusters; +out_unlock: + brelse(di_bh); - spin_unlock(&OCFS2_I(inode)->ip_lock); + up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); + ocfs2_inode_unlock(inode, 0); +out: - return 0; + return ret; } -int __init init_ocfs2_extent_maps(void) +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) { - ocfs2_em_ent_cachep = - kmem_cache_create("ocfs2_em_ent", - sizeof(struct ocfs2_extent_map_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!ocfs2_em_ent_cachep) - return -ENOMEM; + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " + "flags = %x, validate = %p)\n", + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } - return 0; -} + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } -void exit_ocfs2_extent_maps(void) -{ - kmem_cache_destroy(ocfs2_em_ent_cachep); + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count, + bhs + done, flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + mlog_exit(rc); + return rc; } + +
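
For readers reviewing the patch, the new caching scheme is small enough to model outside the kernel. The sketch below is a user-space approximation of what __ocfs2_extent_map_lookup(), ocfs2_extent_map_insert_rec() and ocfs2_extent_map_trunc() do with the per-inode item list: a lookup hit is promoted to the front, an insert first tries to merge with a cached extent and otherwise either adds an item or recycles the least recently used one, and a truncate forgets or clips anything at or past a cluster offset. The names here (em_cache, em_item, EM_MAX_ITEMS) and the array representation are invented for illustration only; the kernel code uses a struct list_head protected by ip_lock.

/* Hypothetical user-space model of the ocfs2 extent map cache; not kernel code. */
#include <stdio.h>
#include <string.h>

#define EM_MAX_ITEMS 3			/* stands in for OCFS2_MAX_EXTENT_MAP_ITEMS */

struct em_item {
	unsigned int cpos;		/* first logical cluster covered */
	unsigned int phys;		/* first physical cluster */
	unsigned int clusters;		/* extent length, in clusters */
};

struct em_cache {
	int n;
	struct em_item items[EM_MAX_ITEMS];	/* items[0] is most recently used */
};

/* Move items[i] to the front, mirroring list_move() onto em_list. */
static void em_touch(struct em_cache *em, int i)
{
	struct em_item tmp = em->items[i];
	memmove(&em->items[1], &em->items[0], i * sizeof(tmp));
	em->items[0] = tmp;
}

/* Lookup: linear scan, a hit is promoted to the head.  Returns 0 on a hit. */
static int em_lookup(struct em_cache *em, unsigned int cpos,
		     unsigned int *phys, unsigned int *len)
{
	for (int i = 0; i < em->n; i++) {
		struct em_item *it = &em->items[i];
		if (cpos >= it->cpos && cpos < it->cpos + it->clusters) {
			unsigned int coff = cpos - it->cpos;
			*phys = it->phys + coff;
			*len = it->clusters - coff;
			em_touch(em, i);
			return 0;
		}
	}
	return -1;			/* the kernel returns -ENOENT here */
}

/* Insert: merge with a contiguous cached extent when possible, otherwise
 * add a new item or recycle the least recently used slot. */
static void em_insert(struct em_cache *em, struct em_item ins)
{
	for (int i = 0; i < em->n; i++) {
		struct em_item *it = &em->items[i];
		if (ins.cpos == it->cpos + it->clusters &&
		    ins.phys == it->phys + it->clusters) {
			it->clusters += ins.clusters;	/* right-contiguous merge */
			em_touch(em, i);
			return;
		}
	}
	if (em->n < EM_MAX_ITEMS)
		em->items[em->n++] = ins;	/* room left: new item */
	else
		em->items[em->n - 1] = ins;	/* full: overwrite the LRU tail */
	em_touch(em, em->n - 1);
}

/* Truncate: forget everything at or beyond cpos, clip a straddling extent. */
static void em_trunc(struct em_cache *em, unsigned int cpos)
{
	int kept = 0;

	for (int i = 0; i < em->n; i++) {
		struct em_item it = em->items[i];
		if (it.cpos >= cpos)
			continue;		/* full truncate of this item */
		if (it.cpos + it.clusters > cpos)
			it.clusters = cpos - it.cpos;	/* partial truncate */
		em->items[kept++] = it;
	}
	em->n = kept;
}

int main(void)
{
	struct em_cache em = { 0 };
	unsigned int phys, len;

	em_insert(&em, (struct em_item){ .cpos = 0, .phys = 100, .clusters = 8 });
	em_insert(&em, (struct em_item){ .cpos = 8, .phys = 108, .clusters = 4 });	/* merges */
	if (!em_lookup(&em, 10, &phys, &len))
		printf("cpos 10 -> phys %u, %u cached clusters\n", phys, len);	/* 110, 2 */
	em_trunc(&em, 6);
	printf("after trunc(6): %d item(s), %u clusters\n", em.n, em.items[0].clusters);
	return 0;
}

Keeping the cache this small is what lets the patch get away with linear scans under a spinlock instead of the old per-inode rb-tree; as the header comment in the patch notes, the algorithms would have to get smarter if the item count ever grew.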
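
The virtual-to-physical translation in ocfs2_extent_map_get_blocks() reduces to a cluster lookup plus offset arithmetic: cpos = v_blkno / blocks-per-cluster, and the block offset within the cluster is v_blkno & (bpc - 1). A minimal worked example, assuming 4 KB blocks and 32 KB clusters (bpc = 8) and a made-up physical cluster; none of these numbers come from the patch:

/* Illustrative arithmetic only; constants and the mapped cluster are assumed. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t bpc = 8;		/* blocks per cluster (32K / 4K) */
	uint64_t v_blkno = 21;		/* virtual block to translate */

	uint64_t cpos = v_blkno / bpc;	/* virtual cluster: 2 */
	uint64_t p_cluster = 1000;	/* pretend ocfs2_get_clusters() mapped cpos here */

	/* Same math as ocfs2_extent_map_get_blocks(): cluster base plus the
	 * block offset within the cluster, v_blkno & (bpc - 1). */
	uint64_t p_blkno = p_cluster * bpc + (v_blkno & (bpc - 1));

	printf("v_blkno %llu -> cpos %llu -> p_blkno %llu\n",
	       (unsigned long long)v_blkno, (unsigned long long)cpos,
	       (unsigned long long)p_blkno);	/* 21 -> 2 -> 8005 */
	return 0;
}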