ext4: fix lock order problem in ext4_move_extents()
authorAkira Fujita <a-fujita@rs.jp.nec.com>
Mon, 23 Nov 2009 12:24:43 +0000 (07:24 -0500)
committerTheodore Ts'o <tytso@mit.edu>
Mon, 23 Nov 2009 12:24:43 +0000 (07:24 -0500)
ext4_move_extents() checks the logical block contiguousness
of original file with ext4_find_extent() and mext_next_extent().
Therefore the extent which ext4_ext_path structure indicates
must not be changed between above functions.

But in current implementation, there is no i_data_sem protection
between ext4_ext_find_extent() and mext_next_extent().  So the extent
which ext4_ext_path structure indicates may be overwritten by
delalloc.  As a result, ext4_move_extents() will exchange wrong blocks
between original and donor files.  I change the place where
acquire/release i_data_sem to solve this problem.

Moreover, I changed move_extent_per_page() to start transaction first,
and then acquire i_data_sem.  Without this change, there is a
possibility of the deadlock between mmap() and ext4_move_extents():

* NOTE: "A", "B" and "C" mean different processes

A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes.

B:   do_page_fault() starts the transaction (T),
     and then tries to acquire i_data_sem.
     But process "A" is already holding it, so it is kept waiting.

C:   While "A" and "B" running, kjournald2 tries to commit transaction (T)
     but it is under updating, so kjournald2 waits for it.

A-2: Call ext4_journal_start with holding i_data_sem,
     but transaction (T) is locked.

Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
fs/ext4/move_extent.c

index 83f8c9e..a7410b3 100644 (file)
@@ -77,12 +77,14 @@ static int
 mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                      struct ext4_extent **extent)
 {
+       struct ext4_extent_header *eh;
        int ppos, leaf_ppos = path->p_depth;
 
        ppos = leaf_ppos;
        if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
                /* leaf block */
                *extent = ++path[ppos].p_ext;
+               path[ppos].p_block = ext_pblock(path[ppos].p_ext);
                return 0;
        }
 
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
                                        ext_block_hdr(path[cur_ppos+1].p_bh);
                        }
 
+                       path[leaf_ppos].p_ext = *extent = NULL;
+
+                       eh = path[leaf_ppos].p_hdr;
+                       if (le16_to_cpu(eh->eh_entries) == 0)
+                               /* empty leaf is found */
+                               return -ENODATA;
+
                        /* leaf block */
                        path[leaf_ppos].p_ext = *extent =
                                EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+                       path[leaf_ppos].p_block =
+                                       ext_pblock(path[leaf_ppos].p_ext);
                        return 0;
                }
        }
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
 }
 
 /**
- * mext_double_down_read - Acquire two inodes' read semaphore
- *
- * @orig_inode:                original inode structure
- * @donor_inode:       donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-       struct inode *first = orig_inode, *second = donor_inode;
-
-       /*
-        * Use the inode number to provide the stable locking order instead
-        * of its address, because the C language doesn't guarantee you can
-        * compare pointers that don't come from the same array.
-        */
-       if (donor_inode->i_ino < orig_inode->i_ino) {
-               first = donor_inode;
-               second = orig_inode;
-       }
-
-       down_read(&EXT4_I(first)->i_data_sem);
-       down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
  *
  * @orig_inode:                original inode structure
  * @donor_inode:       donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
  */
 static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
        struct inode *first = orig_inode, *second = donor_inode;
 
@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 }
 
 /**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
  *
  * @orig_inode:                original inode structure to be released its lock first
  * @donor_inode:       donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
  */
 static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
-       up_read(&EXT4_I(orig_inode)->i_data_sem);
-       up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode:                original inode structure to be released its lock first
- * @donor_inode:       donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
 {
        up_write(&EXT4_I(orig_inode)->i_data_sem);
        up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        int replaced_count = 0;
        int dext_alen;
 
-       mext_double_down_write(orig_inode, donor_inode);
-
        /* Get the original extent for the block "orig_off" */
        *err = get_ext_path(orig_inode, orig_off, &orig_path);
        if (*err)
@@ -785,7 +755,6 @@ out:
                kfree(donor_path);
        }
 
-       mext_double_up_write(orig_inode, donor_inode);
        return replaced_count;
 }
 
@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
         * Just swap data blocks between orig and donor.
         */
        if (uninit) {
+               /*
+                * Protect extent trees against block allocations
+                * via delalloc
+                */
+               double_down_write_data_sem(orig_inode, donor_inode);
                replaced_count = mext_replace_branches(handle, orig_inode,
                                                donor_inode, orig_blk_offset,
                                                block_len_in_page, err);
@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                /* Clear the inode cache not to refer to the old data */
                ext4_ext_invalidate_cache(orig_inode);
                ext4_ext_invalidate_cache(donor_inode);
+               double_up_write_data_sem(orig_inode, donor_inode);
                goto out2;
        }
 
@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
        /* Release old bh and drop refs */
        try_to_release_page(page, 0);
 
+       /* Protect extent trees against block allocations via delalloc */
+       double_down_write_data_sem(orig_inode, donor_inode);
        replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
                                        orig_blk_offset, block_len_in_page,
                                        &err2);
@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
                        block_len_in_page = replaced_count;
                        replaced_size =
                                block_len_in_page << orig_inode->i_blkbits;
-               } else
+               } else {
+                       double_up_write_data_sem(orig_inode, donor_inode);
                        goto out;
+               }
        }
 
        /* Clear the inode cache not to refer to the old data */
        ext4_ext_invalidate_cache(orig_inode);
        ext4_ext_invalidate_cache(donor_inode);
 
+       double_up_write_data_sem(orig_inode, donor_inode);
+
        if (!page_has_buffers(page))
                create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
 
@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                return -EINVAL;
        }
 
-       /* protect orig and donor against a truncate */
+       /* Protect orig and donor inodes against a truncate */
        ret1 = mext_inode_double_lock(orig_inode, donor_inode);
        if (ret1 < 0)
                return ret1;
 
-       mext_double_down_read(orig_inode, donor_inode);
+       /* Protect extent tree against block allocations via delalloc */
+       double_down_write_data_sem(orig_inode, donor_inode);
        /* Check the filesystem environment whether move_extent can be done */
        ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
                                        donor_start, &len, *moved_len);
-       mext_double_up_read(orig_inode, donor_inode);
        if (ret1)
                goto out;
 
@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                         ext4_ext_get_actual_len(ext_cur), block_end + 1) -
                     max(le32_to_cpu(ext_cur->ee_block), block_start);
 
+       /* Discard preallocations of two inodes */
+       ext4_discard_preallocations(orig_inode);
+       ext4_discard_preallocations(donor_inode);
+
        while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
                seq_blocks += add_blocks;
 
@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                seq_start = le32_to_cpu(ext_cur->ee_block);
                rest_blocks = seq_blocks;
 
-               /* Discard preallocations of two inodes */
-               down_write(&EXT4_I(orig_inode)->i_data_sem);
-               ext4_discard_preallocations(orig_inode);
-               up_write(&EXT4_I(orig_inode)->i_data_sem);
-
-               down_write(&EXT4_I(donor_inode)->i_data_sem);
-               ext4_discard_preallocations(donor_inode);
-               up_write(&EXT4_I(donor_inode)->i_data_sem);
+               /*
+                * Up semaphore to avoid following problems:
+                * a. transaction deadlock among ext4_journal_start,
+                *    ->write_begin via pagefault, and jbd2_journal_commit
+                * b. racing with ->readpage, ->write_begin, and ext4_get_block
+                *    in move_extent_per_page
+                */
+               double_up_write_data_sem(orig_inode, donor_inode);
 
                while (orig_page_offset <= seq_end_page) {
 
@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                        /* Count how many blocks we have exchanged */
                        *moved_len += block_len_in_page;
                        if (ret1 < 0)
-                               goto out;
+                               break;
                        if (*moved_len > len) {
                                ext4_error(orig_inode->i_sb, __func__,
                                        "We replaced blocks too much! "
                                        "sum of replaced: %llu requested: %llu",
                                        *moved_len, len);
                                ret1 = -EIO;
-                               goto out;
+                               break;
                        }
 
                        orig_page_offset++;
@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                                block_len_in_page = rest_blocks;
                }
 
+               double_down_write_data_sem(orig_inode, donor_inode);
+               if (ret1 < 0)
+                       break;
+
                /* Decrease buffer counter */
                if (holecheck_path)
                        ext4_ext_drop_refs(holecheck_path);
@@ -1429,7 +1418,7 @@ out:
                ext4_ext_drop_refs(holecheck_path);
                kfree(holecheck_path);
        }
-
+       double_up_write_data_sem(orig_inode, donor_inode);
        ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
        if (ret1)