string: factorize skip_spaces and export it to be generally available
[safe/jmp/linux-2.6] / fs / jbd / commit.c
index a003d50..4bd8825 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -36,7 +37,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 
 /*
  * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
  * After the transaction commits, these pages are left on the LRU, with no
  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  * by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -45,8 +46,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  * So here, we have a buffer which has just come off the forget list.  Look to
  * see if we can strip all buffers from the backing page.
  *
- * Called under lock_journal(), and possibly under journal_datalist_lock.  The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock.  The caller provided us with a ref
+ * against the buffer, and we drop that here.
  */
 static void release_buffer_page(struct buffer_head *bh)
 {
@@ -63,7 +64,7 @@ static void release_buffer_page(struct buffer_head *bh)
                goto nope;
 
        /* OK, it's a truncated page */
-       if (TestSetPageLocked(page))
+       if (!trylock_page(page))
                goto nope;
 
        page_cache_get(page);
@@ -78,6 +79,19 @@ nope:
 }
 
 /*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+       if (buffer_freed(bh)) {
+               clear_buffer_freed(bh);
+               release_buffer_page(bh);
+       } else
+               put_bh(bh);
+}
+
+/*
  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
  * return 0.  j_list_lock is dropped in this case.
@@ -104,7 +118,8 @@ static int journal_write_commit_record(journal_t *journal,
 {
        struct journal_head *descriptor;
        struct buffer_head *bh;
-       int i, ret;
+       journal_header_t *header;
+       int ret;
        int barrier_done = 0;
 
        if (is_journal_aborted(journal))
@@ -116,13 +131,10 @@ static int journal_write_commit_record(journal_t *journal,
 
        bh = jh2bh(descriptor);
 
-       /* AKPM: buglet - add `i' to tmp! */
-       for (i = 0; i < bh->b_size; i += 512) {
-               journal_header_t *tmp = (journal_header_t*)bh->b_data;
-               tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
-               tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
-               tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
-       }
+       header = (journal_header_t *)(bh->b_data);
+       header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+       header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+       header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 
        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
@@ -131,6 +143,8 @@ static int journal_write_commit_record(journal_t *journal,
                barrier_done = 1;
        }
        ret = sync_dirty_buffer(bh);
+       if (barrier_done)
+               clear_buffer_ordered(bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
@@ -148,7 +162,6 @@ static int journal_write_commit_record(journal_t *journal,
                spin_unlock(&journal->j_state_lock);
 
                /* And try again, without the barrier */
-               clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = sync_dirty_buffer(bh);
@@ -159,28 +172,31 @@ static int journal_write_commit_record(journal_t *journal,
        return (ret == -EIO);
 }
 
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+                                  int write_op)
 {
        int i;
 
        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use-up our safety reference in submit_bh() */
-               submit_bh(WRITE, wbuf[i]);
+               submit_bh(write_op, wbuf[i]);
        }
 }
 
 /*
  *  Submit all the data buffers to disk
  */
-static void journal_submit_data_buffers(journal_t *journal,
-                               transaction_t *commit_transaction)
+static int journal_submit_data_buffers(journal_t *journal,
+                                      transaction_t *commit_transaction,
+                                      int write_op)
 {
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;
+       int err = 0;
 
        /*
         * Whenever we unlock the journal and sleep, things can get added
@@ -208,11 +224,11 @@ write_out_data:
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
-                       if (test_set_buffer_locked(bh)) {
+                       if (!trylock_buffer(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
-                               journal_do_submit_data(wbuf, bufs);
+                               journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
@@ -225,14 +241,14 @@ write_out_data:
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
-               if (!buffer_jbd(bh)
+               if (!buffer_jbd(bh) || bh2jh(bh) != jh
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
-                       put_bh(bh);
+                       release_data_buffer(bh);
                        continue;
                }
                if (locked && test_clear_buffer_dirty(bh)) {
@@ -243,7 +259,7 @@ write_out_data:
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
-                               journal_do_submit_data(wbuf, bufs);
+                               journal_do_submit_data(wbuf, bufs, write_op);
                                bufs = 0;
                                goto write_out_data;
                        }
@@ -254,24 +270,28 @@ write_out_data:
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
+                       if (unlikely(!buffer_uptodate(bh)))
+                               err = -EIO;
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        journal_remove_journal_head(bh);
-                       /* Once for our safety reference, once for
+                       /* One for our safety reference, other for
                         * journal_remove_journal_head() */
                        put_bh(bh);
-                       put_bh(bh);
+                       release_data_buffer(bh);
                }
 
-               if (lock_need_resched(&journal->j_list_lock)) {
+               if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
-       journal_do_submit_data(wbuf, bufs);
+       journal_do_submit_data(wbuf, bufs, write_op);
+
+       return err;
 }
 
 /*
@@ -288,7 +308,9 @@ void journal_commit_transaction(journal_t *journal)
        int bufs;
        int flags;
        int err;
-       unsigned long blocknr;
+       unsigned int blocknr;
+       ktime_t start_time;
+       u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
@@ -296,6 +318,7 @@ void journal_commit_transaction(journal_t *journal)
        int first_tag = 0;
        int tag_flag;
        int i;
+       int write_op = WRITE;
 
        /*
         * First job: lock down the current transaction and wait for
@@ -328,6 +351,13 @@ void journal_commit_transaction(journal_t *journal)
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
 
+       /*
+        * Use plugged writes here, since we want to submit several before
+        * we unplug the device. We don't do explicit unplugging in here,
+        * instead we rely on sync_buffer() doing the unplug for us.
+        */
+       if (commit_transaction->t_synchronous_commit)
+               write_op = WRITE_SYNC_PLUG;
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);
@@ -375,7 +405,7 @@ void journal_commit_transaction(journal_t *journal)
                        struct buffer_head *bh = jh2bh(jh);
 
                        jbd_lock_bh_state(bh);
-                       jbd_slab_free(jh->b_committed_data, bh->b_size);
+                       jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
@@ -401,6 +431,7 @@ void journal_commit_transaction(journal_t *journal)
        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
+       start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);
@@ -408,27 +439,11 @@ void journal_commit_transaction(journal_t *journal)
        jbd_debug (3, "JBD: commit phase 2\n");
 
        /*
-        * First, drop modified flag: all accesses to the buffers
-        * will be tracked for a new trasaction only -bzzz
-        */
-       spin_lock(&journal->j_list_lock);
-       if (commit_transaction->t_buffers) {
-               new_jh = jh = commit_transaction->t_buffers->b_tnext;
-               do {
-                       J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
-                                       new_jh->b_modified == 0);
-                       new_jh->b_modified = 0;
-                       new_jh = new_jh->b_tnext;
-               } while (new_jh != jh);
-       }
-       spin_unlock(&journal->j_list_lock);
-
-       /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
-       err = 0;
-       journal_submit_data_buffers(journal, commit_transaction);
+       err = journal_submit_data_buffers(journal, commit_transaction,
+                                         write_op);
 
        /*
         * Wait for all previously submitted IO to complete.
@@ -443,16 +458,29 @@ void journal_commit_transaction(journal_t *journal)
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
-                       if (unlikely(!buffer_uptodate(bh)))
-                               err = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
+               if (unlikely(!buffer_uptodate(bh))) {
+                       if (!trylock_page(bh->b_page)) {
+                               spin_unlock(&journal->j_list_lock);
+                               lock_page(bh->b_page);
+                               spin_lock(&journal->j_list_lock);
+                       }
+                       if (bh->b_page->mapping)
+                               set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+                       unlock_page(bh->b_page);
+                       SetPageError(bh->b_page);
+                       err = -EIO;
+               }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
-               if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+               if (buffer_jbd(bh) && bh2jh(bh) == jh &&
+                   jh->b_transaction == commit_transaction &&
+                   jh->b_jlist == BJ_Locked) {
                        __journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        journal_remove_journal_head(bh);
@@ -460,17 +488,23 @@ void journal_commit_transaction(journal_t *journal)
                } else {
                        jbd_unlock_bh_state(bh);
                }
-               put_bh(bh);
+               release_data_buffer(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
 
-       if (err)
-               __journal_abort_hard(journal);
+       if (err) {
+               char b[BDEVNAME_SIZE];
 
-       journal_write_revoke_records(journal, commit_transaction);
+               printk(KERN_WARNING
+                       "JBD: Detected IO errors while flushing file data "
+                       "on %s\n", bdevname(journal->j_fs_dev, b));
+               if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+                       journal_abort(journal, err);
+               err = 0;
+       }
 
-       jbd_debug(3, "JBD: commit phase 2\n");
+       journal_write_revoke_records(journal, commit_transaction, write_op);
 
        /*
         * If we found any dirty or locked buffers, then we should have
@@ -487,7 +521,12 @@ void journal_commit_transaction(journal_t *journal)
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
+       spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
+       spin_unlock(&journal->j_state_lock);
+
+       J_ASSERT(commit_transaction->t_nr_buffers <=
+                commit_transaction->t_outstanding_credits);
 
        descriptor = NULL;
        bufs = 0;
@@ -498,9 +537,10 @@ void journal_commit_transaction(journal_t *journal)
                jh = commit_transaction->t_buffers;
 
                /* If we're in abort mode, we just un-journal the buffer and
-                  release it for background writing. */
+                  release it. */
 
                if (is_journal_aborted(journal)) {
+                       clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
@@ -524,7 +564,7 @@ void journal_commit_transaction(journal_t *journal)
 
                        descriptor = journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
-                               __journal_abort_hard(journal);
+                               journal_abort(journal, -EIO);
                                continue;
                        }
 
@@ -557,7 +597,7 @@ void journal_commit_transaction(journal_t *journal)
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
-                       __journal_abort_hard(journal);
+                       journal_abort(journal, err);
                        continue;
                }
 
@@ -634,7 +674,7 @@ start_journal_io:
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
-                               submit_bh(WRITE, bh);
+                               submit_bh(write_op, bh);
                        }
                        cond_resched();
 
@@ -742,13 +782,16 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
 
+       if (err)
+               journal_abort(journal, err);
+
        jbd_debug(3, "JBD: commit phase 6\n");
 
        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;
 
        if (err)
-               __journal_abort_hard(journal);
+               journal_abort(journal, err);
 
        /* End of a transaction!  Finally, we can do checkpoint
            processing: any buffers committed as a result of this
@@ -792,14 +835,14 @@ restart_loop:
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
-                       jbd_slab_free(jh->b_committed_data, bh->b_size);
+                       jbd_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
-                       jbd_slab_free(jh->b_frozen_data, bh->b_size);
+                       jbd_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }
 
@@ -832,6 +875,8 @@ restart_loop:
                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __journal_insert_checkpoint(jh, commit_transaction);
+                       if (is_journal_aborted(journal))
+                               clear_buffer_jbddirty(bh);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
@@ -858,10 +903,10 @@ restart_loop:
        }
        spin_unlock(&journal->j_list_lock);
        /*
-        * This is a bit sleazy.  We borrow j_list_lock to protect
-        * journal->j_committing_transaction in __journal_remove_checkpoint.
-        * Really, __journal_remove_checkpoint should be using j_state_lock but
-        * it's a bit hassle to hold that across __journal_remove_checkpoint
+        * This is a bit sleazy.  We use j_list_lock to protect transition
+        * of a transaction into T_FINISHED state and calling
+        * __journal_drop_transaction(). Otherwise we could race with
+        * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
@@ -885,6 +930,18 @@ restart_loop:
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
+       commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+       /*
+        * weight the commit time higher than the average time so we don't
+        * react too strongly to vast changes in commit time
+        */
+       if (likely(journal->j_average_commit_time))
+               journal->j_average_commit_time = (commit_time*3 +
+                               journal->j_average_commit_time) / 4;
+       else
+               journal->j_average_commit_time = commit_time;
+
        spin_unlock(&journal->j_state_lock);
 
        if (commit_transaction->t_checkpoint_list == NULL &&