#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
-#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/bio.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
*/

/*
* When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
* the numbers in /proc/meminfo look odd.
*
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock. The caller provided us with a ref
+ * against the buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
}
/*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+ if (buffer_freed(bh)) {
+ clear_buffer_freed(bh);
+ release_buffer_page(bh);
+ } else
+ put_bh(bh);
+}
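+
+/*
+ * Note: buffer_freed() is set by journal_unmap_buffer() when a buffer
+ * is discarded (e.g. by truncate) while it still belongs to the
+ * committing transaction, so the commit code must release it here
+ * rather than write it back.
+ */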
+
+/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
* held. For ranking reasons we must trylock. If we lose, schedule away and
* return 0. j_list_lock is dropped in this case.
return (ret == -EIO);
}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+ int write_op)
{
int i;
for (i = 0; i < bufs; i++) {
wbuf[i]->b_end_io = end_buffer_write_sync;
/* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ submit_bh(write_op, wbuf[i]);
}
}
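
/*
* Note: write_op is WRITE for an ordinary commit; the caller passes a
* synchronous variant when a task is known to be waiting on this
* commit, so the block layer can prioritize the I/O.
*/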
/*
* Submit all the data buffers to disk
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction,
+ int write_op)
{
struct journal_head *jh;
struct buffer_head *bh;
int locked;
int bufs = 0;
struct buffer_head **wbuf = journal->j_wbuf;
+ int err = 0;
/*
* Whenever we unlock the journal and sleep, things can get added
* onto ->t_sync_datalist, so we have to keep looping back to
* write_out_data until we *know* that the list is empty.
*/
/*
* If the buffer is dirty, we need the buffer lock to submit its IO:
* try to take it without blocking first; if that fails, drop
* j_list_lock and do a potentially blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
+ journal_do_submit_data(wbuf, bufs, write_op);
bufs = 0;
lock_buffer(bh);
spin_lock(&journal->j_list_lock);
}
locked = 1;
}
if (!inverted_lock(journal, bh)) {
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
}
/* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
+ if (!buffer_jbd(bh) || bh2jh(bh) != jh
|| jh->b_transaction != commit_transaction
|| jh->b_jlist != BJ_SyncData) {
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
+ release_data_buffer(bh);
continue;
}
if (locked && test_clear_buffer_dirty(bh)) {
jbd_unlock_bh_state(bh);
if (bufs == journal->j_wbufsize) {
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+ journal_do_submit_data(wbuf, bufs, write_op);
bufs = 0;
goto write_out_data;
}
} else if (!locked && buffer_locked(bh)) {
__journal_file_buffer(jh, commit_transaction, BJ_Locked);
jbd_unlock_bh_state(bh);
put_bh(bh);
} else {
BUFFER_TRACE(bh, "writeout complete: unfile");
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
+ /* One for our safety reference, the other for
* journal_remove_journal_head() */
put_bh(bh);
- put_bh(bh);
+ release_data_buffer(bh);
}
if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
spin_unlock(&journal->j_list_lock);
goto write_out_data;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+ journal_do_submit_data(wbuf, bufs, write_op);
+
+ return err;
}
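
/*
* Note: the value returned above reflects data writeback errors only;
* the caller decides, once the IO has completed, whether to just warn
* or to abort the journal (see the JFS_ABORT_ON_SYNCDATA_ERR handling
* below).
*/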
/*
* journal_commit_transaction
*
* The primary function for committing a transaction to the log. This
* function is called by the journal thread to begin a complete commit.
*/
void journal_commit_transaction(journal_t *journal)
{
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
+ ktime_t start_time;
+ u64 commit_time;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
int first_tag = 0;
int tag_flag;
int i;
+ int write_op = WRITE;
/*
* First job: lock down the current transaction and wait for
* all outstanding updates to complete.
*/
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ /*
+ * Use plugged writes here, since we want to submit several before
+ * we unplug the device. We don't do explicit unplugging in here,
+ * instead we rely on sync_buffer() doing the unplug for us.
+ */
+ if (commit_transaction->t_synchronous_commit)
+ write_op = WRITE_SYNC_PLUG;
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
+ start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
jbd_debug (3, "JBD: commit phase 2\n");
/*
- * First, drop modified flag: all accesses to the buffers
- * will be tracked for a new trasaction only -bzzz
- */
- spin_lock(&journal->j_list_lock);
- if (commit_transaction->t_buffers) {
- new_jh = jh = commit_transaction->t_buffers->b_tnext;
- do {
- J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
- new_jh->b_modified == 0);
- new_jh->b_modified = 0;
- new_jh = new_jh->b_tnext;
- } while (new_jh != jh);
- }
- spin_unlock(&journal->j_list_lock);
-
- /*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
+ err = journal_submit_data_buffers(journal, commit_transaction,
+ write_op);
/*
* Wait for all previously submitted IO to complete.
*/
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
spin_lock(&journal->j_list_lock);
}
+ if (unlikely(!buffer_uptodate(bh))) {
+ if (!trylock_page(bh->b_page)) {
+ spin_unlock(&journal->j_list_lock);
+ lock_page(bh->b_page);
+ spin_lock(&journal->j_list_lock);
+ }
+ if (bh->b_page->mapping)
+ set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+ unlock_page(bh->b_page);
+ SetPageError(bh->b_page);
+ err = -EIO;
+ }
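+ /*
+ * Note: setting AS_EIO on the mapping lets a later fsync()/msync()
+ * observe the failure via filemap_fdatawait() even though the
+ * buffer itself is released below.
+ */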
if (!inverted_lock(journal, bh)) {
put_bh(bh);
spin_lock(&journal->j_list_lock);
continue;
}
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+ if (buffer_jbd(bh) && bh2jh(bh) == jh &&
+ jh->b_transaction == commit_transaction &&
+ jh->b_jlist == BJ_Locked) {
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
} else {
jbd_unlock_bh_state(bh);
}
- put_bh(bh);
+ release_data_buffer(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
- if (err)
- journal_abort(journal, err);
+ if (err) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_WARNING
+ "JBD: Detected IO errors while flushing file data "
+ "on %s\n", bdevname(journal->j_fs_dev, b));
+ if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+ journal_abort(journal, err);
+ err = 0;
+ }

- jbd_debug(3, "JBD: commit phase 2\n");

- journal_write_revoke_records(journal, commit_transaction);
+ journal_write_revoke_records(journal, commit_transaction, write_op);
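+ /*
+ * Note: err was cleared above because an ordered-data writeback
+ * failure does not endanger journal integrity by itself; only mounts
+ * with JFS_ABORT_ON_SYNCDATA_ERR escalate it to journal_abort().
+ */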
/*
* If we found any dirty or locked buffers, then we should have
* looped back up to the write_out_data label. If there weren't
* any then journal_clean_data_list should have wiped the list
* clean by now, so check that it is in fact empty.
*/
/*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
+
+ J_ASSERT(commit_transaction->t_nr_buffers <=
+ commit_transaction->t_outstanding_credits);
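+ /*
+ * Note: every journaled buffer consumes one of the credits reserved
+ * for this transaction, so t_nr_buffers can never legitimately
+ * exceed t_outstanding_credits.
+ */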
descriptor = NULL;
bufs = 0;
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
- release it for background writing. */
+ release it. */
if (is_journal_aborted(journal)) {
+ clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
* any descriptor buffers which may have been
* already allocated, even if we are now
* aborting. */
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE, bh);
+ submit_bh(write_op, bh);
}
cond_resched();
/* AKPM: bforget here */
}
+ if (err)
+ journal_abort(journal, err);
+
jbd_debug(3, "JBD: commit phase 6\n");
+ /* All metadata is written, now write commit record and do cleanup */
+ spin_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ commit_transaction->t_state = T_COMMIT_RECORD;
+ spin_unlock(&journal->j_state_lock);
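+ /*
+ * Note: the new T_COMMIT_RECORD state separates "writing the commit
+ * record" from the metadata writeout that T_COMMIT now covers.
+ */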
+
if (journal_write_commit_record(journal, commit_transaction))
err = -EIO;
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
- * that buffer in the future now that the last use has
- * been committed. That's not only a performance gain,
- * it also stops aliasing problems if the buffer is left
- * behind for writeback and gets reallocated for another
+ * that buffer in the future after the "add to orphan"
+ * operation has been committed. That's not only a performance
+ * gain, it also stops aliasing problems if the buffer is
+ * left behind for writeback and gets reallocated for another
* use in a different page. */
- if (buffer_freed(bh)) {
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
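/*
* Note: the !b_next_transaction check above deliberately leaves the
* freed state alone when a later transaction has re-journaled the
* buffer; that transaction's own commit will handle it.
*/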
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
+ if (is_journal_aborted(journal))
+ clear_buffer_jbddirty(bh);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
jbd_debug(3, "JBD: commit phase 8\n");
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
commit_transaction->t_state = T_FINISHED;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in commit time
+ */
+ if (likely(journal->j_average_commit_time))
+ journal->j_average_commit_time = (commit_time*3 +
+ journal->j_average_commit_time) / 4;
+ else
+ journal->j_average_commit_time = commit_time;
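+ /*
+ * That is, avg' = (3 * commit_time + avg) / 4: an exponentially
+ * weighted moving average with a 3/4 weight on the latest commit.
+ * journal_stop() uses this figure for its fsync batching heuristic.
+ */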
+
spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&