/*
- * linux/fs/commit.c
+ * linux/fs/jbd/commit.c
*
* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
*
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
-#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
+#include <linux/bio.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
/*
* When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock. The caller provided us with a ref
+ * against the buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
}
/*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+ if (buffer_freed(bh)) {
+ clear_buffer_freed(bh);
+ release_buffer_page(bh);
+ } else
+ put_bh(bh);
+}
+
+/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
* held. For ranking reasons we must trylock. If we lose, schedule away and
* return 0. j_list_lock is dropped in this case.
{
struct journal_head *descriptor;
struct buffer_head *bh;
- int i, ret;
+ journal_header_t *header;
+ int ret;
int barrier_done = 0;
if (is_journal_aborted(journal))
bh = jh2bh(descriptor);
- /* AKPM: buglet - add `i' to tmp! */
- for (i = 0; i < bh->b_size; i += 512) {
- journal_header_t *tmp = (journal_header_t*)bh->b_data;
- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
- }
+ header = (journal_header_t *)(bh->b_data);
+ header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+ header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+ header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
JBUFFER_TRACE(descriptor, "write commit block");
set_buffer_dirty(bh);
barrier_done = 1;
}
ret = sync_dirty_buffer(bh);
+ if (barrier_done)
+ clear_buffer_ordered(bh);
/* is it possible for another commit to fail at roughly
* the same time as this one? If so, we don't want to
* trust the barrier flag in the super, but instead want
spin_unlock(&journal->j_state_lock);
/* And try again, without the barrier */
- clear_buffer_ordered(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
ret = sync_dirty_buffer(bh);
return (ret == -EIO);
}
+static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
+ int write_op)
+{
+ int i;
+
+ for (i = 0; i < bufs; i++) {
+ wbuf[i]->b_end_io = end_buffer_write_sync;
+ /* We use-up our safety reference in submit_bh() */
+ submit_bh(write_op, wbuf[i]);
+ }
+}
+
+/*
+ * Submit all the data buffers to disk
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction,
+ int write_op)
+{
+ struct journal_head *jh;
+ struct buffer_head *bh;
+ int locked;
+ int bufs = 0;
+ struct buffer_head **wbuf = journal->j_wbuf;
+ int err = 0;
+
+ /*
+ * Whenever we unlock the journal and sleep, things can get added
+ * onto ->t_sync_datalist, so we have to keep looping back to
+ * write_out_data until we *know* that the list is empty.
+ *
+ * Cleanup any flushed data buffers from the data list. Even in
+ * abort mode, we want to flush this out as soon as possible.
+ */
+write_out_data:
+ cond_resched();
+ spin_lock(&journal->j_list_lock);
+
+ while (commit_transaction->t_sync_datalist) {
+ jh = commit_transaction->t_sync_datalist;
+ bh = jh2bh(jh);
+ locked = 0;
+
+ /* Get reference just to make sure buffer does not disappear
+ * when we are forced to drop various locks */
+ get_bh(bh);
+ /* If the buffer is dirty, we need to submit IO and hence
+ * we need the buffer lock. We try to lock the buffer without
+ * blocking. If we fail, we need to drop j_list_lock and do
+ * blocking lock_buffer().
+ */
+ if (buffer_dirty(bh)) {
+ if (!trylock_buffer(bh)) {
+ BUFFER_TRACE(bh, "needs blocking lock");
+ spin_unlock(&journal->j_list_lock);
+ /* Write out all data to prevent deadlocks */
+ journal_do_submit_data(wbuf, bufs, write_op);
+ bufs = 0;
+ lock_buffer(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ locked = 1;
+ }
+ /* We have to get bh_state lock. Again out of order, sigh. */
+ if (!inverted_lock(journal, bh)) {
+ jbd_lock_bh_state(bh);
+ spin_lock(&journal->j_list_lock);
+ }
+ /* Someone already cleaned up the buffer? */
+ if (!buffer_jbd(bh) || bh2jh(bh) != jh
+ || jh->b_transaction != commit_transaction
+ || jh->b_jlist != BJ_SyncData) {
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "already cleaned up");
+ release_data_buffer(bh);
+ continue;
+ }
+ if (locked && test_clear_buffer_dirty(bh)) {
+ BUFFER_TRACE(bh, "needs writeout, adding to array");
+ wbuf[bufs++] = bh;
+ __journal_file_buffer(jh, commit_transaction,
+ BJ_Locked);
+ jbd_unlock_bh_state(bh);
+ if (bufs == journal->j_wbufsize) {
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs, write_op);
+ bufs = 0;
+ goto write_out_data;
+ }
+ } else if (!locked && buffer_locked(bh)) {
+ __journal_file_buffer(jh, commit_transaction,
+ BJ_Locked);
+ jbd_unlock_bh_state(bh);
+ put_bh(bh);
+ } else {
+ BUFFER_TRACE(bh, "writeout complete: unfile");
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
+ __journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ if (locked)
+ unlock_buffer(bh);
+ journal_remove_journal_head(bh);
+ /* One for our safety reference, other for
+ * journal_remove_journal_head() */
+ put_bh(bh);
+ release_data_buffer(bh);
+ }
+
+ if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
+ spin_unlock(&journal->j_list_lock);
+ goto write_out_data;
+ }
+ }
+ spin_unlock(&journal->j_list_lock);
+ journal_do_submit_data(wbuf, bufs, write_op);
+
+ return err;
+}
+
/*
* journal_commit_transaction
*
int bufs;
int flags;
int err;
- unsigned long blocknr;
+ unsigned int blocknr;
+ ktime_t start_time;
+ u64 commit_time;
char *tagp = NULL;
journal_header_t *header;
journal_block_tag_t *tag = NULL;
int first_tag = 0;
int tag_flag;
int i;
+ int write_op = WRITE;
/*
* First job: lock down the current transaction and wait for
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ /*
+ * Use plugged writes here, since we want to submit several before
+ * we unplug the device. We don't do explicit unplugging in here,
+ * instead we rely on sync_buffer() doing the unplug for us.
+ */
+ if (commit_transaction->t_synchronous_commit)
+ write_op = WRITE_SYNC_PLUG;
spin_lock(&commit_transaction->t_handle_lock);
while (commit_transaction->t_updates) {
DEFINE_WAIT(wait);
struct buffer_head *bh = jh2bh(jh);
jbd_lock_bh_state(bh);
- kfree(jh->b_committed_data);
+ jbd_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
jbd_unlock_bh_state(bh);
}
commit_transaction->t_state = T_FLUSH;
journal->j_committing_transaction = commit_transaction;
journal->j_running_transaction = NULL;
+ start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
wake_up(&journal->j_wait_transaction_locked);
spin_unlock(&journal->j_state_lock);
jbd_debug (3, "JBD: commit phase 2\n");
/*
- * First, drop modified flag: all accesses to the buffers
- * will be tracked for a new trasaction only -bzzz
- */
- spin_lock(&journal->j_list_lock);
- if (commit_transaction->t_buffers) {
- new_jh = jh = commit_transaction->t_buffers->b_tnext;
- do {
- J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
- new_jh->b_modified == 0);
- new_jh->b_modified = 0;
- new_jh = new_jh->b_tnext;
- } while (new_jh != jh);
- }
- spin_unlock(&journal->j_list_lock);
-
- /*
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
-
- err = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- */
- bufs = 0;
- /*
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
- spin_lock(&journal->j_list_lock);
-
- while (commit_transaction->t_sync_datalist) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_sync_datalist;
- commit_transaction->t_sync_datalist = jh->b_tnext;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- BUFFER_TRACE(bh, "locked");
- if (!inverted_lock(journal, bh))
- goto write_out_data;
- __journal_temp_unlink_buffer(jh);
- __journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (lock_need_resched(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
- }
- } else {
- if (buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "start journal writeout");
- get_bh(bh);
- wbuf[bufs++] = bh;
- if (bufs == journal->j_wbufsize) {
- jbd_debug(2, "submit %d writes\n",
- bufs);
- spin_unlock(&journal->j_list_lock);
- ll_rw_block(SWRITE, bufs, wbuf);
- journal_brelse_array(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- if (!inverted_lock(journal, bh))
- goto write_out_data;
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- if (lock_need_resched(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
- }
- }
- }
- }
-
- if (bufs) {
- spin_unlock(&journal->j_list_lock);
- ll_rw_block(SWRITE, bufs, wbuf);
- journal_brelse_array(wbuf, bufs);
- spin_lock(&journal->j_list_lock);
- }
+ err = journal_submit_data_buffers(journal, commit_transaction,
+ write_op);
/*
* Wait for all previously submitted IO to complete.
*/
+ spin_lock(&journal->j_list_lock);
while (commit_transaction->t_locked_list) {
struct buffer_head *bh;
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
spin_lock(&journal->j_list_lock);
}
+ if (unlikely(!buffer_uptodate(bh))) {
+ if (!trylock_page(bh->b_page)) {
+ spin_unlock(&journal->j_list_lock);
+ lock_page(bh->b_page);
+ spin_lock(&journal->j_list_lock);
+ }
+ if (bh->b_page->mapping)
+ set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+ unlock_page(bh->b_page);
+ SetPageError(bh->b_page);
+ err = -EIO;
+ }
if (!inverted_lock(journal, bh)) {
put_bh(bh);
spin_lock(&journal->j_list_lock);
continue;
}
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+ if (buffer_jbd(bh) && bh2jh(bh) == jh &&
+ jh->b_transaction == commit_transaction &&
+ jh->b_jlist == BJ_Locked) {
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
journal_remove_journal_head(bh);
} else {
jbd_unlock_bh_state(bh);
}
- put_bh(bh);
+ release_data_buffer(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
- if (err)
- __journal_abort_hard(journal);
+ if (err) {
+ char b[BDEVNAME_SIZE];
- journal_write_revoke_records(journal, commit_transaction);
+ printk(KERN_WARNING
+ "JBD: Detected IO errors while flushing file data "
+ "on %s\n", bdevname(journal->j_fs_dev, b));
+ if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+ journal_abort(journal, err);
+ err = 0;
+ }
- jbd_debug(3, "JBD: commit phase 2\n");
+ journal_write_revoke_records(journal, commit_transaction, write_op);
/*
* If we found any dirty or locked buffers, then we should have
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
+
+ J_ASSERT(commit_transaction->t_nr_buffers <=
+ commit_transaction->t_outstanding_credits);
descriptor = NULL;
bufs = 0;
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
- release it for background writing. */
+ release it. */
if (is_journal_aborted(journal)) {
+ clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
descriptor = journal_get_descriptor_buffer(journal);
if (!descriptor) {
- __journal_abort_hard(journal);
+ journal_abort(journal, -EIO);
continue;
}
and repeat this loop: we'll fall into the
refile-on-abort condition above. */
if (err) {
- __journal_abort_hard(journal);
+ journal_abort(journal, err);
continue;
}
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE, bh);
+ submit_bh(write_op, bh);
}
cond_resched();
/* AKPM: bforget here */
}
+ if (err)
+ journal_abort(journal, err);
+
jbd_debug(3, "JBD: commit phase 6\n");
+ /* All metadata is written, now write commit record and do cleanup */
+ spin_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ commit_transaction->t_state = T_COMMIT_RECORD;
+ spin_unlock(&journal->j_state_lock);
+
if (journal_write_commit_record(journal, commit_transaction))
err = -EIO;
if (err)
- __journal_abort_hard(journal);
+ journal_abort(journal, err);
/* End of a transaction! Finally, we can do checkpoint
processing: any buffers committed as a result of this
* Otherwise, we can just throw away the frozen data now.
*/
if (jh->b_committed_data) {
- kfree(jh->b_committed_data);
+ jbd_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
}
} else if (jh->b_frozen_data) {
- kfree(jh->b_frozen_data);
+ jbd_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
}
/* A buffer which has been freed while still being
* journaled by a previous transaction may end up still
* being dirty here, but we want to avoid writing back
- * that buffer in the future now that the last use has
- * been committed. That's not only a performance gain,
- * it also stops aliasing problems if the buffer is left
- * behind for writeback and gets reallocated for another
+ * that buffer in the future after the "add to orphan"
+ * operation been committed, That's not only a performance
+ * gain, it also stops aliasing problems if the buffer is
+ * left behind for writeback and gets reallocated for another
* use in a different page. */
- if (buffer_freed(bh)) {
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
clear_buffer_freed(bh);
clear_buffer_jbddirty(bh);
}
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
+ if (is_journal_aborted(journal))
+ clear_buffer_jbddirty(bh);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
} else {
J_ASSERT_BH(bh, !buffer_dirty(bh));
- J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh); /* needs a brelse */
- release_buffer_page(bh);
+ /* The buffer on BJ_Forget list and not jbddirty means
+ * it has been freed by this transaction and hence it
+ * could not have been reallocated until this
+ * transaction has committed. *BUT* it could be
+ * reallocated once we have written all the data to
+ * disk and before we process the buffer on BJ_Forget
+ * list. */
+ JBUFFER_TRACE(jh, "refile or unfile freed buffer");
+ __journal_refile_buffer(jh);
+ if (!jh->b_transaction) {
+ jbd_unlock_bh_state(bh);
+ /* needs a brelse */
+ journal_remove_journal_head(bh);
+ release_buffer_page(bh);
+ } else
+ jbd_unlock_bh_state(bh);
}
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
/*
- * This is a bit sleazy. We borrow j_list_lock to protect
- * journal->j_committing_transaction in __journal_remove_checkpoint.
- * Really, __journal_remove_checkpoint should be using j_state_lock but
- * it's a bit hassle to hold that across __journal_remove_checkpoint
+ * This is a bit sleazy. We use j_list_lock to protect transition
+ * of a transaction into T_FINISHED state and calling
+ * __journal_drop_transaction(). Otherwise we could race with
+ * other checkpointing code processing the transaction...
*/
spin_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
jbd_debug(3, "JBD: commit phase 8\n");
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);
commit_transaction->t_state = T_FINISHED;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
+ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+ /*
+ * weight the commit time higher than the average time so we don't
+ * react too strongly to vast changes in commit time
+ */
+ if (likely(journal->j_average_commit_time))
+ journal->j_average_commit_time = (commit_time*3 +
+ journal->j_average_commit_time) / 4;
+ else
+ journal->j_average_commit_time = commit_time;
+
spin_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&