#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/highmem.h>
+#include <linux/hrtimer.h>
static void __journal_temp_unlink_buffer(struct journal_head *jh);
{
transaction->t_journal = journal;
transaction->t_state = T_RUNNING;
+ transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = jbd_kmalloc(sizeof(*new_transaction),
- GFP_NOFS);
+ new_transaction = kzalloc(sizeof(*new_transaction),
+ GFP_NOFS|__GFP_NOFAIL);
if (!new_transaction) {
ret = -ENOMEM;
goto out;
}
- memset(new_transaction, 0, sizeof(*new_transaction));
}
jbd_debug(3, "New handle %p going live.\n", handle);
jbd_free_handle(handle);
current->journal_info = NULL;
handle = ERR_PTR(err);
+ goto out;
}
- lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ lock_map_acquire(&handle->h_lockdep_map);
+out:
return handle;
}
/**
- * int journal_restart() - restart a handle .
+ * int journal_restart() - restart a handle.
* @handle: handle to restart
* @nblocks: nr credits requested
*
goto done;
/*
+ * this is the first time this transaction is touching this buffer,
+ * reset the modified flag
+ */
+ jh->b_modified = 0;
+
+ /*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
*/
if (!frozen_buffer) {
printk(KERN_EMERG
"%s: OOM for frozen_buffer\n",
- __FUNCTION__);
+ __func__);
JBUFFER_TRACE(jh, "oom!");
error = -ENOMEM;
jbd_lock_bh_state(bh);
* int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
*
* Returns an error code or 0 on success.
*
if (jh->b_transaction == NULL) {
jh->b_transaction = transaction;
+
+ /* first access by this transaction */
+ jh->b_modified = 0;
+
JBUFFER_TRACE(jh, "file as BJ_Reserved");
__journal_file_buffer(jh, transaction, BJ_Reserved);
} else if (jh->b_transaction == journal->j_committing_transaction) {
+ /* first access by this transaction */
+ jh->b_modified = 0;
+
JBUFFER_TRACE(jh, "set next transaction");
jh->b_next_transaction = transaction;
}
}
/**
- * int journal_get_undo_access() - Notify intent to modify metadata with
- * non-rewindable consequences
+ * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
- * @credits: store the number of taken credits here (if not NULL)
*
* Sometimes there is a need to distinguish between metadata which has
* been committed to disk and that which has not. The ext3fs code uses
committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
printk(KERN_EMERG "%s: No memory for committed data\n",
- __FUNCTION__);
+ __func__);
err = -ENOMEM;
goto out;
}
}
/**
- * int journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
+ * int journal_dirty_data() - mark a buffer as containing dirty data to be flushed
* @handle: transaction
* @bh: bufferhead to mark
*
+ * Description:
+ * Mark a buffer as containing dirty data which needs to be flushed before
+ * we can commit the current transaction.
+ *
* The buffer is placed on the transaction's data list and is marked as
* belonging to the transaction.
*
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int ret = 0;
if (is_handle_aborted(handle))
- return 0;
+ return ret;
jh = journal_add_journal_head(bh);
JBUFFER_TRACE(jh, "entry");
time if it is redirtied */
}
- /* journal_clean_data_list() may have got there first */
+ /*
+ * We cannot remove the buffer with io error from the
+ * committing transaction, because otherwise it would
+ * miss the error and the commit would not abort.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ ret = -EIO;
+ goto no_journal;
+ }
+
if (jh->b_transaction != NULL) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
}
JBUFFER_TRACE(jh, "exit");
journal_put_journal_head(jh);
- return 0;
+ return ret;
}
/**
- * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * int journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
- * mark dirty metadata which needs to be journaled as part of the current
+ * Mark dirty metadata which needs to be journaled as part of the current
* transaction.
*
* The buffer is placed on the transaction's metadata list and is marked
}
/* That test should have eliminated the following case: */
- J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+ J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
JBUFFER_TRACE(jh, "file as BJ_Metadata");
spin_lock(&journal->j_list_lock);
struct journal_head *jh;
int drop_reserve = 0;
int err = 0;
+ int was_modified = 0;
BUFFER_TRACE(bh, "entry");
goto not_jbd;
}
+ /* keep track of wether or not this transaction modified us */
+ was_modified = jh->b_modified;
+
/*
* The buffer's going from the transaction, we must drop
* all references -bzzz
JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
- drop_reserve = 1;
+ /*
+ * we only want to drop a reference if this transaction
+ * modified the buffer
+ */
+ if (was_modified)
+ drop_reserve = 1;
/*
* We are no longer going to journal this buffer.
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction == transaction);
jh->b_next_transaction = NULL;
- drop_reserve = 1;
+
+ /*
+ * only drop a reference if this transaction modified
+ * the buffer
+ */
+ if (was_modified)
+ drop_reserve = 1;
}
}
{
transaction_t *transaction = handle->h_transaction;
journal_t *journal = transaction->t_journal;
- int old_handle_count, err;
+ int err;
pid_t pid;
J_ASSERT(journal_current_handle() == handle);
* on IO anyway. Speeds up many-threaded, many-dir operations
* by 30x or more...
*
+ * We try and optimize the sleep time against what the underlying disk
+ * can do, instead of having a static sleep time. This is usefull for
+ * the case where our storage is so fast that it is more optimal to go
+ * ahead and force a flush and wait for the transaction to be committed
+ * than it is to wait for an arbitrary amount of time for new writers to
+ * join the transaction. We acheive this by measuring how long it takes
+ * to commit a transaction, and compare it with how long this
+ * transaction has been running, and if run time < commit time then we
+ * sleep for the delta and commit. This greatly helps super fast disks
+ * that would see slowdowns as more threads started doing fsyncs.
+ *
* But don't do this if this process was the most recent one to
* perform a synchronous write. We do this to detect the case where a
* single process is doing a stream of sync writes. No point in waiting
*/
pid = current->pid;
if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ u64 commit_time, trans_time;
+
journal->j_last_sync_writer = pid;
- do {
- old_handle_count = transaction->t_handle_count;
- schedule_timeout_uninterruptible(1);
- } while (old_handle_count != transaction->t_handle_count);
+
+ spin_lock(&journal->j_state_lock);
+ commit_time = journal->j_average_commit_time;
+ spin_unlock(&journal->j_state_lock);
+
+ trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+ transaction->t_start_time));
+
+ commit_time = min_t(u64, commit_time,
+ 1000*jiffies_to_usecs(1));
+
+ if (trans_time < commit_time) {
+ ktime_t expires = ktime_add_ns(ktime_get(),
+ commit_time);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+ }
}
+ if (handle->h_sync)
+ transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
spin_unlock(&journal->j_state_lock);
}
- lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+ lock_map_release(&handle->h_lockdep_map);
jbd_free_handle(handle);
return err;
}
-/**int journal_force_commit() - force any uncommitted transactions
+/**
+ * int journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* For synchronous operations: force any uncommitted transactions
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
if (jh->b_jlist != BJ_None)
- J_ASSERT_JH(jh, transaction != 0);
+ J_ASSERT_JH(jh, transaction != NULL);
switch (jh->b_jlist) {
case BJ_None:
if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
- if (jh->b_next_transaction != 0)
+ if (jh->b_next_transaction != NULL)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+ if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
/* A written-back ordered data buffer */
JBUFFER_TRACE(jh, "release data");
journal_remove_journal_head(bh);
__brelse(bh);
}
- } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
return;
}
-
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
busy:
return ret;
}
}
/**
- * void journal_invalidatepage()
- * @journal: journal to use for flush...
+ * void journal_invalidatepage() - invalidate a journal page
+ * @journal: journal to use for flush
* @page: page to flush
* @offset: length of page to invalidate.
*
* Reap page buffers containing data after offset in page.
- *
*/
void journal_invalidatepage(journal_t *journal,
struct page *page,
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
- jh->b_transaction == 0);
+ jh->b_transaction == NULL);
if (jh->b_transaction && jh->b_jlist == jlist)
return;
jh->b_transaction = jh->b_next_transaction;
jh->b_next_transaction = NULL;
__journal_file_buffer(jh, jh->b_transaction,
- was_dirty ? BJ_Metadata : BJ_Reserved);
+ jh->b_modified ? BJ_Metadata : BJ_Reserved);
J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
if (was_dirty)