jbd2: jbd2 stats through procfs
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext4-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called from the commit path; the caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (TestSetPageLocked(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
        if (!jbd_trylock_bh_state(bh)) {
                spin_unlock(&journal->j_list_lock);
                schedule();
                return 0;
        }
        return 1;
}

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction)
{
        struct journal_head *descriptor;
        struct buffer_head *bh;
        int i, ret;
        int barrier_done = 0;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        /* Replicate the commit header into each 512-byte sector of the
         * descriptor block (the old AKPM "add `i' to tmp" buglet: without
         * the offset, only the first sector was ever written). */
        for (i = 0; i < bh->b_size; i += 512) {
                journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
                tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
                tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
                tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        }

        JBUFFER_TRACE(descriptor, "write commit block");
        set_buffer_dirty(bh);
        if (journal->j_flags & JBD2_BARRIER) {
                set_buffer_ordered(bh);
                barrier_done = 1;
        }
        ret = sync_dirty_buffer(bh);
        /* is it possible for another commit to fail at roughly
         * the same time as this one?  If so, we don't want to
         * trust the barrier flag in the super, but instead want
         * to remember if we sent a barrier request
         */
        if (ret == -EOPNOTSUPP && barrier_done) {
                char b[BDEVNAME_SIZE];

                printk(KERN_WARNING
                        "JBD: barrier-based sync failed on %s - "
                        "disabling barriers\n",
                        bdevname(journal->j_dev, b));
                spin_lock(&journal->j_state_lock);
                journal->j_flags &= ~JBD2_BARRIER;
                spin_unlock(&journal->j_state_lock);

                /* And try again, without the barrier */
                clear_buffer_ordered(bh);
                set_buffer_uptodate(bh);
                set_buffer_dirty(bh);
                ret = sync_dirty_buffer(bh);
        }
        put_bh(bh);             /* One for getblk() */
        jbd2_journal_put_journal_head(descriptor);

        return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
        int i;

        for (i = 0; i < bufs; i++) {
                wbuf[i]->b_end_io = end_buffer_write_sync;
                /* We use-up our safety reference in submit_bh() */
                submit_bh(WRITE, wbuf[i]);
        }
}

/*
 *  Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
                                transaction_t *commit_transaction)
{
        struct journal_head *jh;
        struct buffer_head *bh;
        int locked;
        int bufs = 0;
        struct buffer_head **wbuf = journal->j_wbuf;

        /*
         * Whenever we unlock the journal and sleep, things can get added
         * onto ->t_sync_datalist, so we have to keep looping back to
         * write_out_data until we *know* that the list is empty.
         *
         * Cleanup any flushed data buffers from the data list.  Even in
         * abort mode, we want to flush this out as soon as possible.
         */
write_out_data:
        cond_resched();
        spin_lock(&journal->j_list_lock);

        while (commit_transaction->t_sync_datalist) {
                jh = commit_transaction->t_sync_datalist;
                bh = jh2bh(jh);
                locked = 0;

                /* Get reference just to make sure buffer does not disappear
                 * when we are forced to drop various locks */
                get_bh(bh);
                /* If the buffer is dirty, we need to submit IO and hence
                 * we need the buffer lock. We try to lock the buffer without
                 * blocking. If we fail, we need to drop j_list_lock and do
                 * blocking lock_buffer().
                 */
                if (buffer_dirty(bh)) {
                        if (test_set_buffer_locked(bh)) {
                                BUFFER_TRACE(bh, "needs blocking lock");
                                spin_unlock(&journal->j_list_lock);
                                /* Write out all data to prevent deadlocks */
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                lock_buffer(bh);
                                spin_lock(&journal->j_list_lock);
                        }
                        locked = 1;
                }
                /* We have to get bh_state lock. Again out of order, sigh. */
                if (!inverted_lock(journal, bh)) {
                        jbd_lock_bh_state(bh);
                        spin_lock(&journal->j_list_lock);
                }
                /* Someone already cleaned up the buffer? */
                if (!buffer_jbd(bh)
                        || jh->b_transaction != commit_transaction
                        || jh->b_jlist != BJ_SyncData) {
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        BUFFER_TRACE(bh, "already cleaned up");
                        put_bh(bh);
                        continue;
                }
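                /*
                 * Three cases from here: we locked a dirty buffer and
                 * will queue it for submission; the buffer is locked by
                 * someone else (IO already in flight), so just park it
                 * on BJ_Locked; or writeout has completed and the
                 * buffer can be unfiled right away.
                 */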
                if (locked && test_clear_buffer_dirty(bh)) {
                        BUFFER_TRACE(bh, "needs writeout, adding to array");
                        wbuf[bufs++] = bh;
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        if (bufs == journal->j_wbufsize) {
                                spin_unlock(&journal->j_list_lock);
                                journal_do_submit_data(wbuf, bufs);
                                bufs = 0;
                                goto write_out_data;
                        }
                } else if (!locked && buffer_locked(bh)) {
                        __jbd2_journal_file_buffer(jh, commit_transaction,
                                                BJ_Locked);
                        jbd_unlock_bh_state(bh);
                        put_bh(bh);
                } else {
                        BUFFER_TRACE(bh, "writeout complete: unfile");
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        if (locked)
                                unlock_buffer(bh);
                        jbd2_journal_remove_journal_head(bh);
                        /* Once for our safety reference, once for
                         * jbd2_journal_remove_journal_head() */
                        put_bh(bh);
                        put_bh(bh);
                }

                if (lock_need_resched(&journal->j_list_lock)) {
                        spin_unlock(&journal->j_list_lock);
                        goto write_out_data;
                }
        }
        spin_unlock(&journal->j_list_lock);
        journal_do_submit_data(wbuf, bufs);
}

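/*
 * Record a block number in an on-disk tag.  The low 32 bits always go
 * in t_blocknr; when 64-bit tags are in use (tag_bytes larger than
 * JBD2_TAG_SIZE32) the high bits go in t_blocknr_high.  The double
 * shift is a portability idiom that stays well-defined even when the
 * value being shifted is only 32 bits wide.
 */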
static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD2_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

#ifdef COMMIT_STATS
        spin_lock(&journal->j_list_lock);
        summarise_journal_usage(journal);
        spin_unlock(&journal->j_list_lock);
#endif

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                jbd2_journal_update_superblock(journal, 1);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        jbd_debug(1, "JBD: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

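        /*
         * Commit-time statistics: remember the longest time a handle
         * had to wait to start, stamp the moment the transaction was
         * locked down (rs_locked holds a timestamp until it is turned
         * into a duration below), and compute how long the transaction
         * ran before being locked.
         */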
        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                                stats.u.run.rs_locked);

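        /*
         * Wait for all outstanding handles on this transaction to
         * finish.  The locks are dropped around schedule(), so the
         * t_updates test is repeated after queueing on the waitqueue.
         */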
        spin_lock(&commit_transaction->t_handle_lock);
        while (commit_transaction->t_updates) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (commit_transaction->t_updates) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        spin_unlock(&journal->j_state_lock);
                        schedule();
                        spin_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (commit_transaction->t_outstanding_credits <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug (3, "JBD: commit phase 1\n");

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

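        /*
         * Stamp the start of the flushing phase; rs_locked now becomes
         * the time the transaction spent in the locked state.
         */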
        stats.u.run.rs_flushing = jiffies;
        stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
                                               stats.u.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        spin_unlock(&journal->j_state_lock);

        jbd_debug (3, "JBD: commit phase 2\n");

        /*
         * First, drop modified flag: all accesses to the buffers
         * will be tracked for a new transaction only -bzzz
         */
        spin_lock(&journal->j_list_lock);
        if (commit_transaction->t_buffers) {
                new_jh = jh = commit_transaction->t_buffers->b_tnext;
                do {
                        J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
                                        new_jh->b_modified == 0);
                        new_jh->b_modified = 0;
                        new_jh = new_jh->b_tnext;
                } while (new_jh != jh);
        }
        spin_unlock(&journal->j_list_lock);

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = 0;
        journal_submit_data_buffers(journal, commit_transaction);

        /*
         * Wait for all previously submitted IO to complete.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_locked_list) {
                struct buffer_head *bh;

                jh = commit_transaction->t_locked_list->b_tprev;
                bh = jh2bh(jh);
                get_bh(bh);
                if (buffer_locked(bh)) {
                        spin_unlock(&journal->j_list_lock);
                        wait_on_buffer(bh);
                        if (unlikely(!buffer_uptodate(bh)))
                                err = -EIO;
                        spin_lock(&journal->j_list_lock);
                }
                if (!inverted_lock(journal, bh)) {
                        put_bh(bh);
                        spin_lock(&journal->j_list_lock);
                        continue;
                }
                if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
                        __jbd2_journal_unfile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                        jbd2_journal_remove_journal_head(bh);
                        put_bh(bh);
                } else {
                        jbd_unlock_bh_state(bh);
                }
                put_bh(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);

        if (err)
                jbd2_journal_abort(journal, err);

        jbd2_journal_write_revoke_records(journal, commit_transaction);

        jbd_debug(3, "JBD: commit phase 2\n");

        /*
         * If we found any dirty or locked buffers, then we should have
         * looped back up to the write_out_data label.  If there weren't
         * any then journal_clean_data_list should have wiped the list
         * clean by now, so check that it is in fact empty.
         */
        J_ASSERT (commit_transaction->t_sync_datalist == NULL);

        jbd_debug (3, "JBD: commit phase 3\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        commit_transaction->t_state = T_COMMIT;

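        /*
         * Stamp the start of the logging phase, convert rs_flushing
         * into the duration of the flushing phase, and record the
         * number of blocks this transaction expects to log.
         */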
        stats.u.run.rs_logging = jiffies;
        stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
                                                 stats.u.run.rs_logging);
        stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
        stats.u.run.rs_blocks_logged = 0;

        descriptor = NULL;
        bufs = 0;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it for background writing. */

                if (is_journal_aborted(journal)) {
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                commit_transaction->t_outstanding_credits--;

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be32(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

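                /* Only the first tag of a descriptor block is followed
                   by the 16-byte journal UUID; later tags carry
                   JBD2_FLAG_SAME_UUID and omit it. */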
                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16) {

                        jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD: commit phase 4\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /* Wake up any transactions which were waiting for this
                   IO to complete */
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD: commit phase 5\n");

        /* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        jbd_debug(3, "JBD: commit phase 6\n");

        if (journal_write_commit_record(journal, commit_transaction))
                err = -EIO;

        if (err)
                jbd2_journal_abort(journal, err);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD: commit phase 7\n");

        J_ASSERT(commit_transaction->t_sync_datalist == NULL);
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
                        jh->b_transaction == journal->j_running_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future now that the last use has
                 * been committed.  That's not only a performance gain,
                 * it also stops aliasing problems if the buffer is left
                 * behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh)) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /* The buffer on BJ_Forget list and not jbddirty means
                         * it has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list. */
                        JBUFFER_TRACE(jh, "refile or unfile freed buffer");
                        __jbd2_journal_refile_buffer(jh);
                        if (!jh->b_transaction) {
                                jbd_unlock_bh_state(bh);
                                /* needs a brelse */
                                jbd2_journal_remove_journal_head(bh);
                                release_buffer_page(bh);
                        } else
                                jbd_unlock_bh_state(bh);
                }
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        spin_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                spin_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD: commit phase 8\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT);

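        /*
         * t_start is reused here to stamp the end of the commit, which
         * also lets rs_logging be converted into the duration of the
         * logging phase.
         */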
        commit_transaction->t_start = jiffies;
        stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
                                                commit_transaction->t_start);

        /*
         * File the transaction for history
         */
        stats.ts_type = JBD2_STATS_RUN;
        stats.ts_tid = commit_transaction->t_tid;
        stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
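        /*
         * j_history is a fixed-size circular buffer of per-transaction
         * stats, exported through procfs; j_history_cur wraps once it
         * reaches j_history_max.
         */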
        spin_lock(&journal->j_history_lock);
        memcpy(journal->j_history + journal->j_history_cur, &stats,
                        sizeof(stats));
        if (++journal->j_history_cur == journal->j_history_max)
                journal->j_history_cur = 0;

        /*
         * Calculate overall stats
         */
        journal->j_stats.ts_tid++;
        journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
        journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
        journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
        journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
        journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
        journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
        journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
        journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        spin_unlock(&journal->j_state_lock);

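        /*
         * If the transaction still has buffers that need checkpointing
         * it is linked into the circular list of checkpoint
         * transactions; otherwise it can be dropped immediately.
         */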
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        jbd_debug(1, "JBD: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        wake_up(&journal->j_wait_done_commit);
}