SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25
  26 /*
  27  * Default IO end handler for temporary BJ_IO buffer_heads.
  28  */
  29 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  30 {
  31         BUFFER_TRACE(bh, "");
  32         if (uptodate)
  33                 set_buffer_uptodate(bh);
  34         else
  35                 clear_buffer_uptodate(bh);
  36         unlock_buffer(bh);
  37 }
  38
  39 /*
  40  * When an ext4 file is truncated, it is possible that some pages are not
  41  * successfully freed, because they are attached to a committing transaction.
  42  * After the transaction commits, these pages are left on the LRU, with no
  43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  45  * the numbers in /proc/meminfo look odd.
  46  *
  47  * So here, we have a buffer which has just come off the forget list.  Look to
  48  * see if we can strip all buffers from the backing page.
  49  *
  50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  51  * caller provided us with a ref against the buffer, and we drop that here.
  52  */
  53 static void release_buffer_page(struct buffer_head *bh)
  54 {
  55         struct page *page;
  56
  57         if (buffer_dirty(bh))
  58                 goto nope;
  59         if (atomic_read(&bh->b_count) != 1)
  60                 goto nope;
  61         page = bh->b_page;
  62         if (!page)
  63                 goto nope;
  64         if (page->mapping)
  65                 goto nope;
  66
  67         /* OK, it's a truncated page */
  68         if (TestSetPageLocked(page))
  69                 goto nope;
  70
  71         page_cache_get(page);
  72         __brelse(bh);
  73         try_to_free_buffers(page);
  74         unlock_page(page);
  75         page_cache_release(page);
  76         return;
  77
  78 nope:
  79         __brelse(bh);
  80 }
  81
  82 /*
  83  * Done it all: now submit the commit record.  We should have
  84  * cleaned up our previous buffers by now, so if we are in abort
  85  * mode we can now just skip the rest of the journal write
  86  * entirely.
  87  *
  88  * Returns 1 if the journal needs to be aborted or 0 on success
  89  */
  90 static int journal_submit_commit_record(journal_t *journal,
  91                                         transaction_t *commit_transaction,
  92                                         struct buffer_head **cbh,
  93                                         __u32 crc32_sum)
  94 {
  95         struct journal_head *descriptor;
  96         struct commit_header *tmp;
  97         struct buffer_head *bh;
  98         int ret;
  99         int barrier_done = 0;
 100         struct timespec now = current_kernel_time();
 101
 102         if (is_journal_aborted(journal))
 103                 return 0;
 104
 105         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 106         if (!descriptor)
 107                 return 1;
 108
 109         bh = jh2bh(descriptor);
 110
 111         tmp = (struct commit_header *)bh->b_data;
 112         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 113         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 114         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 115         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 116         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 117
 118         if (JBD2_HAS_COMPAT_FEATURE(journal,
 119                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 120                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 121                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 122                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 123         }
 124
 125         JBUFFER_TRACE(descriptor, "submit commit block");
 126         lock_buffer(bh);
 127         get_bh(bh);
 128         set_buffer_dirty(bh);
 129         set_buffer_uptodate(bh);
 130         bh->b_end_io = journal_end_buffer_io_sync;
 131
 132         if (journal->j_flags & JBD2_BARRIER &&
 133                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
 134                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 135                 set_buffer_ordered(bh);
 136                 barrier_done = 1;
 137         }
 138         ret = submit_bh(WRITE, bh);
 139         if (barrier_done)
 140                 clear_buffer_ordered(bh);
 141
 142         /* is it possible for another commit to fail at roughly
 143          * the same time as this one?  If so, we don't want to
 144          * trust the barrier flag in the super, but instead want
 145          * to remember if we sent a barrier request
 146          */
 147         if (ret == -EOPNOTSUPP && barrier_done) {
 148                 char b[BDEVNAME_SIZE];
 149
 150                 printk(KERN_WARNING
 151                         "JBD: barrier-based sync failed on %s - "
 152                         "disabling barriers\n",
 153                         bdevname(journal->j_dev, b));
 154                 spin_lock(&journal->j_state_lock);
 155                 journal->j_flags &= ~JBD2_BARRIER;
 156                 spin_unlock(&journal->j_state_lock);
 157
 158                 /* And try again, without the barrier */
 159                 lock_buffer(bh);
 160                 set_buffer_uptodate(bh);
 161                 set_buffer_dirty(bh);
 162                 ret = submit_bh(WRITE, bh);
 163         }
 164         *cbh = bh;
 165         return ret;
 166 }
 167
 168 /*
 169  * This function along with journal_submit_commit_record
 170  * allows to write the commit record asynchronously.
 171  */
 172 static int journal_wait_on_commit_record(struct buffer_head *bh)
 173 {
 174         int ret = 0;
 175
 176         clear_buffer_dirty(bh);
 177         wait_on_buffer(bh);
 178
 179         if (unlikely(!buffer_uptodate(bh)))
 180                 ret = -EIO;
 181         put_bh(bh);            /* One for getblk() */
 182         jbd2_journal_put_journal_head(bh2jh(bh));
 183
 184         return ret;
 185 }
 186
 187 /*
 188  * Submit all the data buffers of inode associated with the transaction to
 189  * disk.
 190  *
 191  * We are in a committing transaction. Therefore no new inode can be added to
 192  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 193  * operate on from being released while we write out pages.
 194  */
 195 static int journal_submit_inode_data_buffers(journal_t *journal,
 196                 transaction_t *commit_transaction)
 197 {
 198         struct jbd2_inode *jinode;
 199         int err, ret = 0;
 200         struct address_space *mapping;
 201
 202         spin_lock(&journal->j_list_lock);
 203         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 204                 mapping = jinode->i_vfs_inode->i_mapping;
 205                 jinode->i_flags |= JI_COMMIT_RUNNING;
 206                 spin_unlock(&journal->j_list_lock);
 207                 err = filemap_fdatawrite_range(mapping, 0,
 208                                         i_size_read(jinode->i_vfs_inode));
 209                 if (!ret)
 210                         ret = err;
 211                 spin_lock(&journal->j_list_lock);
 212                 J_ASSERT(jinode->i_transaction == commit_transaction);
 213                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 214                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 215         }
 216         spin_unlock(&journal->j_list_lock);
 217         return ret;
 218 }
 219
 220 /*
 221  * Wait for data submitted for writeout, refile inodes to proper
 222  * transaction if needed.
 223  *
 224  */
 225 static int journal_finish_inode_data_buffers(journal_t *journal,
 226                 transaction_t *commit_transaction)
 227 {
 228         struct jbd2_inode *jinode, *next_i;
 229         int err, ret = 0;
 230
 231         /* For locking, see the comment in journal_submit_inode_data_buffers() */
 232         spin_lock(&journal->j_list_lock);
 233         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 234                 jinode->i_flags |= JI_COMMIT_RUNNING;
 235                 spin_unlock(&journal->j_list_lock);
 236                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 237                 if (!ret)
 238                         ret = err;
 239                 spin_lock(&journal->j_list_lock);
 240                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 241                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 242         }
 243
 244         /* Now refile inode to proper lists */
 245         list_for_each_entry_safe(jinode, next_i,
 246                                  &commit_transaction->t_inode_list, i_list) {
 247                 list_del(&jinode->i_list);
 248                 if (jinode->i_next_transaction) {
 249                         jinode->i_transaction = jinode->i_next_transaction;
 250                         jinode->i_next_transaction = NULL;
 251                         list_add(&jinode->i_list,
 252                                 &jinode->i_transaction->t_inode_list);
 253                 } else {
 254                         jinode->i_transaction = NULL;
 255                 }
 256         }
 257         spin_unlock(&journal->j_list_lock);
 258
 259         return ret;
 260 }
 261
 262 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 263 {
 264         struct page *page = bh->b_page;
 265         char *addr;
 266         __u32 checksum;
 267
 268         addr = kmap_atomic(page, KM_USER0);
 269         checksum = crc32_be(crc32_sum,
 270                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 271         kunmap_atomic(addr, KM_USER0);
 272
 273         return checksum;
 274 }
 275
 276 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 277                                    unsigned long long block)
 278 {
 279         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 280         if (tag_bytes > JBD2_TAG_SIZE32)
 281                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 282 }
 283
 284 /*
 285  * jbd2_journal_commit_transaction
 286  *
 287  * The primary function for committing a transaction to the log.  This
 288  * function is called by the journal thread to begin a complete commit.
 289  */
 290 void jbd2_journal_commit_transaction(journal_t *journal)
 291 {
 292         struct transaction_stats_s stats;
 293         transaction_t *commit_transaction;
 294         struct journal_head *jh, *new_jh, *descriptor;
 295         struct buffer_head **wbuf = journal->j_wbuf;
 296         int bufs;
 297         int flags;
 298         int err;
 299         unsigned long long blocknr;
 300         char *tagp = NULL;
 301         journal_header_t *header;
 302         journal_block_tag_t *tag = NULL;
 303         int space_left = 0;
 304         int first_tag = 0;
 305         int tag_flag;
 306         int i;
 307         int tag_bytes = journal_tag_bytes(journal);
 308         struct buffer_head *cbh = NULL; /* For transactional checksums */
 309         __u32 crc32_sum = ~0;
 310
 311         /*
 312          * First job: lock down the current transaction and wait for
 313          * all outstanding updates to complete.
 314          */
 315
 316 #ifdef COMMIT_STATS
 317         spin_lock(&journal->j_list_lock);
 318         summarise_journal_usage(journal);
 319         spin_unlock(&journal->j_list_lock);
 320 #endif
 321
 322         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 323         if (journal->j_flags & JBD2_FLUSHED) {
 324                 jbd_debug(3, "super block updated\n");
 325                 jbd2_journal_update_superblock(journal, 1);
 326         } else {
 327                 jbd_debug(3, "superblock not updated\n");
 328         }
 329
 330         J_ASSERT(journal->j_running_transaction != NULL);
 331         J_ASSERT(journal->j_committing_transaction == NULL);
 332
 333         commit_transaction = journal->j_running_transaction;
 334         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 335
 336         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 337                         commit_transaction->t_tid);
 338
 339         spin_lock(&journal->j_state_lock);
 340         commit_transaction->t_state = T_LOCKED;
 341
 342         stats.u.run.rs_wait = commit_transaction->t_max_wait;
 343         stats.u.run.rs_locked = jiffies;
 344         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 345                                                 stats.u.run.rs_locked);
 346
 347         spin_lock(&commit_transaction->t_handle_lock);
 348         while (commit_transaction->t_updates) {
 349                 DEFINE_WAIT(wait);
 350
 351                 prepare_to_wait(&journal->j_wait_updates, &wait,
 352                                         TASK_UNINTERRUPTIBLE);
 353                 if (commit_transaction->t_updates) {
 354                         spin_unlock(&commit_transaction->t_handle_lock);
 355                         spin_unlock(&journal->j_state_lock);
 356                         schedule();
 357                         spin_lock(&journal->j_state_lock);
 358                         spin_lock(&commit_transaction->t_handle_lock);
 359                 }
 360                 finish_wait(&journal->j_wait_updates, &wait);
 361         }
 362         spin_unlock(&commit_transaction->t_handle_lock);
 363
 364         J_ASSERT (commit_transaction->t_outstanding_credits <=
 365                         journal->j_max_transaction_buffers);
 366
 367         /*
 368          * First thing we are allowed to do is to discard any remaining
 369          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 370          * that there are no such buffers: if a large filesystem
 371          * operation like a truncate needs to split itself over multiple
 372          * transactions, then it may try to do a jbd2_journal_restart() while
 373          * there are still BJ_Reserved buffers outstanding.  These must
 374          * be released cleanly from the current transaction.
 375          *
 376          * In this case, the filesystem must still reserve write access
 377          * again before modifying the buffer in the new transaction, but
 378          * we do not require it to remember exactly which old buffers it
 379          * has reserved.  This is consistent with the existing behaviour
 380          * that multiple jbd2_journal_get_write_access() calls to the same
  381          * buffer are perfectly permissible.
 382          */
 383         while (commit_transaction->t_reserved_list) {
 384                 jh = commit_transaction->t_reserved_list;
 385                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 386                 /*
 387                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 388                  * leave undo-committed data.
 389                  */
 390                 if (jh->b_committed_data) {
 391                         struct buffer_head *bh = jh2bh(jh);
 392
 393                         jbd_lock_bh_state(bh);
 394                         jbd2_free(jh->b_committed_data, bh->b_size);
 395                         jh->b_committed_data = NULL;
 396                         jbd_unlock_bh_state(bh);
 397                 }
 398                 jbd2_journal_refile_buffer(journal, jh);
 399         }
 400
 401         /*
 402          * Now try to drop any written-back buffers from the journal's
 403          * checkpoint lists.  We do this *before* commit because it potentially
 404          * frees some memory
 405          */
 406         spin_lock(&journal->j_list_lock);
 407         __jbd2_journal_clean_checkpoint_list(journal);
 408         spin_unlock(&journal->j_list_lock);
 409
 410         jbd_debug (3, "JBD: commit phase 1\n");
 411
 412         /*
 413          * Switch to a new revoke table.
 414          */
 415         jbd2_journal_switch_revoke_table(journal);
 416
 417         stats.u.run.rs_flushing = jiffies;
 418         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 419                                                stats.u.run.rs_flushing);
 420
 421         commit_transaction->t_state = T_FLUSH;
 422         journal->j_committing_transaction = commit_transaction;
 423         journal->j_running_transaction = NULL;
 424         commit_transaction->t_log_start = journal->j_head;
 425         wake_up(&journal->j_wait_transaction_locked);
 426         spin_unlock(&journal->j_state_lock);
 427
 428         jbd_debug (3, "JBD: commit phase 2\n");
 429
 430         /*
 431          * Now start flushing things to disk, in the order they appear
 432          * on the transaction lists.  Data blocks go first.
 433          */
 434         err = journal_submit_inode_data_buffers(journal, commit_transaction);
 435         if (err)
 436                 jbd2_journal_abort(journal, err);
 437
 438         jbd2_journal_write_revoke_records(journal, commit_transaction);
 439
 440         jbd_debug(3, "JBD: commit phase 2\n");
 441
 442         /*
 443          * Way to go: we have now written out all of the data for a
 444          * transaction!  Now comes the tricky part: we need to write out
 445          * metadata.  Loop over the transaction's entire buffer list:
 446          */
 447         spin_lock(&journal->j_state_lock);
 448         commit_transaction->t_state = T_COMMIT;
 449         spin_unlock(&journal->j_state_lock);
 450
 451         stats.u.run.rs_logging = jiffies;
 452         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 453                                                  stats.u.run.rs_logging);
 454         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 455         stats.u.run.rs_blocks_logged = 0;
 456
 457         J_ASSERT(commit_transaction->t_nr_buffers <=
 458                  commit_transaction->t_outstanding_credits);
 459
 460         err = 0;
 461         descriptor = NULL;
 462         bufs = 0;
 463         while (commit_transaction->t_buffers) {
 464
 465                 /* Find the next buffer to be journaled... */
 466
 467                 jh = commit_transaction->t_buffers;
 468
 469                 /* If we're in abort mode, we just un-journal the buffer and
 470                    release it for background writing. */
 471
 472                 if (is_journal_aborted(journal)) {
 473                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 474                         jbd2_journal_refile_buffer(journal, jh);
 475                         /* If that was the last one, we need to clean up
 476                          * any descriptor buffers which may have been
 477                          * already allocated, even if we are now
 478                          * aborting. */
 479                         if (!commit_transaction->t_buffers)
 480                                 goto start_journal_io;
 481                         continue;
 482                 }
 483
 484                 /* Make sure we have a descriptor block in which to
 485                    record the metadata buffer. */
 486
 487                 if (!descriptor) {
 488                         struct buffer_head *bh;
 489
 490                         J_ASSERT (bufs == 0);
 491
 492                         jbd_debug(4, "JBD: get descriptor\n");
 493
 494                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 495                         if (!descriptor) {
 496                                 jbd2_journal_abort(journal, -EIO);
 497                                 continue;
 498                         }
 499
 500                         bh = jh2bh(descriptor);
 501                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 502                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 503                         header = (journal_header_t *)&bh->b_data[0];
 504                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 505                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 506                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 507
 508                         tagp = &bh->b_data[sizeof(journal_header_t)];
 509                         space_left = bh->b_size - sizeof(journal_header_t);
 510                         first_tag = 1;
 511                         set_buffer_jwrite(bh);
 512                         set_buffer_dirty(bh);
 513                         wbuf[bufs++] = bh;
 514
 515                         /* Record it so that we can wait for IO
 516                            completion later */
 517                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 518                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 519                                         BJ_LogCtl);
 520                 }
 521
 522                 /* Where is the buffer to be written? */
 523
 524                 err = jbd2_journal_next_log_block(journal, &blocknr);
 525                 /* If the block mapping failed, just abandon the buffer
 526                    and repeat this loop: we'll fall into the
 527                    refile-on-abort condition above. */
 528                 if (err) {
 529                         jbd2_journal_abort(journal, err);
 530                         continue;
 531                 }
 532
 533                 /*
 534                  * start_this_handle() uses t_outstanding_credits to determine
 535                  * the free space in the log, but this counter is changed
 536                  * by jbd2_journal_next_log_block() also.
 537                  */
 538                 commit_transaction->t_outstanding_credits--;
 539
 540                 /* Bump b_count to prevent truncate from stumbling over
 541                    the shadowed buffer!  @@@ This can go if we ever get
 542                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 543                 atomic_inc(&jh2bh(jh)->b_count);
 544
 545                 /* Make a temporary IO buffer with which to write it out
 546                    (this will requeue both the metadata buffer and the
 547                    temporary IO buffer). new_bh goes on BJ_IO*/
 548
 549                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 550                 /*
 551                  * akpm: jbd2_journal_write_metadata_buffer() sets
 552                  * new_bh->b_transaction to commit_transaction.
 553                  * We need to clean this up before we release new_bh
 554                  * (which is of type BJ_IO)
 555                  */
 556                 JBUFFER_TRACE(jh, "ph3: write metadata");
 557                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 558                                                       jh, &new_jh, blocknr);
 559                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 560                 wbuf[bufs++] = jh2bh(new_jh);
 561
 562                 /* Record the new block's tag in the current descriptor
 563                    buffer */
 564
 565                 tag_flag = 0;
 566                 if (flags & 1)
 567                         tag_flag |= JBD2_FLAG_ESCAPE;
 568                 if (!first_tag)
 569                         tag_flag |= JBD2_FLAG_SAME_UUID;
 570
 571                 tag = (journal_block_tag_t *) tagp;
 572                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 573                 tag->t_flags = cpu_to_be32(tag_flag);
 574                 tagp += tag_bytes;
 575                 space_left -= tag_bytes;
 576
 577                 if (first_tag) {
 578                         memcpy (tagp, journal->j_uuid, 16);
 579                         tagp += 16;
 580                         space_left -= 16;
 581                         first_tag = 0;
 582                 }
 583
 584                 /* If there's no more to do, or if the descriptor is full,
 585                    let the IO rip! */
 586
 587                 if (bufs == journal->j_wbufsize ||
 588                     commit_transaction->t_buffers == NULL ||
 589                     space_left < tag_bytes + 16) {
 590
 591                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 592
 593                         /* Write an end-of-descriptor marker before
 594                            submitting the IOs.  "tag" still points to
 595                            the last tag we set up. */
 596
 597                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 598
 599 start_journal_io:
 600                         for (i = 0; i < bufs; i++) {
 601                                 struct buffer_head *bh = wbuf[i];
 602                                 /*
 603                                  * Compute checksum.
 604                                  */
 605                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 606                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 607                                         crc32_sum =
 608                                             jbd2_checksum_data(crc32_sum, bh);
 609                                 }
 610
 611                                 lock_buffer(bh);
 612                                 clear_buffer_dirty(bh);
 613                                 set_buffer_uptodate(bh);
 614                                 bh->b_end_io = journal_end_buffer_io_sync;
 615                                 submit_bh(WRITE, bh);
 616                         }
 617                         cond_resched();
 618                         stats.u.run.rs_blocks_logged += bufs;
 619
 620                         /* Force a new descriptor to be generated next
 621                            time round the loop. */
 622                         descriptor = NULL;
 623                         bufs = 0;
 624                 }
 625         }
 626
 627         /* Done it all: now write the commit record asynchronously. */
 628
 629         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 630                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 631                 err = journal_submit_commit_record(journal, commit_transaction,
 632                                                  &cbh, crc32_sum);
 633                 if (err)
 634                         __jbd2_journal_abort_hard(journal);
 635         }
 636
 637         /*
 638          * This is the right place to wait for data buffers both for ASYNC
 639          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
 640          * the commit block went to disk (which happens above). If commit is
 641          * SYNC, we need to wait for data buffers before we start writing
 642          * commit block, which happens below in such setting.
 643          */
 644         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 645         if (err)
 646                 jbd2_journal_abort(journal, err);
 647
 648         /* Lo and behold: we have just managed to send a transaction to
 649            the log.  Before we can commit it, wait for the IO so far to
 650            complete.  Control buffers being written are on the
 651            transaction's t_log_list queue, and metadata buffers are on
 652            the t_iobuf_list queue.
 653
 654            Wait for the buffers in reverse order.  That way we are
 655            less likely to be woken up until all IOs have completed, and
 656            so we incur less scheduling load.
 657         */
 658
 659         jbd_debug(3, "JBD: commit phase 3\n");
 660
 661         /*
 662          * akpm: these are BJ_IO, and j_list_lock is not needed.
 663          * See __journal_try_to_free_buffer.
 664          */
 665 wait_for_iobuf:
 666         while (commit_transaction->t_iobuf_list != NULL) {
 667                 struct buffer_head *bh;
 668
 669                 jh = commit_transaction->t_iobuf_list->b_tprev;
 670                 bh = jh2bh(jh);
 671                 if (buffer_locked(bh)) {
 672                         wait_on_buffer(bh);
 673                         goto wait_for_iobuf;
 674                 }
 675                 if (cond_resched())
 676                         goto wait_for_iobuf;
 677
 678                 if (unlikely(!buffer_uptodate(bh)))
 679                         err = -EIO;
 680
 681                 clear_buffer_jwrite(bh);
 682
 683                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 684                 jbd2_journal_unfile_buffer(journal, jh);
 685
 686                 /*
 687                  * ->t_iobuf_list should contain only dummy buffer_heads
 688                  * which were created by jbd2_journal_write_metadata_buffer().
 689                  */
 690                 BUFFER_TRACE(bh, "dumping temporary bh");
 691                 jbd2_journal_put_journal_head(jh);
 692                 __brelse(bh);
 693                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 694                 free_buffer_head(bh);
 695
 696                 /* We also have to unlock and free the corresponding
 697                    shadowed buffer */
 698                 jh = commit_transaction->t_shadow_list->b_tprev;
 699                 bh = jh2bh(jh);
 700                 clear_bit(BH_JWrite, &bh->b_state);
 701                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 702
 703                 /* The metadata is now released for reuse, but we need
 704                    to remember it against this transaction so that when
 705                    we finally commit, we can do any checkpointing
 706                    required. */
 707                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 708                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 709                 /* Wake up any transactions which were waiting for this
 710                    IO to complete */
 711                 wake_up_bit(&bh->b_state, BH_Unshadow);
 712                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 713                 __brelse(bh);
 714         }
 715
 716         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 717
 718         jbd_debug(3, "JBD: commit phase 4\n");
 719
 720         /* Here we wait for the revoke record and descriptor record buffers */
 721  wait_for_ctlbuf:
 722         while (commit_transaction->t_log_list != NULL) {
 723                 struct buffer_head *bh;
 724
 725                 jh = commit_transaction->t_log_list->b_tprev;
 726                 bh = jh2bh(jh);
 727                 if (buffer_locked(bh)) {
 728                         wait_on_buffer(bh);
 729                         goto wait_for_ctlbuf;
 730                 }
 731                 if (cond_resched())
 732                         goto wait_for_ctlbuf;
 733
 734                 if (unlikely(!buffer_uptodate(bh)))
 735                         err = -EIO;
 736
 737                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 738                 clear_buffer_jwrite(bh);
 739                 jbd2_journal_unfile_buffer(journal, jh);
 740                 jbd2_journal_put_journal_head(jh);
 741                 __brelse(bh);           /* One for getblk */
 742                 /* AKPM: bforget here */
 743         }
 744
 745         jbd_debug(3, "JBD: commit phase 5\n");
 746
 747         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 748                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 749                 err = journal_submit_commit_record(journal, commit_transaction,
 750                                                 &cbh, crc32_sum);
 751                 if (err)
 752                         __jbd2_journal_abort_hard(journal);
 753         }
 754         if (!err && !is_journal_aborted(journal))
 755                 err = journal_wait_on_commit_record(cbh);
 756
 757         if (err)
 758                 jbd2_journal_abort(journal, err);
 759
 760         /* End of a transaction!  Finally, we can do checkpoint
 761            processing: any buffer committed as a result of this
 762            transaction can be removed from any checkpoint list it was on
 763            before. */
 764
 765         jbd_debug(3, "JBD: commit phase 6\n");
 766
 767         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 768         J_ASSERT(commit_transaction->t_buffers == NULL);
 769         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 770         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 771         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 772         J_ASSERT(commit_transaction->t_log_list == NULL);
 773
 774 restart_loop:
 775         /*
 776          * As there are other places (journal_unmap_buffer()) adding buffers
 777          * to this list we have to be careful and hold the j_list_lock.
 778          */
 779         spin_lock(&journal->j_list_lock);
 780         while (commit_transaction->t_forget) {
 781                 transaction_t *cp_transaction;
 782                 struct buffer_head *bh;
 783
 784                 jh = commit_transaction->t_forget;
 785                 spin_unlock(&journal->j_list_lock);
 786                 bh = jh2bh(jh);
 787                 jbd_lock_bh_state(bh);
 788                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 789                         jh->b_transaction == journal->j_running_transaction);
 790
 791                 /*
 792                  * If there is undo-protected committed data against
 793                  * this buffer, then we can remove it now.  If it is a
 794                  * buffer needing such protection, the old frozen_data
 795                  * field now points to a committed version of the
 796                  * buffer, so rotate that field to the new committed
 797                  * data.
 798                  *
 799                  * Otherwise, we can just throw away the frozen data now.
 800                  */
 801                 if (jh->b_committed_data) {
 802                         jbd2_free(jh->b_committed_data, bh->b_size);
 803                         jh->b_committed_data = NULL;
 804                         if (jh->b_frozen_data) {
 805                                 jh->b_committed_data = jh->b_frozen_data;
 806                                 jh->b_frozen_data = NULL;
 807                         }
 808                 } else if (jh->b_frozen_data) {
 809                         jbd2_free(jh->b_frozen_data, bh->b_size);
 810                         jh->b_frozen_data = NULL;
 811                 }
 812
 813                 spin_lock(&journal->j_list_lock);
 814                 cp_transaction = jh->b_cp_transaction;
 815                 if (cp_transaction) {
 816                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 817                         cp_transaction->t_chp_stats.cs_dropped++;
 818                         __jbd2_journal_remove_checkpoint(jh);
 819                 }
 820
 821                 /* Only re-checkpoint the buffer_head if it is marked
 822                  * dirty.  If the buffer was added to the BJ_Forget list
 823                  * by jbd2_journal_forget, it may no longer be dirty and
 824                  * there's no point in keeping a checkpoint record for
 825                  * it. */
 826
 827                 /* A buffer which has been freed while still being
 828                  * journaled by a previous transaction may end up still
 829                  * being dirty here, but we want to avoid writing back
 830                  * that buffer in the future now that the last use has
 831                  * been committed.  That's not only a performance gain,
 832                  * it also stops aliasing problems if the buffer is left
 833                  * behind for writeback and gets reallocated for another
 834                  * use in a different page. */
 835                 if (buffer_freed(bh)) {
 836                         clear_buffer_freed(bh);
 837                         clear_buffer_jbddirty(bh);
 838                 }
 839
 840                 if (buffer_jbddirty(bh)) {
 841                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 842                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 843                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 844                         __jbd2_journal_refile_buffer(jh);
 845                         jbd_unlock_bh_state(bh);
 846                 } else {
 847                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 848                         /* A buffer that is on the BJ_Forget list but is not
 849                          * jbddirty has been freed by this transaction, so it
 850                          * could not have been reallocated until this
 851                          * transaction committed. *BUT* it could be
 852                          * reallocated once we have written all the data to
 853                          * disk and before we process the buffer on the
 854                          * BJ_Forget list. */
 855                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 856                         __jbd2_journal_refile_buffer(jh);
 857                         if (!jh->b_transaction) {
 858                                 jbd_unlock_bh_state(bh);
 859                                  /* needs a brelse */
 860                                 jbd2_journal_remove_journal_head(bh);
 861                                 release_buffer_page(bh);
 862                         } else
 863                                 jbd_unlock_bh_state(bh);
 864                 }
 865                 cond_resched_lock(&journal->j_list_lock);
 866         }
 867         spin_unlock(&journal->j_list_lock);
 868         /*
 869          * This is a bit sleazy.  We use j_list_lock to protect transition
 870          * of a transaction into T_FINISHED state and calling
 871          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 872          * other checkpointing code processing the transaction...
 873          */
 874         spin_lock(&journal->j_state_lock);
 875         spin_lock(&journal->j_list_lock);
 876         /*
 877          * Now recheck whether any buffers got attached to the transaction
 878          * while the lock was dropped...
 879          */
 880         if (commit_transaction->t_forget) {
 881                 spin_unlock(&journal->j_list_lock);
 882                 spin_unlock(&journal->j_state_lock);
 883                 goto restart_loop;
 884         }
 885
 886         /* Done with this transaction! */
 887
 888         jbd_debug(3, "JBD: commit phase 7\n");
 889
 890         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 891
 892         commit_transaction->t_start = jiffies;
 893         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
 894                                                 commit_transaction->t_start);
 895
 896         /*
 897          * File the transaction for history
 898          */
 899         stats.ts_type = JBD2_STATS_RUN;
 900         stats.ts_tid = commit_transaction->t_tid;
 901         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
 902         spin_lock(&journal->j_history_lock);
 903         memcpy(journal->j_history + journal->j_history_cur, &stats,
 904                         sizeof(stats));
 905         if (++journal->j_history_cur == journal->j_history_max)
 906                 journal->j_history_cur = 0;
 907
 908         /*
 909          * Calculate overall stats
 910          */
 911         journal->j_stats.ts_tid++;
 912         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
 913         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
 914         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
 915         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
 916         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
 917         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
 918         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
 919         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
 920         spin_unlock(&journal->j_history_lock);
 921
 922         commit_transaction->t_state = T_FINISHED;
 923         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 924         journal->j_commit_sequence = commit_transaction->t_tid;
 925         journal->j_committing_transaction = NULL;
 926         spin_unlock(&journal->j_state_lock);
 927
 928         if (commit_transaction->t_checkpoint_list == NULL &&
 929             commit_transaction->t_checkpoint_io_list == NULL) {
 930                 __jbd2_journal_drop_transaction(journal, commit_transaction);
 931         } else {
 932                 if (journal->j_checkpoint_transactions == NULL) {
 933                         journal->j_checkpoint_transactions = commit_transaction;
 934                         commit_transaction->t_cpnext = commit_transaction;
 935                         commit_transaction->t_cpprev = commit_transaction;
 936                 } else {
 937                         commit_transaction->t_cpnext =
 938                                 journal->j_checkpoint_transactions;
 939                         commit_transaction->t_cpprev =
 940                                 commit_transaction->t_cpnext->t_cpprev;
 941                         commit_transaction->t_cpnext->t_cpprev =
 942                                 commit_transaction;
 943                         commit_transaction->t_cpprev->t_cpnext =
 944                                 commit_transaction;
 945                 }
 946         }
 947         spin_unlock(&journal->j_list_lock);
 948
 949         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 950                   journal->j_commit_sequence, journal->j_tail_sequence);
 951
 952         wake_up(&journal->j_wait_done_commit);
 953 }