SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/marker.h>
  20 #include <linux/errno.h>
  21 #include <linux/slab.h>
  22 #include <linux/mm.h>
  23 #include <linux/pagemap.h>
  24 #include <linux/jiffies.h>
  25 #include <linux/crc32.h>
  26 #include <linux/writeback.h>
  27 #include <linux/backing-dev.h>
  28
  29 /*
  30  * Default IO end handler for temporary BJ_IO buffer_heads.
  31  */
  32 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  33 {
  34         BUFFER_TRACE(bh, "");
  35         if (uptodate)
  36                 set_buffer_uptodate(bh);
  37         else
  38                 clear_buffer_uptodate(bh);
  39         unlock_buffer(bh);
  40 }
  41
  42 /*
  43  * When an ext4 file is truncated, it is possible that some pages are not
  44  * successfully freed, because they are attached to a committing transaction.
  45  * After the transaction commits, these pages are left on the LRU, with no
  46  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  47  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  48  * the numbers in /proc/meminfo look odd.
  49  *
  50  * So here, we have a buffer which has just come off the forget list.  Look to
  51  * see if we can strip all buffers from the backing page.
  52  *
  53  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  54  * caller provided us with a ref against the buffer, and we drop that here.
  55  */
  56 static void release_buffer_page(struct buffer_head *bh)
  57 {
  58         struct page *page;
  59
  60         if (buffer_dirty(bh))
  61                 goto nope;
  62         if (atomic_read(&bh->b_count) != 1)
  63                 goto nope;
  64         page = bh->b_page;
  65         if (!page)
  66                 goto nope;
  67         if (page->mapping)
  68                 goto nope;
  69
  70         /* OK, it's a truncated page */
  71         if (!trylock_page(page))
  72                 goto nope;
  73
  74         page_cache_get(page);
  75         __brelse(bh);
  76         try_to_free_buffers(page);
  77         unlock_page(page);
  78         page_cache_release(page);
  79         return;
  80
  81 nope:
  82         __brelse(bh);
  83 }
  84
  85 /*
  86  * Done it all: now submit the commit record.  We should have
  87  * cleaned up our previous buffers by now, so if we are in abort
  88  * mode we can now just skip the rest of the journal write
  89  * entirely.
  90  *
  91  * Returns 1 if the journal needs to be aborted or 0 on success
  92  */
  93 static int journal_submit_commit_record(journal_t *journal,
  94                                         transaction_t *commit_transaction,
  95                                         struct buffer_head **cbh,
  96                                         __u32 crc32_sum)
  97 {
  98         struct journal_head *descriptor;
  99         struct commit_header *tmp;
 100         struct buffer_head *bh;
 101         int ret;
 102         int barrier_done = 0;
 103         struct timespec now = current_kernel_time();
 104
 105         if (is_journal_aborted(journal))
 106                 return 0;
 107
 108         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 109         if (!descriptor)
 110                 return 1;
 111
 112         bh = jh2bh(descriptor);
 113
 114         tmp = (struct commit_header *)bh->b_data;
 115         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 116         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 117         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 118         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 119         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 120
 121         if (JBD2_HAS_COMPAT_FEATURE(journal,
 122                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 123                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 124                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 125                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 126         }
 127
 128         JBUFFER_TRACE(descriptor, "submit commit block");
 129         lock_buffer(bh);
 130         get_bh(bh);
 131         set_buffer_dirty(bh);
 132         set_buffer_uptodate(bh);
 133         bh->b_end_io = journal_end_buffer_io_sync;
 134
 135         if (journal->j_flags & JBD2_BARRIER &&
 136                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
 137                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 138                 set_buffer_ordered(bh);
 139                 barrier_done = 1;
 140         }
 141         ret = submit_bh(WRITE, bh);
 142         if (barrier_done)
 143                 clear_buffer_ordered(bh);
 144
 145         /* is it possible for another commit to fail at roughly
 146          * the same time as this one?  If so, we don't want to
 147          * trust the barrier flag in the super, but instead want
 148          * to remember if we sent a barrier request
 149          */
 150         if (ret == -EOPNOTSUPP && barrier_done) {
 151                 printk(KERN_WARNING
 152                        "JBD: barrier-based sync failed on %s - "
 153                        "disabling barriers\n", journal->j_devname);
 154                 spin_lock(&journal->j_state_lock);
 155                 journal->j_flags &= ~JBD2_BARRIER;
 156                 spin_unlock(&journal->j_state_lock);
 157
 158                 /* And try again, without the barrier */
 159                 lock_buffer(bh);
 160                 set_buffer_uptodate(bh);
 161                 set_buffer_dirty(bh);
 162                 ret = submit_bh(WRITE, bh);
 163         }
 164         *cbh = bh;
 165         return ret;
 166 }
 167
 168 /*
 169  * This function along with journal_submit_commit_record
 170  * allows to write the commit record asynchronously.
 171  */
 172 static int journal_wait_on_commit_record(struct buffer_head *bh)
 173 {
 174         int ret = 0;
 175
 176         clear_buffer_dirty(bh);
 177         wait_on_buffer(bh);
 178
 179         if (unlikely(!buffer_uptodate(bh)))
 180                 ret = -EIO;
 181         put_bh(bh);            /* One for getblk() */
 182         jbd2_journal_put_journal_head(bh2jh(bh));
 183
 184         return ret;
 185 }
 186
 187 /*
 188  * write the filemap data using writepage() address_space_operations.
 189  * We don't do block allocation here even for delalloc. We don't
 190  * use writepages() because with dealyed allocation we may be doing
 191  * block allocation in writepages().
 192  */
 193 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 194 {
 195         int ret;
 196         struct writeback_control wbc = {
 197                 .sync_mode =  WB_SYNC_ALL,
 198                 .nr_to_write = mapping->nrpages * 2,
 199                 .range_start = 0,
 200                 .range_end = i_size_read(mapping->host),
 201                 .for_writepages = 1,
 202         };
 203
 204         ret = generic_writepages(mapping, &wbc);
 205         return ret;
 206 }
 207
 208 /*
 209  * Submit all the data buffers of inode associated with the transaction to
 210  * disk.
 211  *
 212  * We are in a committing transaction. Therefore no new inode can be added to
 213  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 214  * operate on from being released while we write out pages.
 215  */
 216 static int journal_submit_data_buffers(journal_t *journal,
 217                 transaction_t *commit_transaction)
 218 {
 219         struct jbd2_inode *jinode;
 220         int err, ret = 0;
 221         struct address_space *mapping;
 222
 223         spin_lock(&journal->j_list_lock);
 224         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 225                 mapping = jinode->i_vfs_inode->i_mapping;
 226                 jinode->i_flags |= JI_COMMIT_RUNNING;
 227                 spin_unlock(&journal->j_list_lock);
 228                 /*
 229                  * submit the inode data buffers. We use writepage
 230                  * instead of writepages. Because writepages can do
 231                  * block allocation  with delalloc. We need to write
 232                  * only allocated blocks here.
 233                  */
 234                 err = journal_submit_inode_data_buffers(mapping);
 235                 if (!ret)
 236                         ret = err;
 237                 spin_lock(&journal->j_list_lock);
 238                 J_ASSERT(jinode->i_transaction == commit_transaction);
 239                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 240                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 241         }
 242         spin_unlock(&journal->j_list_lock);
 243         return ret;
 244 }
 245
 246 /*
 247  * Wait for data submitted for writeout, refile inodes to proper
 248  * transaction if needed.
 249  *
 250  */
 251 static int journal_finish_inode_data_buffers(journal_t *journal,
 252                 transaction_t *commit_transaction)
 253 {
 254         struct jbd2_inode *jinode, *next_i;
 255         int err, ret = 0;
 256
 257         /* For locking, see the comment in journal_submit_data_buffers() */
 258         spin_lock(&journal->j_list_lock);
 259         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 260                 jinode->i_flags |= JI_COMMIT_RUNNING;
 261                 spin_unlock(&journal->j_list_lock);
 262                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 263                 if (err) {
 264                         /*
 265                          * Because AS_EIO is cleared by
 266                          * wait_on_page_writeback_range(), set it again so
 267                          * that user process can get -EIO from fsync().
 268                          */
 269                         set_bit(AS_EIO,
 270                                 &jinode->i_vfs_inode->i_mapping->flags);
 271
 272                         if (!ret)
 273                                 ret = err;
 274                 }
 275                 spin_lock(&journal->j_list_lock);
 276                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 277                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 278         }
 279
 280         /* Now refile inode to proper lists */
 281         list_for_each_entry_safe(jinode, next_i,
 282                                  &commit_transaction->t_inode_list, i_list) {
 283                 list_del(&jinode->i_list);
 284                 if (jinode->i_next_transaction) {
 285                         jinode->i_transaction = jinode->i_next_transaction;
 286                         jinode->i_next_transaction = NULL;
 287                         list_add(&jinode->i_list,
 288                                 &jinode->i_transaction->t_inode_list);
 289                 } else {
 290                         jinode->i_transaction = NULL;
 291                 }
 292         }
 293         spin_unlock(&journal->j_list_lock);
 294
 295         return ret;
 296 }
 297
 298 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 299 {
 300         struct page *page = bh->b_page;
 301         char *addr;
 302         __u32 checksum;
 303
 304         addr = kmap_atomic(page, KM_USER0);
 305         checksum = crc32_be(crc32_sum,
 306                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 307         kunmap_atomic(addr, KM_USER0);
 308
 309         return checksum;
 310 }
 311
 312 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 313                                    unsigned long long block)
 314 {
 315         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 316         if (tag_bytes > JBD2_TAG_SIZE32)
 317                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 318 }
 319
 320 /*
 321  * jbd2_journal_commit_transaction
 322  *
 323  * The primary function for committing a transaction to the log.  This
 324  * function is called by the journal thread to begin a complete commit.
 325  */
 326 void jbd2_journal_commit_transaction(journal_t *journal)
 327 {
 328         struct transaction_stats_s stats;
 329         transaction_t *commit_transaction;
 330         struct journal_head *jh, *new_jh, *descriptor;
 331         struct buffer_head **wbuf = journal->j_wbuf;
 332         int bufs;
 333         int flags;
 334         int err;
 335         unsigned long long blocknr;
 336         char *tagp = NULL;
 337         journal_header_t *header;
 338         journal_block_tag_t *tag = NULL;
 339         int space_left = 0;
 340         int first_tag = 0;
 341         int tag_flag;
 342         int i;
 343         int tag_bytes = journal_tag_bytes(journal);
 344         struct buffer_head *cbh = NULL; /* For transactional checksums */
 345         __u32 crc32_sum = ~0;
 346
 347         /*
 348          * First job: lock down the current transaction and wait for
 349          * all outstanding updates to complete.
 350          */
 351
 352 #ifdef COMMIT_STATS
 353         spin_lock(&journal->j_list_lock);
 354         summarise_journal_usage(journal);
 355         spin_unlock(&journal->j_list_lock);
 356 #endif
 357
 358         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 359         if (journal->j_flags & JBD2_FLUSHED) {
 360                 jbd_debug(3, "super block updated\n");
 361                 jbd2_journal_update_superblock(journal, 1);
 362         } else {
 363                 jbd_debug(3, "superblock not updated\n");
 364         }
 365
 366         J_ASSERT(journal->j_running_transaction != NULL);
 367         J_ASSERT(journal->j_committing_transaction == NULL);
 368
 369         commit_transaction = journal->j_running_transaction;
 370         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 371
 372         trace_mark(jbd2_start_commit, "dev %s transaction %d",
 373                    journal->j_devname, commit_transaction->t_tid);
 374         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 375                         commit_transaction->t_tid);
 376
 377         spin_lock(&journal->j_state_lock);
 378         commit_transaction->t_state = T_LOCKED;
 379
 380         stats.u.run.rs_wait = commit_transaction->t_max_wait;
 381         stats.u.run.rs_locked = jiffies;
 382         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 383                                                 stats.u.run.rs_locked);
 384
 385         spin_lock(&commit_transaction->t_handle_lock);
 386         while (commit_transaction->t_updates) {
 387                 DEFINE_WAIT(wait);
 388
 389                 prepare_to_wait(&journal->j_wait_updates, &wait,
 390                                         TASK_UNINTERRUPTIBLE);
 391                 if (commit_transaction->t_updates) {
 392                         spin_unlock(&commit_transaction->t_handle_lock);
 393                         spin_unlock(&journal->j_state_lock);
 394                         schedule();
 395                         spin_lock(&journal->j_state_lock);
 396                         spin_lock(&commit_transaction->t_handle_lock);
 397                 }
 398                 finish_wait(&journal->j_wait_updates, &wait);
 399         }
 400         spin_unlock(&commit_transaction->t_handle_lock);
 401
 402         J_ASSERT (commit_transaction->t_outstanding_credits <=
 403                         journal->j_max_transaction_buffers);
 404
 405         /*
 406          * First thing we are allowed to do is to discard any remaining
 407          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 408          * that there are no such buffers: if a large filesystem
 409          * operation like a truncate needs to split itself over multiple
 410          * transactions, then it may try to do a jbd2_journal_restart() while
 411          * there are still BJ_Reserved buffers outstanding.  These must
 412          * be released cleanly from the current transaction.
 413          *
 414          * In this case, the filesystem must still reserve write access
 415          * again before modifying the buffer in the new transaction, but
 416          * we do not require it to remember exactly which old buffers it
 417          * has reserved.  This is consistent with the existing behaviour
 418          * that multiple jbd2_journal_get_write_access() calls to the same
 419          * buffer are perfectly permissible.
 420          */
 421         while (commit_transaction->t_reserved_list) {
 422                 jh = commit_transaction->t_reserved_list;
 423                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 424                 /*
 425                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 426                  * leave undo-committed data.
 427                  */
 428                 if (jh->b_committed_data) {
 429                         struct buffer_head *bh = jh2bh(jh);
 430
 431                         jbd_lock_bh_state(bh);
 432                         jbd2_free(jh->b_committed_data, bh->b_size);
 433                         jh->b_committed_data = NULL;
 434                         jbd_unlock_bh_state(bh);
 435                 }
 436                 jbd2_journal_refile_buffer(journal, jh);
 437         }
 438
 439         /*
 440          * Now try to drop any written-back buffers from the journal's
 441          * checkpoint lists.  We do this *before* commit because it potentially
 442          * frees some memory
 443          */
 444         spin_lock(&journal->j_list_lock);
 445         __jbd2_journal_clean_checkpoint_list(journal);
 446         spin_unlock(&journal->j_list_lock);
 447
 448         jbd_debug (3, "JBD: commit phase 1\n");
 449
 450         /*
 451          * Switch to a new revoke table.
 452          */
 453         jbd2_journal_switch_revoke_table(journal);
 454
 455         stats.u.run.rs_flushing = jiffies;
 456         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 457                                                stats.u.run.rs_flushing);
 458
 459         commit_transaction->t_state = T_FLUSH;
 460         journal->j_committing_transaction = commit_transaction;
 461         journal->j_running_transaction = NULL;
 462         commit_transaction->t_log_start = journal->j_head;
 463         wake_up(&journal->j_wait_transaction_locked);
 464         spin_unlock(&journal->j_state_lock);
 465
 466         jbd_debug (3, "JBD: commit phase 2\n");
 467
 468         /*
 469          * Now start flushing things to disk, in the order they appear
 470          * on the transaction lists.  Data blocks go first.
 471          */
 472         err = journal_submit_data_buffers(journal, commit_transaction);
 473         if (err)
 474                 jbd2_journal_abort(journal, err);
 475
 476         jbd2_journal_write_revoke_records(journal, commit_transaction);
 477
 478         jbd_debug(3, "JBD: commit phase 2\n");
 479
 480         /*
 481          * Way to go: we have now written out all of the data for a
 482          * transaction!  Now comes the tricky part: we need to write out
 483          * metadata.  Loop over the transaction's entire buffer list:
 484          */
 485         spin_lock(&journal->j_state_lock);
 486         commit_transaction->t_state = T_COMMIT;
 487         spin_unlock(&journal->j_state_lock);
 488
 489         stats.u.run.rs_logging = jiffies;
 490         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 491                                                  stats.u.run.rs_logging);
 492         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 493         stats.u.run.rs_blocks_logged = 0;
 494
 495         J_ASSERT(commit_transaction->t_nr_buffers <=
 496                  commit_transaction->t_outstanding_credits);
 497
 498         err = 0;
 499         descriptor = NULL;
 500         bufs = 0;
 501         while (commit_transaction->t_buffers) {
 502
 503                 /* Find the next buffer to be journaled... */
 504
 505                 jh = commit_transaction->t_buffers;
 506
 507                 /* If we're in abort mode, we just un-journal the buffer and
 508                    release it for background writing. */
 509
 510                 if (is_journal_aborted(journal)) {
 511                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 512                         jbd2_journal_refile_buffer(journal, jh);
 513                         /* If that was the last one, we need to clean up
 514                          * any descriptor buffers which may have been
 515                          * already allocated, even if we are now
 516                          * aborting. */
 517                         if (!commit_transaction->t_buffers)
 518                                 goto start_journal_io;
 519                         continue;
 520                 }
 521
 522                 /* Make sure we have a descriptor block in which to
 523                    record the metadata buffer. */
 524
 525                 if (!descriptor) {
 526                         struct buffer_head *bh;
 527
 528                         J_ASSERT (bufs == 0);
 529
 530                         jbd_debug(4, "JBD: get descriptor\n");
 531
 532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 533                         if (!descriptor) {
 534                                 jbd2_journal_abort(journal, -EIO);
 535                                 continue;
 536                         }
 537
 538                         bh = jh2bh(descriptor);
 539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 541                         header = (journal_header_t *)&bh->b_data[0];
 542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 545
 546                         tagp = &bh->b_data[sizeof(journal_header_t)];
 547                         space_left = bh->b_size - sizeof(journal_header_t);
 548                         first_tag = 1;
 549                         set_buffer_jwrite(bh);
 550                         set_buffer_dirty(bh);
 551                         wbuf[bufs++] = bh;
 552
 553                         /* Record it so that we can wait for IO
 554                            completion later */
 555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 557                                         BJ_LogCtl);
 558                 }
 559
 560                 /* Where is the buffer to be written? */
 561
 562                 err = jbd2_journal_next_log_block(journal, &blocknr);
 563                 /* If the block mapping failed, just abandon the buffer
 564                    and repeat this loop: we'll fall into the
 565                    refile-on-abort condition above. */
 566                 if (err) {
 567                         jbd2_journal_abort(journal, err);
 568                         continue;
 569                 }
 570
 571                 /*
 572                  * start_this_handle() uses t_outstanding_credits to determine
 573                  * the free space in the log, but this counter is changed
 574                  * by jbd2_journal_next_log_block() also.
 575                  */
 576                 commit_transaction->t_outstanding_credits--;
 577
 578                 /* Bump b_count to prevent truncate from stumbling over
 579                    the shadowed buffer!  @@@ This can go if we ever get
 580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 581                 atomic_inc(&jh2bh(jh)->b_count);
 582
 583                 /* Make a temporary IO buffer with which to write it out
 584                    (this will requeue both the metadata buffer and the
 585                    temporary IO buffer). new_bh goes on BJ_IO*/
 586
 587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 588                 /*
 589                  * akpm: jbd2_journal_write_metadata_buffer() sets
 590                  * new_bh->b_transaction to commit_transaction.
 591                  * We need to clean this up before we release new_bh
 592                  * (which is of type BJ_IO)
 593                  */
 594                 JBUFFER_TRACE(jh, "ph3: write metadata");
 595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 596                                                       jh, &new_jh, blocknr);
 597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 598                 wbuf[bufs++] = jh2bh(new_jh);
 599
 600                 /* Record the new block's tag in the current descriptor
 601                    buffer */
 602
 603                 tag_flag = 0;
 604                 if (flags & 1)
 605                         tag_flag |= JBD2_FLAG_ESCAPE;
 606                 if (!first_tag)
 607                         tag_flag |= JBD2_FLAG_SAME_UUID;
 608
 609                 tag = (journal_block_tag_t *) tagp;
 610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 611                 tag->t_flags = cpu_to_be32(tag_flag);
 612                 tagp += tag_bytes;
 613                 space_left -= tag_bytes;
 614
 615                 if (first_tag) {
 616                         memcpy (tagp, journal->j_uuid, 16);
 617                         tagp += 16;
 618                         space_left -= 16;
 619                         first_tag = 0;
 620                 }
 621
 622                 /* If there's no more to do, or if the descriptor is full,
 623                    let the IO rip! */
 624
 625                 if (bufs == journal->j_wbufsize ||
 626                     commit_transaction->t_buffers == NULL ||
 627                     space_left < tag_bytes + 16) {
 628
 629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 630
 631                         /* Write an end-of-descriptor marker before
 632                            submitting the IOs.  "tag" still points to
 633                            the last tag we set up. */
 634
 635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 636
 637 start_journal_io:
 638                         for (i = 0; i < bufs; i++) {
 639                                 struct buffer_head *bh = wbuf[i];
 640                                 /*
 641                                  * Compute checksum.
 642                                  */
 643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 645                                         crc32_sum =
 646                                             jbd2_checksum_data(crc32_sum, bh);
 647                                 }
 648
 649                                 lock_buffer(bh);
 650                                 clear_buffer_dirty(bh);
 651                                 set_buffer_uptodate(bh);
 652                                 bh->b_end_io = journal_end_buffer_io_sync;
 653                                 submit_bh(WRITE, bh);
 654                         }
 655                         cond_resched();
 656                         stats.u.run.rs_blocks_logged += bufs;
 657
 658                         /* Force a new descriptor to be generated next
 659                            time round the loop. */
 660                         descriptor = NULL;
 661                         bufs = 0;
 662                 }
 663         }
 664
 665         /* Done it all: now write the commit record asynchronously. */
 666
 667         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 668                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 669                 err = journal_submit_commit_record(journal, commit_transaction,
 670                                                  &cbh, crc32_sum);
 671                 if (err)
 672                         __jbd2_journal_abort_hard(journal);
 673         }
 674
 675         /*
 676          * This is the right place to wait for data buffers both for ASYNC
 677          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
 678          * the commit block went to disk (which happens above). If commit is
 679          * SYNC, we need to wait for data buffers before we start writing
 680          * commit block, which happens below in such setting.
 681          */
 682         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 683         if (err) {
 684                 printk(KERN_WARNING
 685                         "JBD2: Detected IO errors while flushing file data "
 686                        "on %s\n", journal->j_devname);
 687                 err = 0;
 688         }
 689
 690         /* Lo and behold: we have just managed to send a transaction to
 691            the log.  Before we can commit it, wait for the IO so far to
 692            complete.  Control buffers being written are on the
 693            transaction's t_log_list queue, and metadata buffers are on
 694            the t_iobuf_list queue.
 695
 696            Wait for the buffers in reverse order.  That way we are
 697            less likely to be woken up until all IOs have completed, and
 698            so we incur less scheduling load.
 699         */
 700
 701         jbd_debug(3, "JBD: commit phase 3\n");
 702
 703         /*
 704          * akpm: these are BJ_IO, and j_list_lock is not needed.
 705          * See __journal_try_to_free_buffer.
 706          */
 707 wait_for_iobuf:
 708         while (commit_transaction->t_iobuf_list != NULL) {
 709                 struct buffer_head *bh;
 710
 711                 jh = commit_transaction->t_iobuf_list->b_tprev;
 712                 bh = jh2bh(jh);
 713                 if (buffer_locked(bh)) {
 714                         wait_on_buffer(bh);
 715                         goto wait_for_iobuf;
 716                 }
 717                 if (cond_resched())
 718                         goto wait_for_iobuf;
 719
 720                 if (unlikely(!buffer_uptodate(bh)))
 721                         err = -EIO;
 722
 723                 clear_buffer_jwrite(bh);
 724
 725                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 726                 jbd2_journal_unfile_buffer(journal, jh);
 727
 728                 /*
 729                  * ->t_iobuf_list should contain only dummy buffer_heads
 730                  * which were created by jbd2_journal_write_metadata_buffer().
 731                  */
 732                 BUFFER_TRACE(bh, "dumping temporary bh");
 733                 jbd2_journal_put_journal_head(jh);
 734                 __brelse(bh);
 735                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 736                 free_buffer_head(bh);
 737
 738                 /* We also have to unlock and free the corresponding
 739                    shadowed buffer */
 740                 jh = commit_transaction->t_shadow_list->b_tprev;
 741                 bh = jh2bh(jh);
 742                 clear_bit(BH_JWrite, &bh->b_state);
 743                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 744
 745                 /* The metadata is now released for reuse, but we need
 746                    to remember it against this transaction so that when
 747                    we finally commit, we can do any checkpointing
 748                    required. */
 749                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 750                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 751                 /* Wake up any transactions which were waiting for this
 752                    IO to complete */
 753                 wake_up_bit(&bh->b_state, BH_Unshadow);
 754                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 755                 __brelse(bh);
 756         }
 757
 758         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 759
 760         jbd_debug(3, "JBD: commit phase 4\n");
 761
 762         /* Here we wait for the revoke record and descriptor record buffers */
 763  wait_for_ctlbuf:
 764         while (commit_transaction->t_log_list != NULL) {
 765                 struct buffer_head *bh;
 766
 767                 jh = commit_transaction->t_log_list->b_tprev;
 768                 bh = jh2bh(jh);
 769                 if (buffer_locked(bh)) {
 770                         wait_on_buffer(bh);
 771                         goto wait_for_ctlbuf;
 772                 }
 773                 if (cond_resched())
 774                         goto wait_for_ctlbuf;
 775
 776                 if (unlikely(!buffer_uptodate(bh)))
 777                         err = -EIO;
 778
 779                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 780                 clear_buffer_jwrite(bh);
 781                 jbd2_journal_unfile_buffer(journal, jh);
 782                 jbd2_journal_put_journal_head(jh);
 783                 __brelse(bh);           /* One for getblk */
 784                 /* AKPM: bforget here */
 785         }
 786
 787         jbd_debug(3, "JBD: commit phase 5\n");
 788
 789         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 790                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 791                 err = journal_submit_commit_record(journal, commit_transaction,
 792                                                 &cbh, crc32_sum);
 793                 if (err)
 794                         __jbd2_journal_abort_hard(journal);
 795         }
 796         if (!err && !is_journal_aborted(journal))
 797                 err = journal_wait_on_commit_record(cbh);
 798
 799         if (err)
 800                 jbd2_journal_abort(journal, err);
 801
 802         /* End of a transaction!  Finally, we can do checkpoint
 803            processing: any buffer committed as a result of this
 804            transaction can be removed from any checkpoint list it was on
 805            before. */
 806
 807         jbd_debug(3, "JBD: commit phase 6\n");
 808
 809         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 810         J_ASSERT(commit_transaction->t_buffers == NULL);
 811         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 812         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 813         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 814         J_ASSERT(commit_transaction->t_log_list == NULL);
 815
 816 restart_loop:
 817         /*
 818          * As there are other places (journal_unmap_buffer()) adding buffers
 819          * to this list we have to be careful and hold the j_list_lock.
 820          */
 821         spin_lock(&journal->j_list_lock);
 822         while (commit_transaction->t_forget) {
 823                 transaction_t *cp_transaction;
 824                 struct buffer_head *bh;
 825
 826                 jh = commit_transaction->t_forget;
 827                 spin_unlock(&journal->j_list_lock);
 828                 bh = jh2bh(jh);
 829                 jbd_lock_bh_state(bh);
 830                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 831                         jh->b_transaction == journal->j_running_transaction);
 832
 833                 /*
 834                  * If there is undo-protected committed data against
 835                  * this buffer, then we can remove it now.  If it is a
 836                  * buffer needing such protection, the old frozen_data
 837                  * field now points to a committed version of the
 838                  * buffer, so rotate that field to the new committed
 839                  * data.
 840                  *
 841                  * Otherwise, we can just throw away the frozen data now.
 842                  */
 843                 if (jh->b_committed_data) {
 844                         jbd2_free(jh->b_committed_data, bh->b_size);
 845                         jh->b_committed_data = NULL;
 846                         if (jh->b_frozen_data) {
 847                                 jh->b_committed_data = jh->b_frozen_data;
 848                                 jh->b_frozen_data = NULL;
 849                         }
 850                 } else if (jh->b_frozen_data) {
 851                         jbd2_free(jh->b_frozen_data, bh->b_size);
 852                         jh->b_frozen_data = NULL;
 853                 }
 854
 855                 spin_lock(&journal->j_list_lock);
 856                 cp_transaction = jh->b_cp_transaction;
 857                 if (cp_transaction) {
 858                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 859                         cp_transaction->t_chp_stats.cs_dropped++;
 860                         __jbd2_journal_remove_checkpoint(jh);
 861                 }
 862
 863                 /* Only re-checkpoint the buffer_head if it is marked
 864                  * dirty.  If the buffer was added to the BJ_Forget list
 865                  * by jbd2_journal_forget, it may no longer be dirty and
 866                  * there's no point in keeping a checkpoint record for
 867                  * it. */
 868
 869                 /* A buffer which has been freed while still being
 870                  * journaled by a previous transaction may end up still
 871                  * being dirty here, but we want to avoid writing back
 872                  * that buffer in the future now that the last use has
 873                  * been committed.  That's not only a performance gain,
 874                  * it also stops aliasing problems if the buffer is left
 875                  * behind for writeback and gets reallocated for another
 876                  * use in a different page. */
 877                 if (buffer_freed(bh)) {
 878                         clear_buffer_freed(bh);
 879                         clear_buffer_jbddirty(bh);
 880                 }
 881
 882                 if (buffer_jbddirty(bh)) {
 883                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 884                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 885                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 886                         __jbd2_journal_refile_buffer(jh);
 887                         jbd_unlock_bh_state(bh);
 888                 } else {
 889                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 890                         /* The buffer on BJ_Forget list and not jbddirty means
 891                          * it has been freed by this transaction and hence it
 892                          * could not have been reallocated until this
 893                          * transaction has committed. *BUT* it could be
 894                          * reallocated once we have written all the data to
 895                          * disk and before we process the buffer on BJ_Forget
 896                          * list. */
 897                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 898                         __jbd2_journal_refile_buffer(jh);
 899                         if (!jh->b_transaction) {
 900                                 jbd_unlock_bh_state(bh);
 901                                  /* needs a brelse */
 902                                 jbd2_journal_remove_journal_head(bh);
 903                                 release_buffer_page(bh);
 904                         } else
 905                                 jbd_unlock_bh_state(bh);
 906                 }
 907                 cond_resched_lock(&journal->j_list_lock);
 908         }
 909         spin_unlock(&journal->j_list_lock);
 910         /*
 911          * This is a bit sleazy.  We use j_list_lock to protect transition
 912          * of a transaction into T_FINISHED state and calling
 913          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 914          * other checkpointing code processing the transaction...
 915          */
 916         spin_lock(&journal->j_state_lock);
 917         spin_lock(&journal->j_list_lock);
 918         /*
 919          * Now recheck whether any buffers got attached to the transaction
 920          * while the lock was dropped...
 921          */
 922         if (commit_transaction->t_forget) {
 923                 spin_unlock(&journal->j_list_lock);
 924                 spin_unlock(&journal->j_state_lock);
 925                 goto restart_loop;
 926         }
 927
 928         /* Done with this transaction! */
 929
 930         jbd_debug(3, "JBD: commit phase 7\n");
 931
 932         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 933
 934         commit_transaction->t_start = jiffies;
 935         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
 936                                                 commit_transaction->t_start);
 937
 938         /*
 939          * File the transaction for history
 940          */
 941         stats.ts_type = JBD2_STATS_RUN;
 942         stats.ts_tid = commit_transaction->t_tid;
 943         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
 944         spin_lock(&journal->j_history_lock);
 945         memcpy(journal->j_history + journal->j_history_cur, &stats,
 946                         sizeof(stats));
 947         if (++journal->j_history_cur == journal->j_history_max)
 948                 journal->j_history_cur = 0;
 949
 950         /*
 951          * Calculate overall stats
 952          */
 953         journal->j_stats.ts_tid++;
 954         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
 955         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
 956         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
 957         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
 958         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
 959         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
 960         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
 961         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
 962         spin_unlock(&journal->j_history_lock);
 963
 964         commit_transaction->t_state = T_FINISHED;
 965         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 966         journal->j_commit_sequence = commit_transaction->t_tid;
 967         journal->j_committing_transaction = NULL;
 968         spin_unlock(&journal->j_state_lock);
 969
 970         if (commit_transaction->t_checkpoint_list == NULL &&
 971             commit_transaction->t_checkpoint_io_list == NULL) {
 972                 __jbd2_journal_drop_transaction(journal, commit_transaction);
 973         } else {
 974                 if (journal->j_checkpoint_transactions == NULL) {
 975                         journal->j_checkpoint_transactions = commit_transaction;
 976                         commit_transaction->t_cpnext = commit_transaction;
 977                         commit_transaction->t_cpprev = commit_transaction;
 978                 } else {
 979                         commit_transaction->t_cpnext =
 980                                 journal->j_checkpoint_transactions;
 981                         commit_transaction->t_cpprev =
 982                                 commit_transaction->t_cpnext->t_cpprev;
 983                         commit_transaction->t_cpnext->t_cpprev =
 984                                 commit_transaction;
 985                         commit_transaction->t_cpprev->t_cpnext =
 986                                 commit_transaction;
 987                 }
 988         }
 989         spin_unlock(&journal->j_list_lock);
 990
 991         trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
 992                    journal->j_devname, commit_transaction->t_tid,
 993                    journal->j_tail_sequence);
 994         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 995                   journal->j_commit_sequence, journal->j_tail_sequence);
 996
 997         wake_up(&journal->j_wait_done_commit);
 998 }