SAFE public projects git trees. - safe/jmp/linux-2.6/blob - fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27
  28 /*
  29  * Default IO end handler for temporary BJ_IO buffer_heads.
  30  */
  31 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  32 {
  33         BUFFER_TRACE(bh, "");
  34         if (uptodate)
  35                 set_buffer_uptodate(bh);
  36         else
  37                 clear_buffer_uptodate(bh);
  38         unlock_buffer(bh);
  39 }
  40
  41 /*
  42  * When an ext4 file is truncated, it is possible that some pages are not
  43  * successfully freed, because they are attached to a committing transaction.
  44  * After the transaction commits, these pages are left on the LRU, with no
  45  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  46  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  47  * the numbers in /proc/meminfo look odd.
  48  *
  49  * So here, we have a buffer which has just come off the forget list.  Look to
  50  * see if we can strip all buffers from the backing page.
  51  *
  52  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  53  * caller provided us with a ref against the buffer, and we drop that here.
  54  */
  55 static void release_buffer_page(struct buffer_head *bh)
  56 {
  57         struct page *page;
  58
  59         if (buffer_dirty(bh))
  60                 goto nope;
  61         if (atomic_read(&bh->b_count) != 1)
  62                 goto nope;
  63         page = bh->b_page;
  64         if (!page)
  65                 goto nope;
  66         if (page->mapping)
  67                 goto nope;
  68
  69         /* OK, it's a truncated page */
  70         if (TestSetPageLocked(page))
  71                 goto nope;
  72
  73         page_cache_get(page);
  74         __brelse(bh);
  75         try_to_free_buffers(page);
  76         unlock_page(page);
  77         page_cache_release(page);
  78         return;
  79
  80 nope:
  81         __brelse(bh);
  82 }
  83
  84 /*
  85  * Done it all: now submit the commit record.  We should have
  86  * cleaned up our previous buffers by now, so if we are in abort
  87  * mode we can now just skip the rest of the journal write
  88  * entirely.
  89  *
  90  * Returns 1 if the journal needs to be aborted or 0 on success
  91  */
  92 static int journal_submit_commit_record(journal_t *journal,
  93                                         transaction_t *commit_transaction,
  94                                         struct buffer_head **cbh,
  95                                         __u32 crc32_sum)
  96 {
  97         struct journal_head *descriptor;
  98         struct commit_header *tmp;
  99         struct buffer_head *bh;
 100         int ret;
 101         int barrier_done = 0;
 102         struct timespec now = current_kernel_time();
 103
 104         if (is_journal_aborted(journal))
 105                 return 0;
 106
 107         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 108         if (!descriptor)
 109                 return 1;
 110
 111         bh = jh2bh(descriptor);
 112
 113         tmp = (struct commit_header *)bh->b_data;
 114         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 115         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 116         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 117         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 118         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 119
 120         if (JBD2_HAS_COMPAT_FEATURE(journal,
 121                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 122                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 123                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 124                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 125         }
 126
 127         JBUFFER_TRACE(descriptor, "submit commit block");
 128         lock_buffer(bh);
 129         get_bh(bh);
 130         set_buffer_dirty(bh);
 131         set_buffer_uptodate(bh);
 132         bh->b_end_io = journal_end_buffer_io_sync;
 133
 134         if (journal->j_flags & JBD2_BARRIER &&
 135                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
 136                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 137                 set_buffer_ordered(bh);
 138                 barrier_done = 1;
 139         }
 140         ret = submit_bh(WRITE, bh);
 141         if (barrier_done)
 142                 clear_buffer_ordered(bh);
 143
 144         /* is it possible for another commit to fail at roughly
 145          * the same time as this one?  If so, we don't want to
 146          * trust the barrier flag in the super, but instead want
 147          * to remember if we sent a barrier request
 148          */
 149         if (ret == -EOPNOTSUPP && barrier_done) {
 150                 char b[BDEVNAME_SIZE];
 151
 152                 printk(KERN_WARNING
 153                         "JBD: barrier-based sync failed on %s - "
 154                         "disabling barriers\n",
 155                         bdevname(journal->j_dev, b));
 156                 spin_lock(&journal->j_state_lock);
 157                 journal->j_flags &= ~JBD2_BARRIER;
 158                 spin_unlock(&journal->j_state_lock);
 159
 160                 /* And try again, without the barrier */
 161                 lock_buffer(bh);
 162                 set_buffer_uptodate(bh);
 163                 set_buffer_dirty(bh);
 164                 ret = submit_bh(WRITE, bh);
 165         }
 166         *cbh = bh;
 167         return ret;
 168 }
 169
 170 /*
 171  * This function along with journal_submit_commit_record
 172  * allows to write the commit record asynchronously.
 173  */
 174 static int journal_wait_on_commit_record(struct buffer_head *bh)
 175 {
 176         int ret = 0;
 177
 178         clear_buffer_dirty(bh);
 179         wait_on_buffer(bh);
 180
 181         if (unlikely(!buffer_uptodate(bh)))
 182                 ret = -EIO;
 183         put_bh(bh);            /* One for getblk() */
 184         jbd2_journal_put_journal_head(bh2jh(bh));
 185
 186         return ret;
 187 }
 188
 189 /*
 190  * write the filemap data using writepage() address_space_operations.
 191  * We don't do block allocation here even for delalloc. We don't
 192  * use writepages() because with dealyed allocation we may be doing
 193  * block allocation in writepages().
 194  */
 195 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 196 {
 197         int ret;
 198         struct writeback_control wbc = {
 199                 .sync_mode =  WB_SYNC_ALL,
 200                 .nr_to_write = mapping->nrpages * 2,
 201                 .range_start = 0,
 202                 .range_end = i_size_read(mapping->host),
 203                 .for_writepages = 1,
 204         };
 205
 206         ret = generic_writepages(mapping, &wbc);
 207         return ret;
 208 }
 209
 210 /*
 211  * Submit all the data buffers of inode associated with the transaction to
 212  * disk.
 213  *
 214  * We are in a committing transaction. Therefore no new inode can be added to
 215  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 216  * operate on from being released while we write out pages.
 217  */
 218 static int journal_submit_data_buffers(journal_t *journal,
 219                 transaction_t *commit_transaction)
 220 {
 221         struct jbd2_inode *jinode;
 222         int err, ret = 0;
 223         struct address_space *mapping;
 224
 225         spin_lock(&journal->j_list_lock);
 226         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 227                 mapping = jinode->i_vfs_inode->i_mapping;
 228                 jinode->i_flags |= JI_COMMIT_RUNNING;
 229                 spin_unlock(&journal->j_list_lock);
 230                 /*
 231                  * submit the inode data buffers. We use writepage
 232                  * instead of writepages. Because writepages can do
 233                  * block allocation  with delalloc. We need to write
 234                  * only allocated blocks here.
 235                  */
 236                 err = journal_submit_inode_data_buffers(mapping);
 237                 if (!ret)
 238                         ret = err;
 239                 spin_lock(&journal->j_list_lock);
 240                 J_ASSERT(jinode->i_transaction == commit_transaction);
 241                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 242                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 243         }
 244         spin_unlock(&journal->j_list_lock);
 245         return ret;
 246 }
 247
 248 /*
 249  * Wait for data submitted for writeout, refile inodes to proper
 250  * transaction if needed.
 251  *
 252  */
 253 static int journal_finish_inode_data_buffers(journal_t *journal,
 254                 transaction_t *commit_transaction)
 255 {
 256         struct jbd2_inode *jinode, *next_i;
 257         int err, ret = 0;
 258
 259         /* For locking, see the comment in journal_submit_data_buffers() */
 260         spin_lock(&journal->j_list_lock);
 261         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 262                 jinode->i_flags |= JI_COMMIT_RUNNING;
 263                 spin_unlock(&journal->j_list_lock);
 264                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 265                 if (err) {
 266                         /*
 267                          * Because AS_EIO is cleared by
 268                          * wait_on_page_writeback_range(), set it again so
 269                          * that user process can get -EIO from fsync().
 270                          */
 271                         set_bit(AS_EIO,
 272                                 &jinode->i_vfs_inode->i_mapping->flags);
 273
 274                         if (!ret)
 275                                 ret = err;
 276                 }
 277                 spin_lock(&journal->j_list_lock);
 278                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 279                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 280         }
 281
 282         /* Now refile inode to proper lists */
 283         list_for_each_entry_safe(jinode, next_i,
 284                                  &commit_transaction->t_inode_list, i_list) {
 285                 list_del(&jinode->i_list);
 286                 if (jinode->i_next_transaction) {
 287                         jinode->i_transaction = jinode->i_next_transaction;
 288                         jinode->i_next_transaction = NULL;
 289                         list_add(&jinode->i_list,
 290                                 &jinode->i_transaction->t_inode_list);
 291                 } else {
 292                         jinode->i_transaction = NULL;
 293                 }
 294         }
 295         spin_unlock(&journal->j_list_lock);
 296
 297         return ret;
 298 }
 299
 300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 301 {
 302         struct page *page = bh->b_page;
 303         char *addr;
 304         __u32 checksum;
 305
 306         addr = kmap_atomic(page, KM_USER0);
 307         checksum = crc32_be(crc32_sum,
 308                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 309         kunmap_atomic(addr, KM_USER0);
 310
 311         return checksum;
 312 }
 313
 314 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 315                                    unsigned long long block)
 316 {
 317         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 318         if (tag_bytes > JBD2_TAG_SIZE32)
 319                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 320 }
 321
 322 /*
 323  * jbd2_journal_commit_transaction
 324  *
 325  * The primary function for committing a transaction to the log.  This
 326  * function is called by the journal thread to begin a complete commit.
 327  */
 328 void jbd2_journal_commit_transaction(journal_t *journal)
 329 {
 330         struct transaction_stats_s stats;
 331         transaction_t *commit_transaction;
 332         struct journal_head *jh, *new_jh, *descriptor;
 333         struct buffer_head **wbuf = journal->j_wbuf;
 334         int bufs;
 335         int flags;
 336         int err;
 337         unsigned long long blocknr;
 338         char *tagp = NULL;
 339         journal_header_t *header;
 340         journal_block_tag_t *tag = NULL;
 341         int space_left = 0;
 342         int first_tag = 0;
 343         int tag_flag;
 344         int i;
 345         int tag_bytes = journal_tag_bytes(journal);
 346         struct buffer_head *cbh = NULL; /* For transactional checksums */
 347         __u32 crc32_sum = ~0;
 348
 349         /*
 350          * First job: lock down the current transaction and wait for
 351          * all outstanding updates to complete.
 352          */
 353
 354 #ifdef COMMIT_STATS
 355         spin_lock(&journal->j_list_lock);
 356         summarise_journal_usage(journal);
 357         spin_unlock(&journal->j_list_lock);
 358 #endif
 359
 360         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 361         if (journal->j_flags & JBD2_FLUSHED) {
 362                 jbd_debug(3, "super block updated\n");
 363                 jbd2_journal_update_superblock(journal, 1);
 364         } else {
 365                 jbd_debug(3, "superblock not updated\n");
 366         }
 367
 368         J_ASSERT(journal->j_running_transaction != NULL);
 369         J_ASSERT(journal->j_committing_transaction == NULL);
 370
 371         commit_transaction = journal->j_running_transaction;
 372         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 373
 374         jbd_debug(1, "JBD: starting commit of transaction %d\n",
 375                         commit_transaction->t_tid);
 376
 377         spin_lock(&journal->j_state_lock);
 378         commit_transaction->t_state = T_LOCKED;
 379
 380         stats.u.run.rs_wait = commit_transaction->t_max_wait;
 381         stats.u.run.rs_locked = jiffies;
 382         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 383                                                 stats.u.run.rs_locked);
 384
 385         spin_lock(&commit_transaction->t_handle_lock);
 386         while (commit_transaction->t_updates) {
 387                 DEFINE_WAIT(wait);
 388
 389                 prepare_to_wait(&journal->j_wait_updates, &wait,
 390                                         TASK_UNINTERRUPTIBLE);
 391                 if (commit_transaction->t_updates) {
 392                         spin_unlock(&commit_transaction->t_handle_lock);
 393                         spin_unlock(&journal->j_state_lock);
 394                         schedule();
 395                         spin_lock(&journal->j_state_lock);
 396                         spin_lock(&commit_transaction->t_handle_lock);
 397                 }
 398                 finish_wait(&journal->j_wait_updates, &wait);
 399         }
 400         spin_unlock(&commit_transaction->t_handle_lock);
 401
 402         J_ASSERT (commit_transaction->t_outstanding_credits <=
 403                         journal->j_max_transaction_buffers);
 404
 405         /*
 406          * First thing we are allowed to do is to discard any remaining
 407          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 408          * that there are no such buffers: if a large filesystem
 409          * operation like a truncate needs to split itself over multiple
 410          * transactions, then it may try to do a jbd2_journal_restart() while
 411          * there are still BJ_Reserved buffers outstanding.  These must
 412          * be released cleanly from the current transaction.
 413          *
 414          * In this case, the filesystem must still reserve write access
 415          * again before modifying the buffer in the new transaction, but
 416          * we do not require it to remember exactly which old buffers it
 417          * has reserved.  This is consistent with the existing behaviour
 418          * that multiple jbd2_journal_get_write_access() calls to the same
  419          * buffer are perfectly permissible.
 420          */
 421         while (commit_transaction->t_reserved_list) {
 422                 jh = commit_transaction->t_reserved_list;
 423                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 424                 /*
 425                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 426                  * leave undo-committed data.
 427                  */
 428                 if (jh->b_committed_data) {
 429                         struct buffer_head *bh = jh2bh(jh);
 430
 431                         jbd_lock_bh_state(bh);
 432                         jbd2_free(jh->b_committed_data, bh->b_size);
 433                         jh->b_committed_data = NULL;
 434                         jbd_unlock_bh_state(bh);
 435                 }
 436                 jbd2_journal_refile_buffer(journal, jh);
 437         }
 438
 439         /*
 440          * Now try to drop any written-back buffers from the journal's
 441          * checkpoint lists.  We do this *before* commit because it potentially
 442          * frees some memory
 443          */
 444         spin_lock(&journal->j_list_lock);
 445         __jbd2_journal_clean_checkpoint_list(journal);
 446         spin_unlock(&journal->j_list_lock);
 447
 448         jbd_debug (3, "JBD: commit phase 1\n");
 449
 450         /*
 451          * Switch to a new revoke table.
 452          */
 453         jbd2_journal_switch_revoke_table(journal);
 454
 455         stats.u.run.rs_flushing = jiffies;
 456         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
 457                                                stats.u.run.rs_flushing);
 458
 459         commit_transaction->t_state = T_FLUSH;
 460         journal->j_committing_transaction = commit_transaction;
 461         journal->j_running_transaction = NULL;
 462         commit_transaction->t_log_start = journal->j_head;
 463         wake_up(&journal->j_wait_transaction_locked);
 464         spin_unlock(&journal->j_state_lock);
 465
 466         jbd_debug (3, "JBD: commit phase 2\n");
 467
 468         /*
 469          * Now start flushing things to disk, in the order they appear
 470          * on the transaction lists.  Data blocks go first.
 471          */
 472         err = journal_submit_data_buffers(journal, commit_transaction);
 473         if (err)
 474                 jbd2_journal_abort(journal, err);
 475
 476         jbd2_journal_write_revoke_records(journal, commit_transaction);
 477
 478         jbd_debug(3, "JBD: commit phase 2\n");
 479
 480         /*
 481          * Way to go: we have now written out all of the data for a
 482          * transaction!  Now comes the tricky part: we need to write out
 483          * metadata.  Loop over the transaction's entire buffer list:
 484          */
 485         spin_lock(&journal->j_state_lock);
 486         commit_transaction->t_state = T_COMMIT;
 487         spin_unlock(&journal->j_state_lock);
 488
 489         stats.u.run.rs_logging = jiffies;
 490         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
 491                                                  stats.u.run.rs_logging);
 492         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
 493         stats.u.run.rs_blocks_logged = 0;
 494
 495         J_ASSERT(commit_transaction->t_nr_buffers <=
 496                  commit_transaction->t_outstanding_credits);
 497
 498         err = 0;
 499         descriptor = NULL;
 500         bufs = 0;
 501         while (commit_transaction->t_buffers) {
 502
 503                 /* Find the next buffer to be journaled... */
 504
 505                 jh = commit_transaction->t_buffers;
 506
 507                 /* If we're in abort mode, we just un-journal the buffer and
 508                    release it for background writing. */
 509
 510                 if (is_journal_aborted(journal)) {
 511                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 512                         jbd2_journal_refile_buffer(journal, jh);
 513                         /* If that was the last one, we need to clean up
 514                          * any descriptor buffers which may have been
 515                          * already allocated, even if we are now
 516                          * aborting. */
 517                         if (!commit_transaction->t_buffers)
 518                                 goto start_journal_io;
 519                         continue;
 520                 }
 521
 522                 /* Make sure we have a descriptor block in which to
 523                    record the metadata buffer. */
 524
 525                 if (!descriptor) {
 526                         struct buffer_head *bh;
 527
 528                         J_ASSERT (bufs == 0);
 529
 530                         jbd_debug(4, "JBD: get descriptor\n");
 531
 532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 533                         if (!descriptor) {
 534                                 jbd2_journal_abort(journal, -EIO);
 535                                 continue;
 536                         }
 537
 538                         bh = jh2bh(descriptor);
 539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
 540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 541                         header = (journal_header_t *)&bh->b_data[0];
 542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 545
 546                         tagp = &bh->b_data[sizeof(journal_header_t)];
 547                         space_left = bh->b_size - sizeof(journal_header_t);
 548                         first_tag = 1;
 549                         set_buffer_jwrite(bh);
 550                         set_buffer_dirty(bh);
 551                         wbuf[bufs++] = bh;
 552
 553                         /* Record it so that we can wait for IO
 554                            completion later */
 555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 557                                         BJ_LogCtl);
 558                 }
 559
 560                 /* Where is the buffer to be written? */
 561
 562                 err = jbd2_journal_next_log_block(journal, &blocknr);
 563                 /* If the block mapping failed, just abandon the buffer
 564                    and repeat this loop: we'll fall into the
 565                    refile-on-abort condition above. */
 566                 if (err) {
 567                         jbd2_journal_abort(journal, err);
 568                         continue;
 569                 }
 570
 571                 /*
 572                  * start_this_handle() uses t_outstanding_credits to determine
 573                  * the free space in the log, but this counter is changed
 574                  * by jbd2_journal_next_log_block() also.
 575                  */
 576                 commit_transaction->t_outstanding_credits--;
 577
 578                 /* Bump b_count to prevent truncate from stumbling over
 579                    the shadowed buffer!  @@@ This can go if we ever get
 580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 581                 atomic_inc(&jh2bh(jh)->b_count);
 582
 583                 /* Make a temporary IO buffer with which to write it out
 584                    (this will requeue both the metadata buffer and the
 585                    temporary IO buffer). new_bh goes on BJ_IO*/
 586
 587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 588                 /*
 589                  * akpm: jbd2_journal_write_metadata_buffer() sets
 590                  * new_bh->b_transaction to commit_transaction.
 591                  * We need to clean this up before we release new_bh
 592                  * (which is of type BJ_IO)
 593                  */
 594                 JBUFFER_TRACE(jh, "ph3: write metadata");
 595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 596                                                       jh, &new_jh, blocknr);
 597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 598                 wbuf[bufs++] = jh2bh(new_jh);
 599
 600                 /* Record the new block's tag in the current descriptor
 601                    buffer */
 602
 603                 tag_flag = 0;
 604                 if (flags & 1)
 605                         tag_flag |= JBD2_FLAG_ESCAPE;
 606                 if (!first_tag)
 607                         tag_flag |= JBD2_FLAG_SAME_UUID;
 608
 609                 tag = (journal_block_tag_t *) tagp;
 610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 611                 tag->t_flags = cpu_to_be32(tag_flag);
 612                 tagp += tag_bytes;
 613                 space_left -= tag_bytes;
 614
 615                 if (first_tag) {
 616                         memcpy (tagp, journal->j_uuid, 16);
 617                         tagp += 16;
 618                         space_left -= 16;
 619                         first_tag = 0;
 620                 }
 621
 622                 /* If there's no more to do, or if the descriptor is full,
 623                    let the IO rip! */
 624
 625                 if (bufs == journal->j_wbufsize ||
 626                     commit_transaction->t_buffers == NULL ||
 627                     space_left < tag_bytes + 16) {
 628
 629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
 630
 631                         /* Write an end-of-descriptor marker before
 632                            submitting the IOs.  "tag" still points to
 633                            the last tag we set up. */
 634
 635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 636
 637 start_journal_io:
 638                         for (i = 0; i < bufs; i++) {
 639                                 struct buffer_head *bh = wbuf[i];
 640                                 /*
 641                                  * Compute checksum.
 642                                  */
 643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 645                                         crc32_sum =
 646                                             jbd2_checksum_data(crc32_sum, bh);
 647                                 }
 648
 649                                 lock_buffer(bh);
 650                                 clear_buffer_dirty(bh);
 651                                 set_buffer_uptodate(bh);
 652                                 bh->b_end_io = journal_end_buffer_io_sync;
 653                                 submit_bh(WRITE, bh);
 654                         }
 655                         cond_resched();
 656                         stats.u.run.rs_blocks_logged += bufs;
 657
 658                         /* Force a new descriptor to be generated next
 659                            time round the loop. */
 660                         descriptor = NULL;
 661                         bufs = 0;
 662                 }
 663         }
 664
 665         /* Done it all: now write the commit record asynchronously. */
 666
 667         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 668                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 669                 err = journal_submit_commit_record(journal, commit_transaction,
 670                                                  &cbh, crc32_sum);
 671                 if (err)
 672                         __jbd2_journal_abort_hard(journal);
 673         }
 674
 675         /*
 676          * This is the right place to wait for data buffers both for ASYNC
 677          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
 678          * the commit block went to disk (which happens above). If commit is
 679          * SYNC, we need to wait for data buffers before we start writing
 680          * commit block, which happens below in such setting.
 681          */
 682         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 683         if (err) {
 684                 char b[BDEVNAME_SIZE];
 685
 686                 printk(KERN_WARNING
 687                         "JBD2: Detected IO errors while flushing file data "
 688                         "on %s\n", bdevname(journal->j_fs_dev, b));
 689                 err = 0;
 690         }
 691
 692         /* Lo and behold: we have just managed to send a transaction to
 693            the log.  Before we can commit it, wait for the IO so far to
 694            complete.  Control buffers being written are on the
 695            transaction's t_log_list queue, and metadata buffers are on
 696            the t_iobuf_list queue.
 697
 698            Wait for the buffers in reverse order.  That way we are
 699            less likely to be woken up until all IOs have completed, and
 700            so we incur less scheduling load.
 701         */
 702
 703         jbd_debug(3, "JBD: commit phase 3\n");
 704
 705         /*
 706          * akpm: these are BJ_IO, and j_list_lock is not needed.
 707          * See __journal_try_to_free_buffer.
 708          */
 709 wait_for_iobuf:
 710         while (commit_transaction->t_iobuf_list != NULL) {
 711                 struct buffer_head *bh;
 712
 713                 jh = commit_transaction->t_iobuf_list->b_tprev;
 714                 bh = jh2bh(jh);
 715                 if (buffer_locked(bh)) {
 716                         wait_on_buffer(bh);
 717                         goto wait_for_iobuf;
 718                 }
 719                 if (cond_resched())
 720                         goto wait_for_iobuf;
 721
 722                 if (unlikely(!buffer_uptodate(bh)))
 723                         err = -EIO;
 724
 725                 clear_buffer_jwrite(bh);
 726
 727                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 728                 jbd2_journal_unfile_buffer(journal, jh);
 729
 730                 /*
 731                  * ->t_iobuf_list should contain only dummy buffer_heads
 732                  * which were created by jbd2_journal_write_metadata_buffer().
 733                  */
 734                 BUFFER_TRACE(bh, "dumping temporary bh");
 735                 jbd2_journal_put_journal_head(jh);
 736                 __brelse(bh);
 737                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 738                 free_buffer_head(bh);
 739
 740                 /* We also have to unlock and free the corresponding
 741                    shadowed buffer */
 742                 jh = commit_transaction->t_shadow_list->b_tprev;
 743                 bh = jh2bh(jh);
 744                 clear_bit(BH_JWrite, &bh->b_state);
 745                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 746
 747                 /* The metadata is now released for reuse, but we need
 748                    to remember it against this transaction so that when
 749                    we finally commit, we can do any checkpointing
 750                    required. */
 751                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 752                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 753                 /* Wake up any transactions which were waiting for this
 754                    IO to complete */
 755                 wake_up_bit(&bh->b_state, BH_Unshadow);
 756                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 757                 __brelse(bh);
 758         }
 759
 760         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 761
 762         jbd_debug(3, "JBD: commit phase 4\n");
 763
 764         /* Here we wait for the revoke record and descriptor record buffers */
 765  wait_for_ctlbuf:
 766         while (commit_transaction->t_log_list != NULL) {
 767                 struct buffer_head *bh;
 768
 769                 jh = commit_transaction->t_log_list->b_tprev;
 770                 bh = jh2bh(jh);
 771                 if (buffer_locked(bh)) {
 772                         wait_on_buffer(bh);
 773                         goto wait_for_ctlbuf;
 774                 }
 775                 if (cond_resched())
 776                         goto wait_for_ctlbuf;
 777
 778                 if (unlikely(!buffer_uptodate(bh)))
 779                         err = -EIO;
 780
 781                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 782                 clear_buffer_jwrite(bh);
 783                 jbd2_journal_unfile_buffer(journal, jh);
 784                 jbd2_journal_put_journal_head(jh);
 785                 __brelse(bh);           /* One for getblk */
 786                 /* AKPM: bforget here */
 787         }
 788
 789         jbd_debug(3, "JBD: commit phase 5\n");
 790
 791         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 792                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 793                 err = journal_submit_commit_record(journal, commit_transaction,
 794                                                 &cbh, crc32_sum);
 795                 if (err)
 796                         __jbd2_journal_abort_hard(journal);
 797         }
 798         if (!err && !is_journal_aborted(journal))
 799                 err = journal_wait_on_commit_record(cbh);
 800
 801         if (err)
 802                 jbd2_journal_abort(journal, err);
 803
 804         /* End of a transaction!  Finally, we can do checkpoint
 805            processing: any buffers committed as a result of this
 806            transaction can be removed from any checkpoint lists they
 807            were on before. */
 808
 809         jbd_debug(3, "JBD: commit phase 6\n");
 810
 811         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 812         J_ASSERT(commit_transaction->t_buffers == NULL);
 813         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 814         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 815         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 816         J_ASSERT(commit_transaction->t_log_list == NULL);
 817
 818 restart_loop:
 819         /*
 820          * As there are other places (journal_unmap_buffer()) adding buffers
 821          * to this list we have to be careful and hold the j_list_lock.
 822          */
 823         spin_lock(&journal->j_list_lock);
 824         while (commit_transaction->t_forget) {
 825                 transaction_t *cp_transaction;
 826                 struct buffer_head *bh;
 827
 828                 jh = commit_transaction->t_forget;
 829                 spin_unlock(&journal->j_list_lock);
 830                 bh = jh2bh(jh);
 831                 jbd_lock_bh_state(bh);
 832                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
 833                         jh->b_transaction == journal->j_running_transaction);
 834
 835                 /*
 836                  * If there is undo-protected committed data against
 837                  * this buffer, then we can remove it now.  If it is a
 838                  * buffer needing such protection, the old frozen_data
 839                  * field now points to a committed version of the
 840                  * buffer, so rotate that field to the new committed
 841                  * data.
 842                  *
 843                  * Otherwise, we can just throw away the frozen data now.
 844                  */
 845                 if (jh->b_committed_data) {
 846                         jbd2_free(jh->b_committed_data, bh->b_size);
 847                         jh->b_committed_data = NULL;
 848                         if (jh->b_frozen_data) {
 849                                 jh->b_committed_data = jh->b_frozen_data;
 850                                 jh->b_frozen_data = NULL;
 851                         }
 852                 } else if (jh->b_frozen_data) {
 853                         jbd2_free(jh->b_frozen_data, bh->b_size);
 854                         jh->b_frozen_data = NULL;
 855                 }
 856
 857                 spin_lock(&journal->j_list_lock);
 858                 cp_transaction = jh->b_cp_transaction;
 859                 if (cp_transaction) {
 860                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 861                         cp_transaction->t_chp_stats.cs_dropped++;
 862                         __jbd2_journal_remove_checkpoint(jh);
 863                 }
 864
 865                 /* Only re-checkpoint the buffer_head if it is marked
 866                  * dirty.  If the buffer was added to the BJ_Forget list
 867                  * by jbd2_journal_forget, it may no longer be dirty and
 868                  * there's no point in keeping a checkpoint record for
 869                  * it. */
 870
 871                 /* A buffer which has been freed while still being
 872                  * journaled by a previous transaction may end up still
 873                  * being dirty here, but we want to avoid writing back
 874                  * that buffer in the future now that the last use has
 875                  * been committed.  That's not only a performance gain,
 876                  * it also stops aliasing problems if the buffer is left
 877                  * behind for writeback and gets reallocated for another
 878                  * use in a different page. */
 879                 if (buffer_freed(bh)) {
 880                         clear_buffer_freed(bh);
 881                         clear_buffer_jbddirty(bh);
 882                 }
 883
 884                 if (buffer_jbddirty(bh)) {
 885                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 886                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 887                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 888                         __jbd2_journal_refile_buffer(jh);
 889                         jbd_unlock_bh_state(bh);
 890                 } else {
 891                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 892                         /* A buffer that is on the BJ_Forget list but is not
 893                          * jbddirty has been freed by this transaction, and
 894                          * hence could not have been reallocated before this
 895                          * transaction committed. *BUT* it could have been
 896                          * reallocated after we wrote all the data to disk
 897                          * and before we processed it here on the BJ_Forget
 898                          * list. */
 899                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
 900                         __jbd2_journal_refile_buffer(jh);
 901                         if (!jh->b_transaction) {
 902                                 jbd_unlock_bh_state(bh);
 903                                  /* needs a brelse */
 904                                 jbd2_journal_remove_journal_head(bh);
 905                                 release_buffer_page(bh);
 906                         } else
 907                                 jbd_unlock_bh_state(bh);
 908                 }
 909                 cond_resched_lock(&journal->j_list_lock);
 910         }
 911         spin_unlock(&journal->j_list_lock);
 912         /*
 913          * This is a bit sleazy.  We use j_list_lock to protect transition
 914          * of a transaction into T_FINISHED state and calling
 915          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 916          * other checkpointing code processing the transaction...
 917          */
 918         spin_lock(&journal->j_state_lock);
 919         spin_lock(&journal->j_list_lock);
 920         /*
 921          * Now recheck whether any buffers got attached to the transaction
 922          * while the lock was dropped...
 923          */
 924         if (commit_transaction->t_forget) {
 925                 spin_unlock(&journal->j_list_lock);
 926                 spin_unlock(&journal->j_state_lock);
 927                 goto restart_loop;
 928         }
 929
 930         /* Done with this transaction! */
 931
 932         jbd_debug(3, "JBD: commit phase 7\n");
 933
 934         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 935
 936         commit_transaction->t_start = jiffies;
 937         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
 938                                                 commit_transaction->t_start);
 939
 940         /*
 941          * File the transaction for history
 942          */
 943         stats.ts_type = JBD2_STATS_RUN;
 944         stats.ts_tid = commit_transaction->t_tid;
 945         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
 946         spin_lock(&journal->j_history_lock);
 947         memcpy(journal->j_history + journal->j_history_cur, &stats,
 948                         sizeof(stats));
 949         if (++journal->j_history_cur == journal->j_history_max)
 950                 journal->j_history_cur = 0;
 951
 952         /*
 953          * Calculate overall stats
 954          */
 955         journal->j_stats.ts_tid++;
 956         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
 957         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
 958         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
 959         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
 960         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
 961         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
 962         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
 963         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
 964         spin_unlock(&journal->j_history_lock);
 965
 966         commit_transaction->t_state = T_FINISHED;
 967         J_ASSERT(commit_transaction == journal->j_committing_transaction);
 968         journal->j_commit_sequence = commit_transaction->t_tid;
 969         journal->j_committing_transaction = NULL;
 970         spin_unlock(&journal->j_state_lock);
 971
 972         if (commit_transaction->t_checkpoint_list == NULL &&
 973             commit_transaction->t_checkpoint_io_list == NULL) {
 974                 __jbd2_journal_drop_transaction(journal, commit_transaction);
 975         } else {
 976                 if (journal->j_checkpoint_transactions == NULL) {
 977                         journal->j_checkpoint_transactions = commit_transaction;
 978                         commit_transaction->t_cpnext = commit_transaction;
 979                         commit_transaction->t_cpprev = commit_transaction;
 980                 } else {
 981                         commit_transaction->t_cpnext =
 982                                 journal->j_checkpoint_transactions;
 983                         commit_transaction->t_cpprev =
 984                                 commit_transaction->t_cpnext->t_cpprev;
 985                         commit_transaction->t_cpnext->t_cpprev =
 986                                 commit_transaction;
 987                         commit_transaction->t_cpprev->t_cpnext =
 988                                 commit_transaction;
 989                 }
 990         }
 991         spin_unlock(&journal->j_list_lock);
 992
 993         jbd_debug(1, "JBD: commit %d complete, head %d\n",
 994                   journal->j_commit_sequence, journal->j_tail_sequence);
 995
 996         wake_up(&journal->j_wait_done_commit);
 997 }