ext4: Add debugging markers that can be used by systemtap
[safe/jmp/linux-2.6] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/marker.h>
20 #include <linux/errno.h>
21 #include <linux/slab.h>
22 #include <linux/mm.h>
23 #include <linux/pagemap.h>
24 #include <linux/jiffies.h>
25 #include <linux/crc32.h>
26 #include <linux/writeback.h>
27 #include <linux/backing-dev.h>
28
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 *
 * Records the outcome of the IO in the buffer's uptodate bit (set when
 * @uptodate is non-zero, cleared otherwise) and then unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);
        unlock_buffer(bh);
}
41
42 /*
43  * When an ext4 file is truncated, it is possible that some pages are not
44  * successfully freed, because they are attached to a committing transaction.
45  * After the transaction commits, these pages are left on the LRU, with no
46  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
47  * by the VM, but their apparent absence upsets the VM accounting, and it makes
48  * the numbers in /proc/meminfo look odd.
49  *
50  * So here, we have a buffer which has just come off the forget list.  Look to
51  * see if we can strip all buffers from the backing page.
52  *
53  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
54  * caller provided us with a ref against the buffer, and we drop that here.
55  */
56 static void release_buffer_page(struct buffer_head *bh)
57 {
58         struct page *page;
59
60         if (buffer_dirty(bh))
61                 goto nope;
62         if (atomic_read(&bh->b_count) != 1)
63                 goto nope;
64         page = bh->b_page;
65         if (!page)
66                 goto nope;
67         if (page->mapping)
68                 goto nope;
69
70         /* OK, it's a truncated page */
71         if (!trylock_page(page))
72                 goto nope;
73
74         page_cache_get(page);
75         __brelse(bh);
76         try_to_free_buffers(page);
77         unlock_page(page);
78         page_cache_release(page);
79         return;
80
81 nope:
82         __brelse(bh);
83 }
84
85 /*
86  * Done it all: now submit the commit record.  We should have
87  * cleaned up our previous buffers by now, so if we are in abort
88  * mode we can now just skip the rest of the journal write
89  * entirely.
90  *
91  * Returns 1 if the journal needs to be aborted or 0 on success
92  */
93 static int journal_submit_commit_record(journal_t *journal,
94                                         transaction_t *commit_transaction,
95                                         struct buffer_head **cbh,
96                                         __u32 crc32_sum)
97 {
98         struct journal_head *descriptor;
99         struct commit_header *tmp;
100         struct buffer_head *bh;
101         int ret;
102         int barrier_done = 0;
103         struct timespec now = current_kernel_time();
104
105         if (is_journal_aborted(journal))
106                 return 0;
107
108         descriptor = jbd2_journal_get_descriptor_buffer(journal);
109         if (!descriptor)
110                 return 1;
111
112         bh = jh2bh(descriptor);
113
114         tmp = (struct commit_header *)bh->b_data;
115         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
116         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
117         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
118         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
119         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
120
121         if (JBD2_HAS_COMPAT_FEATURE(journal,
122                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
123                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
124                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
125                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
126         }
127
128         JBUFFER_TRACE(descriptor, "submit commit block");
129         lock_buffer(bh);
130         get_bh(bh);
131         set_buffer_dirty(bh);
132         set_buffer_uptodate(bh);
133         bh->b_end_io = journal_end_buffer_io_sync;
134
135         if (journal->j_flags & JBD2_BARRIER &&
136                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138                 set_buffer_ordered(bh);
139                 barrier_done = 1;
140         }
141         ret = submit_bh(WRITE, bh);
142         if (barrier_done)
143                 clear_buffer_ordered(bh);
144
145         /* is it possible for another commit to fail at roughly
146          * the same time as this one?  If so, we don't want to
147          * trust the barrier flag in the super, but instead want
148          * to remember if we sent a barrier request
149          */
150         if (ret == -EOPNOTSUPP && barrier_done) {
151                 printk(KERN_WARNING
152                        "JBD: barrier-based sync failed on %s - "
153                        "disabling barriers\n", journal->j_devname);
154                 spin_lock(&journal->j_state_lock);
155                 journal->j_flags &= ~JBD2_BARRIER;
156                 spin_unlock(&journal->j_state_lock);
157
158                 /* And try again, without the barrier */
159                 lock_buffer(bh);
160                 set_buffer_uptodate(bh);
161                 set_buffer_dirty(bh);
162                 ret = submit_bh(WRITE, bh);
163         }
164         *cbh = bh;
165         return ret;
166 }
167
168 /*
169  * This function along with journal_submit_commit_record
170  * allows to write the commit record asynchronously.
171  */
172 static int journal_wait_on_commit_record(struct buffer_head *bh)
173 {
174         int ret = 0;
175
176         clear_buffer_dirty(bh);
177         wait_on_buffer(bh);
178
179         if (unlikely(!buffer_uptodate(bh)))
180                 ret = -EIO;
181         put_bh(bh);            /* One for getblk() */
182         jbd2_journal_put_journal_head(bh2jh(bh));
183
184         return ret;
185 }
186
187 /*
188  * write the filemap data using writepage() address_space_operations.
189  * We don't do block allocation here even for delalloc. We don't
190  * use writepages() because with dealyed allocation we may be doing
191  * block allocation in writepages().
192  */
193 static int journal_submit_inode_data_buffers(struct address_space *mapping)
194 {
195         int ret;
196         struct writeback_control wbc = {
197                 .sync_mode =  WB_SYNC_ALL,
198                 .nr_to_write = mapping->nrpages * 2,
199                 .range_start = 0,
200                 .range_end = i_size_read(mapping->host),
201                 .for_writepages = 1,
202         };
203
204         ret = generic_writepages(mapping, &wbc);
205         return ret;
206 }
207
208 /*
209  * Submit all the data buffers of inode associated with the transaction to
210  * disk.
211  *
212  * We are in a committing transaction. Therefore no new inode can be added to
213  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
214  * operate on from being released while we write out pages.
215  */
216 static int journal_submit_data_buffers(journal_t *journal,
217                 transaction_t *commit_transaction)
218 {
219         struct jbd2_inode *jinode;
220         int err, ret = 0;
221         struct address_space *mapping;
222
223         spin_lock(&journal->j_list_lock);
224         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
225                 mapping = jinode->i_vfs_inode->i_mapping;
226                 jinode->i_flags |= JI_COMMIT_RUNNING;
227                 spin_unlock(&journal->j_list_lock);
228                 /*
229                  * submit the inode data buffers. We use writepage
230                  * instead of writepages. Because writepages can do
231                  * block allocation  with delalloc. We need to write
232                  * only allocated blocks here.
233                  */
234                 err = journal_submit_inode_data_buffers(mapping);
235                 if (!ret)
236                         ret = err;
237                 spin_lock(&journal->j_list_lock);
238                 J_ASSERT(jinode->i_transaction == commit_transaction);
239                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
240                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
241         }
242         spin_unlock(&journal->j_list_lock);
243         return ret;
244 }
245
246 /*
247  * Wait for data submitted for writeout, refile inodes to proper
248  * transaction if needed.
249  *
250  */
251 static int journal_finish_inode_data_buffers(journal_t *journal,
252                 transaction_t *commit_transaction)
253 {
254         struct jbd2_inode *jinode, *next_i;
255         int err, ret = 0;
256
257         /* For locking, see the comment in journal_submit_data_buffers() */
258         spin_lock(&journal->j_list_lock);
259         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
260                 jinode->i_flags |= JI_COMMIT_RUNNING;
261                 spin_unlock(&journal->j_list_lock);
262                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
263                 if (err) {
264                         /*
265                          * Because AS_EIO is cleared by
266                          * wait_on_page_writeback_range(), set it again so
267                          * that user process can get -EIO from fsync().
268                          */
269                         set_bit(AS_EIO,
270                                 &jinode->i_vfs_inode->i_mapping->flags);
271
272                         if (!ret)
273                                 ret = err;
274                 }
275                 spin_lock(&journal->j_list_lock);
276                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
277                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
278         }
279
280         /* Now refile inode to proper lists */
281         list_for_each_entry_safe(jinode, next_i,
282                                  &commit_transaction->t_inode_list, i_list) {
283                 list_del(&jinode->i_list);
284                 if (jinode->i_next_transaction) {
285                         jinode->i_transaction = jinode->i_next_transaction;
286                         jinode->i_next_transaction = NULL;
287                         list_add(&jinode->i_list,
288                                 &jinode->i_transaction->t_inode_list);
289                 } else {
290                         jinode->i_transaction = NULL;
291                 }
292         }
293         spin_unlock(&journal->j_list_lock);
294
295         return ret;
296 }
297
298 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
299 {
300         struct page *page = bh->b_page;
301         char *addr;
302         __u32 checksum;
303
304         addr = kmap_atomic(page, KM_USER0);
305         checksum = crc32_be(crc32_sum,
306                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
307         kunmap_atomic(addr, KM_USER0);
308
309         return checksum;
310 }
311
312 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
313                                    unsigned long long block)
314 {
315         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
316         if (tag_bytes > JBD2_TAG_SIZE32)
317                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
318 }
319
320 /*
321  * jbd2_journal_commit_transaction
322  *
323  * The primary function for committing a transaction to the log.  This
324  * function is called by the journal thread to begin a complete commit.
325  */
326 void jbd2_journal_commit_transaction(journal_t *journal)
327 {
328         struct transaction_stats_s stats;
329         transaction_t *commit_transaction;
330         struct journal_head *jh, *new_jh, *descriptor;
331         struct buffer_head **wbuf = journal->j_wbuf;
332         int bufs;
333         int flags;
334         int err;
335         unsigned long long blocknr;
336         char *tagp = NULL;
337         journal_header_t *header;
338         journal_block_tag_t *tag = NULL;
339         int space_left = 0;
340         int first_tag = 0;
341         int tag_flag;
342         int i;
343         int tag_bytes = journal_tag_bytes(journal);
344         struct buffer_head *cbh = NULL; /* For transactional checksums */
345         __u32 crc32_sum = ~0;
346
347         /*
348          * First job: lock down the current transaction and wait for
349          * all outstanding updates to complete.
350          */
351
352 #ifdef COMMIT_STATS
353         spin_lock(&journal->j_list_lock);
354         summarise_journal_usage(journal);
355         spin_unlock(&journal->j_list_lock);
356 #endif
357
358         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
359         if (journal->j_flags & JBD2_FLUSHED) {
360                 jbd_debug(3, "super block updated\n");
361                 jbd2_journal_update_superblock(journal, 1);
362         } else {
363                 jbd_debug(3, "superblock not updated\n");
364         }
365
366         J_ASSERT(journal->j_running_transaction != NULL);
367         J_ASSERT(journal->j_committing_transaction == NULL);
368
369         commit_transaction = journal->j_running_transaction;
370         J_ASSERT(commit_transaction->t_state == T_RUNNING);
371
372         trace_mark(jbd2_start_commit, "dev %s transaction %d",
373                    journal->j_devname, commit_transaction->t_tid);
374         jbd_debug(1, "JBD: starting commit of transaction %d\n",
375                         commit_transaction->t_tid);
376
377         spin_lock(&journal->j_state_lock);
378         commit_transaction->t_state = T_LOCKED;
379
380         stats.u.run.rs_wait = commit_transaction->t_max_wait;
381         stats.u.run.rs_locked = jiffies;
382         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
383                                                 stats.u.run.rs_locked);
384
385         spin_lock(&commit_transaction->t_handle_lock);
386         while (commit_transaction->t_updates) {
387                 DEFINE_WAIT(wait);
388
389                 prepare_to_wait(&journal->j_wait_updates, &wait,
390                                         TASK_UNINTERRUPTIBLE);
391                 if (commit_transaction->t_updates) {
392                         spin_unlock(&commit_transaction->t_handle_lock);
393                         spin_unlock(&journal->j_state_lock);
394                         schedule();
395                         spin_lock(&journal->j_state_lock);
396                         spin_lock(&commit_transaction->t_handle_lock);
397                 }
398                 finish_wait(&journal->j_wait_updates, &wait);
399         }
400         spin_unlock(&commit_transaction->t_handle_lock);
401
402         J_ASSERT (commit_transaction->t_outstanding_credits <=
403                         journal->j_max_transaction_buffers);
404
405         /*
406          * First thing we are allowed to do is to discard any remaining
407          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
408          * that there are no such buffers: if a large filesystem
409          * operation like a truncate needs to split itself over multiple
410          * transactions, then it may try to do a jbd2_journal_restart() while
411          * there are still BJ_Reserved buffers outstanding.  These must
412          * be released cleanly from the current transaction.
413          *
414          * In this case, the filesystem must still reserve write access
415          * again before modifying the buffer in the new transaction, but
416          * we do not require it to remember exactly which old buffers it
417          * has reserved.  This is consistent with the existing behaviour
418          * that multiple jbd2_journal_get_write_access() calls to the same
419          * buffer are perfectly permissible.
420          */
421         while (commit_transaction->t_reserved_list) {
422                 jh = commit_transaction->t_reserved_list;
423                 JBUFFER_TRACE(jh, "reserved, unused: refile");
424                 /*
425                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
426                  * leave undo-committed data.
427                  */
428                 if (jh->b_committed_data) {
429                         struct buffer_head *bh = jh2bh(jh);
430
431                         jbd_lock_bh_state(bh);
432                         jbd2_free(jh->b_committed_data, bh->b_size);
433                         jh->b_committed_data = NULL;
434                         jbd_unlock_bh_state(bh);
435                 }
436                 jbd2_journal_refile_buffer(journal, jh);
437         }
438
439         /*
440          * Now try to drop any written-back buffers from the journal's
441          * checkpoint lists.  We do this *before* commit because it potentially
442          * frees some memory
443          */
444         spin_lock(&journal->j_list_lock);
445         __jbd2_journal_clean_checkpoint_list(journal);
446         spin_unlock(&journal->j_list_lock);
447
448         jbd_debug (3, "JBD: commit phase 1\n");
449
450         /*
451          * Switch to a new revoke table.
452          */
453         jbd2_journal_switch_revoke_table(journal);
454
455         stats.u.run.rs_flushing = jiffies;
456         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
457                                                stats.u.run.rs_flushing);
458
459         commit_transaction->t_state = T_FLUSH;
460         journal->j_committing_transaction = commit_transaction;
461         journal->j_running_transaction = NULL;
462         commit_transaction->t_log_start = journal->j_head;
463         wake_up(&journal->j_wait_transaction_locked);
464         spin_unlock(&journal->j_state_lock);
465
466         jbd_debug (3, "JBD: commit phase 2\n");
467
468         /*
469          * Now start flushing things to disk, in the order they appear
470          * on the transaction lists.  Data blocks go first.
471          */
472         err = journal_submit_data_buffers(journal, commit_transaction);
473         if (err)
474                 jbd2_journal_abort(journal, err);
475
476         jbd2_journal_write_revoke_records(journal, commit_transaction);
477
478         jbd_debug(3, "JBD: commit phase 2\n");
479
480         /*
481          * Way to go: we have now written out all of the data for a
482          * transaction!  Now comes the tricky part: we need to write out
483          * metadata.  Loop over the transaction's entire buffer list:
484          */
485         spin_lock(&journal->j_state_lock);
486         commit_transaction->t_state = T_COMMIT;
487         spin_unlock(&journal->j_state_lock);
488
489         stats.u.run.rs_logging = jiffies;
490         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
491                                                  stats.u.run.rs_logging);
492         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
493         stats.u.run.rs_blocks_logged = 0;
494
495         J_ASSERT(commit_transaction->t_nr_buffers <=
496                  commit_transaction->t_outstanding_credits);
497
498         err = 0;
499         descriptor = NULL;
500         bufs = 0;
501         while (commit_transaction->t_buffers) {
502
503                 /* Find the next buffer to be journaled... */
504
505                 jh = commit_transaction->t_buffers;
506
507                 /* If we're in abort mode, we just un-journal the buffer and
508                    release it for background writing. */
509
510                 if (is_journal_aborted(journal)) {
511                         JBUFFER_TRACE(jh, "journal is aborting: refile");
512                         jbd2_journal_refile_buffer(journal, jh);
513                         /* If that was the last one, we need to clean up
514                          * any descriptor buffers which may have been
515                          * already allocated, even if we are now
516                          * aborting. */
517                         if (!commit_transaction->t_buffers)
518                                 goto start_journal_io;
519                         continue;
520                 }
521
522                 /* Make sure we have a descriptor block in which to
523                    record the metadata buffer. */
524
525                 if (!descriptor) {
526                         struct buffer_head *bh;
527
528                         J_ASSERT (bufs == 0);
529
530                         jbd_debug(4, "JBD: get descriptor\n");
531
532                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
533                         if (!descriptor) {
534                                 jbd2_journal_abort(journal, -EIO);
535                                 continue;
536                         }
537
538                         bh = jh2bh(descriptor);
539                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
540                                 (unsigned long long)bh->b_blocknr, bh->b_data);
541                         header = (journal_header_t *)&bh->b_data[0];
542                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
543                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
544                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
545
546                         tagp = &bh->b_data[sizeof(journal_header_t)];
547                         space_left = bh->b_size - sizeof(journal_header_t);
548                         first_tag = 1;
549                         set_buffer_jwrite(bh);
550                         set_buffer_dirty(bh);
551                         wbuf[bufs++] = bh;
552
553                         /* Record it so that we can wait for IO
554                            completion later */
555                         BUFFER_TRACE(bh, "ph3: file as descriptor");
556                         jbd2_journal_file_buffer(descriptor, commit_transaction,
557                                         BJ_LogCtl);
558                 }
559
560                 /* Where is the buffer to be written? */
561
562                 err = jbd2_journal_next_log_block(journal, &blocknr);
563                 /* If the block mapping failed, just abandon the buffer
564                    and repeat this loop: we'll fall into the
565                    refile-on-abort condition above. */
566                 if (err) {
567                         jbd2_journal_abort(journal, err);
568                         continue;
569                 }
570
571                 /*
572                  * start_this_handle() uses t_outstanding_credits to determine
573                  * the free space in the log, but this counter is changed
574                  * by jbd2_journal_next_log_block() also.
575                  */
576                 commit_transaction->t_outstanding_credits--;
577
578                 /* Bump b_count to prevent truncate from stumbling over
579                    the shadowed buffer!  @@@ This can go if we ever get
580                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
581                 atomic_inc(&jh2bh(jh)->b_count);
582
583                 /* Make a temporary IO buffer with which to write it out
584                    (this will requeue both the metadata buffer and the
585                    temporary IO buffer). new_bh goes on BJ_IO*/
586
587                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
588                 /*
589                  * akpm: jbd2_journal_write_metadata_buffer() sets
590                  * new_bh->b_transaction to commit_transaction.
591                  * We need to clean this up before we release new_bh
592                  * (which is of type BJ_IO)
593                  */
594                 JBUFFER_TRACE(jh, "ph3: write metadata");
595                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
596                                                       jh, &new_jh, blocknr);
597                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
598                 wbuf[bufs++] = jh2bh(new_jh);
599
600                 /* Record the new block's tag in the current descriptor
601                    buffer */
602
603                 tag_flag = 0;
604                 if (flags & 1)
605                         tag_flag |= JBD2_FLAG_ESCAPE;
606                 if (!first_tag)
607                         tag_flag |= JBD2_FLAG_SAME_UUID;
608
609                 tag = (journal_block_tag_t *) tagp;
610                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
611                 tag->t_flags = cpu_to_be32(tag_flag);
612                 tagp += tag_bytes;
613                 space_left -= tag_bytes;
614
615                 if (first_tag) {
616                         memcpy (tagp, journal->j_uuid, 16);
617                         tagp += 16;
618                         space_left -= 16;
619                         first_tag = 0;
620                 }
621
622                 /* If there's no more to do, or if the descriptor is full,
623                    let the IO rip! */
624
625                 if (bufs == journal->j_wbufsize ||
626                     commit_transaction->t_buffers == NULL ||
627                     space_left < tag_bytes + 16) {
628
629                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
630
631                         /* Write an end-of-descriptor marker before
632                            submitting the IOs.  "tag" still points to
633                            the last tag we set up. */
634
635                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
636
637 start_journal_io:
638                         for (i = 0; i < bufs; i++) {
639                                 struct buffer_head *bh = wbuf[i];
640                                 /*
641                                  * Compute checksum.
642                                  */
643                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
644                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
645                                         crc32_sum =
646                                             jbd2_checksum_data(crc32_sum, bh);
647                                 }
648
649                                 lock_buffer(bh);
650                                 clear_buffer_dirty(bh);
651                                 set_buffer_uptodate(bh);
652                                 bh->b_end_io = journal_end_buffer_io_sync;
653                                 submit_bh(WRITE, bh);
654                         }
655                         cond_resched();
656                         stats.u.run.rs_blocks_logged += bufs;
657
658                         /* Force a new descriptor to be generated next
659                            time round the loop. */
660                         descriptor = NULL;
661                         bufs = 0;
662                 }
663         }
664
665         /* Done it all: now write the commit record asynchronously. */
666
667         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
668                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
669                 err = journal_submit_commit_record(journal, commit_transaction,
670                                                  &cbh, crc32_sum);
671                 if (err)
672                         __jbd2_journal_abort_hard(journal);
673         }
674
675         /*
676          * This is the right place to wait for data buffers both for ASYNC
677          * and !ASYNC commit. If commit is ASYNC, we need to wait only after
678          * the commit block went to disk (which happens above). If commit is
679          * SYNC, we need to wait for data buffers before we start writing
680          * commit block, which happens below in such setting.
681          */
682         err = journal_finish_inode_data_buffers(journal, commit_transaction);
683         if (err) {
684                 printk(KERN_WARNING
685                         "JBD2: Detected IO errors while flushing file data "
686                        "on %s\n", journal->j_devname);
687                 err = 0;
688         }
689
690         /* Lo and behold: we have just managed to send a transaction to
691            the log.  Before we can commit it, wait for the IO so far to
692            complete.  Control buffers being written are on the
693            transaction's t_log_list queue, and metadata buffers are on
694            the t_iobuf_list queue.
695
696            Wait for the buffers in reverse order.  That way we are
697            less likely to be woken up until all IOs have completed, and
698            so we incur less scheduling load.
699         */
700
701         jbd_debug(3, "JBD: commit phase 3\n");
702
703         /*
704          * akpm: these are BJ_IO, and j_list_lock is not needed.
705          * See __journal_try_to_free_buffer.
706          */
707 wait_for_iobuf:
708         while (commit_transaction->t_iobuf_list != NULL) {
709                 struct buffer_head *bh;
710
711                 jh = commit_transaction->t_iobuf_list->b_tprev;
712                 bh = jh2bh(jh);
713                 if (buffer_locked(bh)) {
714                         wait_on_buffer(bh);
715                         goto wait_for_iobuf;
716                 }
717                 if (cond_resched())
718                         goto wait_for_iobuf;
719
720                 if (unlikely(!buffer_uptodate(bh)))
721                         err = -EIO;
722
723                 clear_buffer_jwrite(bh);
724
725                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
726                 jbd2_journal_unfile_buffer(journal, jh);
727
728                 /*
729                  * ->t_iobuf_list should contain only dummy buffer_heads
730                  * which were created by jbd2_journal_write_metadata_buffer().
731                  */
732                 BUFFER_TRACE(bh, "dumping temporary bh");
733                 jbd2_journal_put_journal_head(jh);
734                 __brelse(bh);
735                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
736                 free_buffer_head(bh);
737
738                 /* We also have to unlock and free the corresponding
739                    shadowed buffer */
740                 jh = commit_transaction->t_shadow_list->b_tprev;
741                 bh = jh2bh(jh);
742                 clear_bit(BH_JWrite, &bh->b_state);
743                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
744
745                 /* The metadata is now released for reuse, but we need
746                    to remember it against this transaction so that when
747                    we finally commit, we can do any checkpointing
748                    required. */
749                 JBUFFER_TRACE(jh, "file as BJ_Forget");
750                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
751                 /* Wake up any transactions which were waiting for this
752                    IO to complete */
753                 wake_up_bit(&bh->b_state, BH_Unshadow);
754                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
755                 __brelse(bh);
756         }
757
758         J_ASSERT (commit_transaction->t_shadow_list == NULL);
759
760         jbd_debug(3, "JBD: commit phase 4\n");
761
762         /* Here we wait for the revoke record and descriptor record buffers */
763  wait_for_ctlbuf:
764         while (commit_transaction->t_log_list != NULL) {
765                 struct buffer_head *bh;
766
767                 jh = commit_transaction->t_log_list->b_tprev;
768                 bh = jh2bh(jh);
769                 if (buffer_locked(bh)) {
770                         wait_on_buffer(bh);
771                         goto wait_for_ctlbuf;
772                 }
773                 if (cond_resched())
774                         goto wait_for_ctlbuf;
775
776                 if (unlikely(!buffer_uptodate(bh)))
777                         err = -EIO;
778
779                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
780                 clear_buffer_jwrite(bh);
781                 jbd2_journal_unfile_buffer(journal, jh);
782                 jbd2_journal_put_journal_head(jh);
783                 __brelse(bh);           /* One for getblk */
784                 /* AKPM: bforget here */
785         }
786
787         jbd_debug(3, "JBD: commit phase 5\n");
788
789         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
790                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
791                 err = journal_submit_commit_record(journal, commit_transaction,
792                                                 &cbh, crc32_sum);
793                 if (err)
794                         __jbd2_journal_abort_hard(journal);
795         }
796         if (!err && !is_journal_aborted(journal))
797                 err = journal_wait_on_commit_record(cbh);
798
799         if (err)
800                 jbd2_journal_abort(journal, err);
801
802         /* End of a transaction!  Finally, we can do checkpoint
803            processing: any buffers committed as a result of this
804            transaction can be removed from any checkpoint list it was on
805            before. */
806
807         jbd_debug(3, "JBD: commit phase 6\n");
808
809         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
810         J_ASSERT(commit_transaction->t_buffers == NULL);
811         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
812         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
813         J_ASSERT(commit_transaction->t_shadow_list == NULL);
814         J_ASSERT(commit_transaction->t_log_list == NULL);
815
816 restart_loop:
817         /*
818          * As there are other places (journal_unmap_buffer()) adding buffers
819          * to this list we have to be careful and hold the j_list_lock.
820          */
821         spin_lock(&journal->j_list_lock);
822         while (commit_transaction->t_forget) {
823                 transaction_t *cp_transaction;
824                 struct buffer_head *bh;
825
826                 jh = commit_transaction->t_forget;
827                 spin_unlock(&journal->j_list_lock);
828                 bh = jh2bh(jh);
829                 jbd_lock_bh_state(bh);
830                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
831                         jh->b_transaction == journal->j_running_transaction);
832
833                 /*
834                  * If there is undo-protected committed data against
835                  * this buffer, then we can remove it now.  If it is a
836                  * buffer needing such protection, the old frozen_data
837                  * field now points to a committed version of the
838                  * buffer, so rotate that field to the new committed
839                  * data.
840                  *
841                  * Otherwise, we can just throw away the frozen data now.
842                  */
843                 if (jh->b_committed_data) {
844                         jbd2_free(jh->b_committed_data, bh->b_size);
845                         jh->b_committed_data = NULL;
846                         if (jh->b_frozen_data) {
847                                 jh->b_committed_data = jh->b_frozen_data;
848                                 jh->b_frozen_data = NULL;
849                         }
850                 } else if (jh->b_frozen_data) {
851                         jbd2_free(jh->b_frozen_data, bh->b_size);
852                         jh->b_frozen_data = NULL;
853                 }
854
855                 spin_lock(&journal->j_list_lock);
856                 cp_transaction = jh->b_cp_transaction;
857                 if (cp_transaction) {
858                         JBUFFER_TRACE(jh, "remove from old cp transaction");
859                         cp_transaction->t_chp_stats.cs_dropped++;
860                         __jbd2_journal_remove_checkpoint(jh);
861                 }
862
863                 /* Only re-checkpoint the buffer_head if it is marked
864                  * dirty.  If the buffer was added to the BJ_Forget list
865                  * by jbd2_journal_forget, it may no longer be dirty and
866                  * there's no point in keeping a checkpoint record for
867                  * it. */
868
869                 /* A buffer which has been freed while still being
870                  * journaled by a previous transaction may end up still
871                  * being dirty here, but we want to avoid writing back
872                  * that buffer in the future now that the last use has
873                  * been committed.  That's not only a performance gain,
874                  * it also stops aliasing problems if the buffer is left
875                  * behind for writeback and gets reallocated for another
876                  * use in a different page. */
877                 if (buffer_freed(bh)) {
878                         clear_buffer_freed(bh);
879                         clear_buffer_jbddirty(bh);
880                 }
881
882                 if (buffer_jbddirty(bh)) {
883                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
884                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
885                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
886                         __jbd2_journal_refile_buffer(jh);
887                         jbd_unlock_bh_state(bh);
888                 } else {
889                         J_ASSERT_BH(bh, !buffer_dirty(bh));
890                         /* The buffer on BJ_Forget list and not jbddirty means
891                          * it has been freed by this transaction and hence it
892                          * could not have been reallocated until this
893                          * transaction has committed. *BUT* it could be
894                          * reallocated once we have written all the data to
895                          * disk and before we process the buffer on BJ_Forget
896                          * list. */
897                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
898                         __jbd2_journal_refile_buffer(jh);
899                         if (!jh->b_transaction) {
900                                 jbd_unlock_bh_state(bh);
901                                  /* needs a brelse */
902                                 jbd2_journal_remove_journal_head(bh);
903                                 release_buffer_page(bh);
904                         } else
905                                 jbd_unlock_bh_state(bh);
906                 }
907                 cond_resched_lock(&journal->j_list_lock);
908         }
909         spin_unlock(&journal->j_list_lock);
910         /*
911          * This is a bit sleazy.  We use j_list_lock to protect transition
912          * of a transaction into T_FINISHED state and calling
913          * __jbd2_journal_drop_transaction(). Otherwise we could race with
914          * other checkpointing code processing the transaction...
915          */
916         spin_lock(&journal->j_state_lock);
917         spin_lock(&journal->j_list_lock);
918         /*
919          * Now recheck if some buffers did not get attached to the transaction
920          * while the lock was dropped...
921          */
922         if (commit_transaction->t_forget) {
923                 spin_unlock(&journal->j_list_lock);
924                 spin_unlock(&journal->j_state_lock);
925                 goto restart_loop;
926         }
927
928         /* Done with this transaction! */
929
930         jbd_debug(3, "JBD: commit phase 7\n");
931
932         J_ASSERT(commit_transaction->t_state == T_COMMIT);
933
934         commit_transaction->t_start = jiffies;
935         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
936                                                 commit_transaction->t_start);
937
938         /*
939          * File the transaction for history
940          */
941         stats.ts_type = JBD2_STATS_RUN;
942         stats.ts_tid = commit_transaction->t_tid;
943         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
944         spin_lock(&journal->j_history_lock);
945         memcpy(journal->j_history + journal->j_history_cur, &stats,
946                         sizeof(stats));
947         if (++journal->j_history_cur == journal->j_history_max)
948                 journal->j_history_cur = 0;
949
950         /*
951          * Calculate overall stats
952          */
953         journal->j_stats.ts_tid++;
954         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
955         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
956         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
957         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
958         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
959         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
960         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
961         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
962         spin_unlock(&journal->j_history_lock);
963
964         commit_transaction->t_state = T_FINISHED;
965         J_ASSERT(commit_transaction == journal->j_committing_transaction);
966         journal->j_commit_sequence = commit_transaction->t_tid;
967         journal->j_committing_transaction = NULL;
968         spin_unlock(&journal->j_state_lock);
969
970         if (commit_transaction->t_checkpoint_list == NULL &&
971             commit_transaction->t_checkpoint_io_list == NULL) {
972                 __jbd2_journal_drop_transaction(journal, commit_transaction);
973         } else {
974                 if (journal->j_checkpoint_transactions == NULL) {
975                         journal->j_checkpoint_transactions = commit_transaction;
976                         commit_transaction->t_cpnext = commit_transaction;
977                         commit_transaction->t_cpprev = commit_transaction;
978                 } else {
979                         commit_transaction->t_cpnext =
980                                 journal->j_checkpoint_transactions;
981                         commit_transaction->t_cpprev =
982                                 commit_transaction->t_cpnext->t_cpprev;
983                         commit_transaction->t_cpnext->t_cpprev =
984                                 commit_transaction;
985                         commit_transaction->t_cpprev->t_cpnext =
986                                 commit_transaction;
987                 }
988         }
989         spin_unlock(&journal->j_list_lock);
990
991         trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
992                    journal->j_devname, commit_transaction->t_tid,
993                    journal->j_tail_sequence);
994         jbd_debug(1, "JBD: commit %d complete, head %d\n",
995                   journal->j_commit_sequence, journal->j_tail_sequence);
996
997         wake_up(&journal->j_wait_done_commit);
998 }