1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52         bh->b_end_io = handler;
53         bh->b_private = private;
54 }
55 EXPORT_SYMBOL(init_buffer);
56
57 static int sync_buffer(void *word)
58 {
59         struct block_device *bd;
60         struct buffer_head *bh
61                 = container_of(word, struct buffer_head, b_state);
62
63         smp_mb();
64         bd = bh->b_bdev;
65         if (bd)
66                 blk_run_address_space(bd->bd_inode->i_mapping);
67         io_schedule();
68         return 0;
69 }
70
71 void __lock_buffer(struct buffer_head *bh)
72 {
73         wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
74                                                         TASK_UNINTERRUPTIBLE);
75 }
76 EXPORT_SYMBOL(__lock_buffer);
77
78 void unlock_buffer(struct buffer_head *bh)
79 {
80         clear_bit_unlock(BH_Lock, &bh->b_state);
81         smp_mb__after_clear_bit();
82         wake_up_bit(&bh->b_state, BH_Lock);
83 }
84 EXPORT_SYMBOL(unlock_buffer);
85
86 /*
87  * Block until a buffer comes unlocked.  This doesn't stop it
88  * from becoming locked again - you have to lock it yourself
89  * if you want to preserve its state.
90  */
91 void __wait_on_buffer(struct buffer_head * bh)
92 {
93         wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
94 }
95 EXPORT_SYMBOL(__wait_on_buffer);
96
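/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of the
 * distinction drawn above: __wait_on_buffer() only waits, so the buffer may
 * be locked again immediately afterwards, while lock_buffer() pins it for
 * the caller until unlock_buffer().
 */
static void __maybe_unused example_wait_then_lock(struct buffer_head *bh)
{
	wait_on_buffer(bh);	/* may be re-locked by someone else right away */
	lock_buffer(bh);	/* exclusive until unlock_buffer() */
	/* ... inspect or modify bh->b_data here ... */
	unlock_buffer(bh);
}
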
97 static void
98 __clear_page_buffers(struct page *page)
99 {
100         ClearPagePrivate(page);
101         set_page_private(page, 0);
102         page_cache_release(page);
103 }
104
105
106 static int quiet_error(struct buffer_head *bh)
107 {
108         if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
109                 return 0;
110         return 1;
111 }
112
113
114 static void buffer_io_error(struct buffer_head *bh)
115 {
116         char b[BDEVNAME_SIZE];
117         printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
118                         bdevname(bh->b_bdev, b),
119                         (unsigned long long)bh->b_blocknr);
120 }
121
122 /*
123  * End-of-IO handler helper function which does not touch the bh after
124  * unlocking it.
125  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
126  * a race there is benign: unlock_buffer() only uses the bh's address for
127  * hashing after unlocking the buffer, so it doesn't actually touch the bh
128  * itself.
129  */
130 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
131 {
132         if (uptodate) {
133                 set_buffer_uptodate(bh);
134         } else {
135                 /* This happens due to failed READA attempts. */
136                 clear_buffer_uptodate(bh);
137         }
138         unlock_buffer(bh);
139 }
140
141 /*
142  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
143  * unlock the buffer. This is what ll_rw_block uses too.
144  */
145 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
146 {
147         __end_buffer_read_notouch(bh, uptodate);
148         put_bh(bh);
149 }
150 EXPORT_SYMBOL(end_buffer_read_sync);
151
152 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
153 {
154         char b[BDEVNAME_SIZE];
155
156         if (uptodate) {
157                 set_buffer_uptodate(bh);
158         } else {
159                 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
160                         buffer_io_error(bh);
161                         printk(KERN_WARNING "lost page write due to "
162                                         "I/O error on %s\n",
163                                        bdevname(bh->b_bdev, b));
164                 }
165                 set_buffer_write_io_error(bh);
166                 clear_buffer_uptodate(bh);
167         }
168         unlock_buffer(bh);
169         put_bh(bh);
170 }
171 EXPORT_SYMBOL(end_buffer_write_sync);
172
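/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of the
 * synchronous-read pattern these completion handlers support; __bread_slow()
 * further down does essentially the same thing.
 */
static int __maybe_unused example_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);		/* reference dropped by end_buffer_read_sync() */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}
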
173 /*
174  * Various filesystems appear to want __find_get_block to be non-blocking.
175  * But it's the page lock which protects the buffers.  To get around this,
176  * we get exclusion from try_to_free_buffers with the blockdev mapping's
177  * private_lock.
178  *
179  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
180  * may be quite high.  This code could TryLock the page, and if that
181  * succeeds, there is no need to take private_lock. (But if
182  * private_lock is contended then so is mapping->tree_lock).
183  */
184 static struct buffer_head *
185 __find_get_block_slow(struct block_device *bdev, sector_t block)
186 {
187         struct inode *bd_inode = bdev->bd_inode;
188         struct address_space *bd_mapping = bd_inode->i_mapping;
189         struct buffer_head *ret = NULL;
190         pgoff_t index;
191         struct buffer_head *bh;
192         struct buffer_head *head;
193         struct page *page;
194         int all_mapped = 1;
195
196         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
197         page = find_get_page(bd_mapping, index);
198         if (!page)
199                 goto out;
200
201         spin_lock(&bd_mapping->private_lock);
202         if (!page_has_buffers(page))
203                 goto out_unlock;
204         head = page_buffers(page);
205         bh = head;
206         do {
207                 if (!buffer_mapped(bh))
208                         all_mapped = 0;
209                 else if (bh->b_blocknr == block) {
210                         ret = bh;
211                         get_bh(bh);
212                         goto out_unlock;
213                 }
214                 bh = bh->b_this_page;
215         } while (bh != head);
216
217         /* we might be here because some of the buffers on this page are
218          * not mapped.  This is due to various races between
219          * file I/O on the block device and getblk.  It gets dealt with
220          * elsewhere; don't buffer_error if we had some unmapped buffers
221          */
222         if (all_mapped) {
223                 printk("__find_get_block_slow() failed. "
224                         "block=%llu, b_blocknr=%llu\n",
225                         (unsigned long long)block,
226                         (unsigned long long)bh->b_blocknr);
227                 printk("b_state=0x%08lx, b_size=%zu\n",
228                         bh->b_state, bh->b_size);
229                 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
230         }
231 out_unlock:
232         spin_unlock(&bd_mapping->private_lock);
233         page_cache_release(page);
234 out:
235         return ret;
236 }
237
238 /* If invalidate_buffers() will trash dirty buffers, it means some kind
239    of fs corruption is going on. Trashing dirty data always implies losing
240    information that was supposed to be just stored on the physical layer
241    by the user.
242
243    Thus invalidate_buffers in general usage is not allowed to trash
244    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
245    be preserved.  These buffers are simply skipped.
246   
247    We also skip buffers which are still in use.  For example this can
248    happen if a userspace program is reading the block device.
249
250    NOTE: In the case where the user removed a removable-media disk even if
251    there's still dirty data not synced on disk (due to a bug in the device
252    driver or to an error by the user), by not destroying the dirty buffers we
253    could also generate corruption on the next media inserted, thus a parameter
254    is necessary to handle this case in the safest way possible (trying
255    not to also corrupt the new disk inserted with the data belonging to
256    the old, now-corrupted disk). Also for the ramdisk the natural thing
257    to do in order to release the ramdisk memory is to destroy dirty buffers.
258
259    These are two special cases. Normal usage implies that the device driver
260    issues a sync on the device (without waiting for I/O completion) and
261    then an invalidate_buffers call that doesn't trash dirty buffers.
262
263    For handling cache coherency with the blkdev pagecache the 'update' case
264    has been introduced. It is needed to re-read from disk any pinned
265    buffer. NOTE: re-reading from disk is destructive so we can do it only
266    when we assume nobody is changing the buffercache under our I/O and when
267    we think the disk contains more recent information than the buffercache.
268    The update == 1 pass marks the buffers we need to update, the update == 2
269    pass does the actual I/O. */
270 void invalidate_bdev(struct block_device *bdev)
271 {
272         struct address_space *mapping = bdev->bd_inode->i_mapping;
273
274         if (mapping->nrpages == 0)
275                 return;
276
277         invalidate_bh_lrus();
278         invalidate_mapping_pages(mapping, 0, -1);
279 }
280 EXPORT_SYMBOL(invalidate_bdev);
281
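/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of the
 * "normal usage" the comment above describes: start writeback of the block
 * device's pagecache without waiting, then drop whatever is clean.
 */
static int __maybe_unused example_sync_then_invalidate(struct block_device *bdev)
{
	/* queue dirty pagecache for I/O, do not wait for completion */
	int err = filemap_flush(bdev->bd_inode->i_mapping);

	/* drop clean pages and buffer heads; dirty/busy ones are skipped */
	invalidate_bdev(bdev);
	return err;
}
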
282 /*
283  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
284  */
285 static void free_more_memory(void)
286 {
287         struct zone *zone;
288         int nid;
289
290         wakeup_flusher_threads(1024);
291         yield();
292
293         for_each_online_node(nid) {
294                 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
295                                                 gfp_zone(GFP_NOFS), NULL,
296                                                 &zone);
297                 if (zone)
298                         try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
299                                                 GFP_NOFS, NULL);
300         }
301 }
302
303 /*
304  * I/O completion handler for block_read_full_page() - pages
305  * which come unlocked at the end of I/O.
306  */
307 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
308 {
309         unsigned long flags;
310         struct buffer_head *first;
311         struct buffer_head *tmp;
312         struct page *page;
313         int page_uptodate = 1;
314
315         BUG_ON(!buffer_async_read(bh));
316
317         page = bh->b_page;
318         if (uptodate) {
319                 set_buffer_uptodate(bh);
320         } else {
321                 clear_buffer_uptodate(bh);
322                 if (!quiet_error(bh))
323                         buffer_io_error(bh);
324                 SetPageError(page);
325         }
326
327         /*
328          * Be _very_ careful from here on. Bad things can happen if
329          * two buffer heads end IO at almost the same time and both
330          * decide that the page is now completely done.
331          */
332         first = page_buffers(page);
333         local_irq_save(flags);
334         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
335         clear_buffer_async_read(bh);
336         unlock_buffer(bh);
337         tmp = bh;
338         do {
339                 if (!buffer_uptodate(tmp))
340                         page_uptodate = 0;
341                 if (buffer_async_read(tmp)) {
342                         BUG_ON(!buffer_locked(tmp));
343                         goto still_busy;
344                 }
345                 tmp = tmp->b_this_page;
346         } while (tmp != bh);
347         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
348         local_irq_restore(flags);
349
350         /*
351          * If none of the buffers had errors and they are all
352          * uptodate then we can set the page uptodate.
353          */
354         if (page_uptodate && !PageError(page))
355                 SetPageUptodate(page);
356         unlock_page(page);
357         return;
358
359 still_busy:
360         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
361         local_irq_restore(flags);
362         return;
363 }
364
365 /*
366  * Completion handler for block_write_full_page() - pages which are unlocked
367  * during I/O, and which have PageWriteback cleared upon I/O completion.
368  */
369 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
370 {
371         char b[BDEVNAME_SIZE];
372         unsigned long flags;
373         struct buffer_head *first;
374         struct buffer_head *tmp;
375         struct page *page;
376
377         BUG_ON(!buffer_async_write(bh));
378
379         page = bh->b_page;
380         if (uptodate) {
381                 set_buffer_uptodate(bh);
382         } else {
383                 if (!quiet_error(bh)) {
384                         buffer_io_error(bh);
385                         printk(KERN_WARNING "lost page write due to "
386                                         "I/O error on %s\n",
387                                bdevname(bh->b_bdev, b));
388                 }
389                 set_bit(AS_EIO, &page->mapping->flags);
390                 set_buffer_write_io_error(bh);
391                 clear_buffer_uptodate(bh);
392                 SetPageError(page);
393         }
394
395         first = page_buffers(page);
396         local_irq_save(flags);
397         bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
398
399         clear_buffer_async_write(bh);
400         unlock_buffer(bh);
401         tmp = bh->b_this_page;
402         while (tmp != bh) {
403                 if (buffer_async_write(tmp)) {
404                         BUG_ON(!buffer_locked(tmp));
405                         goto still_busy;
406                 }
407                 tmp = tmp->b_this_page;
408         }
409         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
410         local_irq_restore(flags);
411         end_page_writeback(page);
412         return;
413
414 still_busy:
415         bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
416         local_irq_restore(flags);
417         return;
418 }
419 EXPORT_SYMBOL(end_buffer_async_write);
420
421 /*
422  * If a page's buffers are under async read-in (end_buffer_async_read
423  * completion) then there is a possibility that another thread of
424  * control could lock one of the buffers after it has completed
425  * but while some of the other buffers have not completed.  This
426  * locked buffer would confuse end_buffer_async_read() into not unlocking
427  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
428  * that this buffer is not under async I/O.
429  *
430  * The page comes unlocked when it has no locked buffer_async buffers
431  * left.
432  *
433  * PageLocked prevents anyone from starting new async I/O against any of
434  * the buffers.
435  *
436  * PageWriteback is used to prevent simultaneous writeout of the same
437  * page.
438  *
439  * PageLocked prevents anyone from starting writeback of a page which is
440  * under read I/O (PageWriteback is only ever set against a locked page).
441  */
442 static void mark_buffer_async_read(struct buffer_head *bh)
443 {
444         bh->b_end_io = end_buffer_async_read;
445         set_buffer_async_read(bh);
446 }
447
448 static void mark_buffer_async_write_endio(struct buffer_head *bh,
449                                           bh_end_io_t *handler)
450 {
451         bh->b_end_io = handler;
452         set_buffer_async_write(bh);
453 }
454
455 void mark_buffer_async_write(struct buffer_head *bh)
456 {
457         mark_buffer_async_write_endio(bh, end_buffer_async_write);
458 }
459 EXPORT_SYMBOL(mark_buffer_async_write);
460
461
462 /*
463  * fs/buffer.c contains helper functions for buffer-backed address space's
464  * fsync functions.  A common requirement for buffer-based filesystems is
465  * that certain data from the backing blockdev needs to be written out for
466  * a successful fsync().  For example, ext2 indirect blocks need to be
467  * written back and waited upon before fsync() returns.
468  *
469  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
470  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
471  * management of a list of dependent buffers at ->i_mapping->private_list.
472  *
473  * Locking is a little subtle: try_to_free_buffers() will remove buffers
474  * from their controlling inode's queue when they are being freed.  But
475  * try_to_free_buffers() will be operating against the *blockdev* mapping
476  * at the time, not against the S_ISREG file which depends on those buffers.
477  * So the locking for private_list is via the private_lock in the address_space
478  * which backs the buffers.  Which is different from the address_space 
479  * against which the buffers are listed.  So for a particular address_space,
480  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
481  * mapping->private_list will always be protected by the backing blockdev's
482  * ->private_lock.
483  *
484  * Which introduces a requirement: all buffers on an address_space's
485  * ->private_list must be from the same address_space: the blockdev's.
486  *
487  * address_spaces which do not place buffers at ->private_list via these
488  * utility functions are free to use private_lock and private_list for
489  * whatever they want.  The only requirement is that list_empty(private_list)
490  * be true at clear_inode() time.
491  *
492  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
493  * filesystems should do that.  invalidate_inode_buffers() should just go
494  * BUG_ON(!list_empty).
495  *
496  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
497  * take an address_space, not an inode.  And it should be called
498  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
499  * queued up.
500  *
501  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
502  * list if it is already on a list.  Because if the buffer is on a list,
503  * it *must* already be on the right one.  If not, the filesystem is being
504  * silly.  This will save a ton of locking.  But first we have to ensure
505  * that buffers are taken *off* the old inode's list when they are freed
506  * (presumably in truncate).  That requires careful auditing of all
507  * filesystems (do it inside bforget()).  It could also be done by bringing
508  * b_inode back.
509  */
510
511 /*
512  * The buffer's backing address_space's private_lock must be held
513  */
514 static void __remove_assoc_queue(struct buffer_head *bh)
515 {
516         list_del_init(&bh->b_assoc_buffers);
517         WARN_ON(!bh->b_assoc_map);
518         if (buffer_write_io_error(bh))
519                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
520         bh->b_assoc_map = NULL;
521 }
522
523 int inode_has_buffers(struct inode *inode)
524 {
525         return !list_empty(&inode->i_data.private_list);
526 }
527
528 /*
529  * osync is designed to support O_SYNC io.  It waits synchronously for
530  * all already-submitted IO to complete, but does not queue any new
531  * writes to the disk.
532  *
533  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
534  * you dirty the buffers, and then use osync_inode_buffers to wait for
535  * completion.  Any other dirty buffers which are not yet queued for
536  * write will not be flushed to disk by the osync.
537  */
538 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
539 {
540         struct buffer_head *bh;
541         struct list_head *p;
542         int err = 0;
543
544         spin_lock(lock);
545 repeat:
546         list_for_each_prev(p, list) {
547                 bh = BH_ENTRY(p);
548                 if (buffer_locked(bh)) {
549                         get_bh(bh);
550                         spin_unlock(lock);
551                         wait_on_buffer(bh);
552                         if (!buffer_uptodate(bh))
553                                 err = -EIO;
554                         brelse(bh);
555                         spin_lock(lock);
556                         goto repeat;
557                 }
558         }
559         spin_unlock(lock);
560         return err;
561 }
562
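/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of the
 * O_SYNC pattern described above: queue the write as the buffer is dirtied,
 * then wait for it to complete afterwards.
 */
static int __maybe_unused example_osync_write(struct buffer_head *bh)
{
	mark_buffer_dirty(bh);
	ll_rw_block(WRITE, 1, &bh);	/* queue the write now */
	wait_on_buffer(bh);		/* later: wait for completion */
	return buffer_uptodate(bh) ? 0 : -EIO;
}
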
563 static void do_thaw_all(struct work_struct *work)
564 {
565         struct super_block *sb, *n;
566         char b[BDEVNAME_SIZE];
567
568         spin_lock(&sb_lock);
569         list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
570                 if (list_empty(&sb->s_instances))
571                         continue;
572                 sb->s_count++;
573                 spin_unlock(&sb_lock);
574                 down_read(&sb->s_umount);
575                 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
576                         printk(KERN_WARNING "Emergency Thaw on %s\n",
577                                bdevname(sb->s_bdev, b));
578                 up_read(&sb->s_umount);
579                 spin_lock(&sb_lock);
580         }
581         spin_unlock(&sb_lock);
582         kfree(work);
583         printk(KERN_WARNING "Emergency Thaw complete\n");
584 }
585
586 /**
587  * emergency_thaw_all -- forcibly thaw every frozen filesystem
588  *
589  * Used for emergency unfreeze of all filesystems via SysRq
590  */
591 void emergency_thaw_all(void)
592 {
593         struct work_struct *work;
594
595         work = kmalloc(sizeof(*work), GFP_ATOMIC);
596         if (work) {
597                 INIT_WORK(work, do_thaw_all);
598                 schedule_work(work);
599         }
600 }
601
602 /**
603  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
604  * @mapping: the mapping which wants those buffers written
605  *
606  * Starts I/O against the buffers at mapping->private_list, and waits upon
607  * that I/O.
608  *
609  * Basically, this is a convenience function for fsync().
610  * @mapping is a file or directory which needs those buffers to be written for
611  * a successful fsync().
612  */
613 int sync_mapping_buffers(struct address_space *mapping)
614 {
615         struct address_space *buffer_mapping = mapping->assoc_mapping;
616
617         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
618                 return 0;
619
620         return fsync_buffers_list(&buffer_mapping->private_lock,
621                                         &mapping->private_list);
622 }
623 EXPORT_SYMBOL(sync_mapping_buffers);
624
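/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of how
 * a filesystem's fsync path typically uses sync_mapping_buffers() to flush
 * the metadata buffers queued on ->private_list.
 */
static int __maybe_unused example_fsync_metadata(struct inode *inode)
{
	/* write out and wait upon buffers queued via mark_buffer_dirty_inode() */
	return sync_mapping_buffers(inode->i_mapping);
}
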
625 /*
626  * Called when we've recently written block `bblock', and it is known that
627  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
628  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
629  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
630  */
631 void write_boundary_block(struct block_device *bdev,
632                         sector_t bblock, unsigned blocksize)
633 {
634         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
635         if (bh) {
636                 if (buffer_dirty(bh))
637                         ll_rw_block(WRITE, 1, &bh);
638                 put_bh(bh);
639         }
640 }
641
642 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
643 {
644         struct address_space *mapping = inode->i_mapping;
645         struct address_space *buffer_mapping = bh->b_page->mapping;
646
647         mark_buffer_dirty(bh);
648         if (!mapping->assoc_mapping) {
649                 mapping->assoc_mapping = buffer_mapping;
650         } else {
651                 BUG_ON(mapping->assoc_mapping != buffer_mapping);
652         }
653         if (!bh->b_assoc_map) {
654                 spin_lock(&buffer_mapping->private_lock);
655                 list_move_tail(&bh->b_assoc_buffers,
656                                 &mapping->private_list);
657                 bh->b_assoc_map = mapping;
658                 spin_unlock(&buffer_mapping->private_lock);
659         }
660 }
661 EXPORT_SYMBOL(mark_buffer_dirty_inode);
662
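/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline): an
 * ext2-style filesystem dirtying an indirect block and associating it with
 * the owning inode so that a later fsync() can find it.
 */
static void __maybe_unused example_dirty_indirect(struct super_block *sb,
			struct inode *inode, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return;
	/* ... update the block pointers in bh->b_data ... */
	mark_buffer_dirty_inode(bh, inode);	/* queued on the inode's private_list */
	brelse(bh);
	/* a later fsync() writes it via sync_mapping_buffers(inode->i_mapping) */
}
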
663 /*
664  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
665  * dirty.
666  *
667  * If warn is true, then emit a warning if the page is not uptodate and has
668  * not been truncated.
669  */
670 static void __set_page_dirty(struct page *page,
671                 struct address_space *mapping, int warn)
672 {
673         spin_lock_irq(&mapping->tree_lock);
674         if (page->mapping) {    /* Race with truncate? */
675                 WARN_ON_ONCE(warn && !PageUptodate(page));
676                 account_page_dirtied(page, mapping);
677                 radix_tree_tag_set(&mapping->page_tree,
678                                 page_index(page), PAGECACHE_TAG_DIRTY);
679         }
680         spin_unlock_irq(&mapping->tree_lock);
681         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
682 }
683
684 /*
685  * Add a page to the dirty page list.
686  *
687  * It is a sad fact of life that this function is called from several places
688  * deeply under spinlocking.  It may not sleep.
689  *
690  * If the page has buffers, the buffers are all set dirty, to preserve
691  * dirty-state coherency between the page and the buffers.  If the page does
692  * not have buffers then when they are later attached they will all be set
693  * dirty.
694  *
695  * The buffers are dirtied before the page is dirtied.  There's a small race
696  * window in which a writepage caller may see the page cleanness but not the
697  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
698  * before the buffers, a concurrent writepage caller could clear the page dirty
699  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
700  * page on the dirty page list.
701  *
702  * We use private_lock to lock against try_to_free_buffers while using the
703  * page's buffer list.  Also use this to protect against clean buffers being
704  * added to the page after it was set dirty.
705  *
706  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
707  * address_space though.
708  */
709 int __set_page_dirty_buffers(struct page *page)
710 {
711         int newly_dirty;
712         struct address_space *mapping = page_mapping(page);
713
714         if (unlikely(!mapping))
715                 return !TestSetPageDirty(page);
716
717         spin_lock(&mapping->private_lock);
718         if (page_has_buffers(page)) {
719                 struct buffer_head *head = page_buffers(page);
720                 struct buffer_head *bh = head;
721
722                 do {
723                         set_buffer_dirty(bh);
724                         bh = bh->b_this_page;
725                 } while (bh != head);
726         }
727         newly_dirty = !TestSetPageDirty(page);
728         spin_unlock(&mapping->private_lock);
729
730         if (newly_dirty)
731                 __set_page_dirty(page, mapping, 1);
732         return newly_dirty;
733 }
734 EXPORT_SYMBOL(__set_page_dirty_buffers);
735
736 /*
737  * Write out and wait upon a list of buffers.
738  *
739  * We have conflicting pressures: we want to make sure that all
740  * initially dirty buffers get waited on, but that any subsequently
741  * dirtied buffers don't.  After all, we don't want fsync to last
742  * forever if somebody is actively writing to the file.
743  *
744  * Do this in two main stages: first we copy dirty buffers to a
745  * temporary inode list, queueing the writes as we go.  Then we clean
746  * up, waiting for those writes to complete.
747  * 
748  * During this second stage, any subsequent updates to the file may end
749  * up refiling the buffer on the original inode's dirty list again, so
750  * there is a chance we will end up with a buffer queued for write but
751  * not yet completed on that list.  So, as a final cleanup we go through
752  * the osync code to catch these locked, dirty buffers without requeuing
753  * any newly dirty buffers for write.
754  */
755 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
756 {
757         struct buffer_head *bh;
758         struct list_head tmp;
759         struct address_space *mapping, *prev_mapping = NULL;
760         int err = 0, err2;
761
762         INIT_LIST_HEAD(&tmp);
763
764         spin_lock(lock);
765         while (!list_empty(list)) {
766                 bh = BH_ENTRY(list->next);
767                 mapping = bh->b_assoc_map;
768                 __remove_assoc_queue(bh);
769                 /* Avoid race with mark_buffer_dirty_inode() which does
770                  * a lockless check and we rely on seeing the dirty bit */
771                 smp_mb();
772                 if (buffer_dirty(bh) || buffer_locked(bh)) {
773                         list_add(&bh->b_assoc_buffers, &tmp);
774                         bh->b_assoc_map = mapping;
775                         if (buffer_dirty(bh)) {
776                                 get_bh(bh);
777                                 spin_unlock(lock);
778                                 /*
779                                  * Ensure any pending I/O completes so that
780                                  * ll_rw_block() actually writes the current
781                                  * contents - it is a noop if I/O is still in
782                                  * flight on potentially older contents.
783                                  */
784                                 ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
785
786                                 /*
787                                  * Kick off IO for the previous mapping. Note
788                                  * that we will not run the very last mapping,
789                                  * wait_on_buffer() will do that for us
790                                  * through sync_buffer().
791                                  */
792                                 if (prev_mapping && prev_mapping != mapping)
793                                         blk_run_address_space(prev_mapping);
794                                 prev_mapping = mapping;
795
796                                 brelse(bh);
797                                 spin_lock(lock);
798                         }
799                 }
800         }
801
802         while (!list_empty(&tmp)) {
803                 bh = BH_ENTRY(tmp.prev);
804                 get_bh(bh);
805                 mapping = bh->b_assoc_map;
806                 __remove_assoc_queue(bh);
807                 /* Avoid race with mark_buffer_dirty_inode() which does
808                  * a lockless check and we rely on seeing the dirty bit */
809                 smp_mb();
810                 if (buffer_dirty(bh)) {
811                         list_add(&bh->b_assoc_buffers,
812                                  &mapping->private_list);
813                         bh->b_assoc_map = mapping;
814                 }
815                 spin_unlock(lock);
816                 wait_on_buffer(bh);
817                 if (!buffer_uptodate(bh))
818                         err = -EIO;
819                 brelse(bh);
820                 spin_lock(lock);
821         }
822         
823         spin_unlock(lock);
824         err2 = osync_buffers_list(lock, list);
825         if (err)
826                 return err;
827         else
828                 return err2;
829 }
830
831 /*
832  * Invalidate any and all dirty buffers on a given inode.  We are
833  * probably unmounting the fs, but that doesn't mean we have already
834  * done a sync().  Just drop the buffers from the inode list.
835  *
836  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
837  * assumes that all the buffers are against the blockdev.  Not true
838  * for reiserfs.
839  */
840 void invalidate_inode_buffers(struct inode *inode)
841 {
842         if (inode_has_buffers(inode)) {
843                 struct address_space *mapping = &inode->i_data;
844                 struct list_head *list = &mapping->private_list;
845                 struct address_space *buffer_mapping = mapping->assoc_mapping;
846
847                 spin_lock(&buffer_mapping->private_lock);
848                 while (!list_empty(list))
849                         __remove_assoc_queue(BH_ENTRY(list->next));
850                 spin_unlock(&buffer_mapping->private_lock);
851         }
852 }
853 EXPORT_SYMBOL(invalidate_inode_buffers);
854
855 /*
856  * Remove any clean buffers from the inode's buffer list.  This is called
857  * when we're trying to free the inode itself.  Those buffers can pin it.
858  *
859  * Returns true if all buffers were removed.
860  */
861 int remove_inode_buffers(struct inode *inode)
862 {
863         int ret = 1;
864
865         if (inode_has_buffers(inode)) {
866                 struct address_space *mapping = &inode->i_data;
867                 struct list_head *list = &mapping->private_list;
868                 struct address_space *buffer_mapping = mapping->assoc_mapping;
869
870                 spin_lock(&buffer_mapping->private_lock);
871                 while (!list_empty(list)) {
872                         struct buffer_head *bh = BH_ENTRY(list->next);
873                         if (buffer_dirty(bh)) {
874                                 ret = 0;
875                                 break;
876                         }
877                         __remove_assoc_queue(bh);
878                 }
879                 spin_unlock(&buffer_mapping->private_lock);
880         }
881         return ret;
882 }
883
884 /*
885  * Create the appropriate buffers when given a page for data area and
886  * the size of each buffer.. Use the bh->b_this_page linked list to
887  * follow the buffers created.  Return NULL if unable to create more
888  * buffers.
889  *
890  * The retry flag is used to differentiate async IO (paging, swapping)
891  * which may not fail from ordinary buffer allocations.
892  */
893 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
894                 int retry)
895 {
896         struct buffer_head *bh, *head;
897         long offset;
898
899 try_again:
900         head = NULL;
901         offset = PAGE_SIZE;
902         while ((offset -= size) >= 0) {
903                 bh = alloc_buffer_head(GFP_NOFS);
904                 if (!bh)
905                         goto no_grow;
906
907                 bh->b_bdev = NULL;
908                 bh->b_this_page = head;
909                 bh->b_blocknr = -1;
910                 head = bh;
911
912                 bh->b_state = 0;
913                 atomic_set(&bh->b_count, 0);
914                 bh->b_private = NULL;
915                 bh->b_size = size;
916
917                 /* Link the buffer to its page */
918                 set_bh_page(bh, page, offset);
919
920                 init_buffer(bh, NULL, NULL);
921         }
922         return head;
923 /*
924  * In case anything failed, we just free everything we got.
925  */
926 no_grow:
927         if (head) {
928                 do {
929                         bh = head;
930                         head = head->b_this_page;
931                         free_buffer_head(bh);
932                 } while (head);
933         }
934
935         /*
936          * Return failure for non-async IO requests.  Async IO requests
937          * are not allowed to fail, so we have to wait until buffer heads
938          * become available.  But we don't want tasks sleeping with 
939          * partially complete buffers, so all were released above.
940          */
941         if (!retry)
942                 return NULL;
943
944         /* We're _really_ low on memory. Now we just
945          * wait for old buffer heads to become free due to
946          * finishing IO.  Since this is an async request and
947          * the reserve list is empty, we're sure there are 
948          * async buffer heads in use.
949          */
950         free_more_memory();
951         goto try_again;
952 }
953 EXPORT_SYMBOL_GPL(alloc_page_buffers);
954
955 static inline void
956 link_dev_buffers(struct page *page, struct buffer_head *head)
957 {
958         struct buffer_head *bh, *tail;
959
960         bh = head;
961         do {
962                 tail = bh;
963                 bh = bh->b_this_page;
964         } while (bh);
965         tail->b_this_page = head;
966         attach_page_buffers(page, head);
967 }
968
969 /*
970  * Initialise the state of a blockdev page's buffers.
971  */ 
972 static void
973 init_page_buffers(struct page *page, struct block_device *bdev,
974                         sector_t block, int size)
975 {
976         struct buffer_head *head = page_buffers(page);
977         struct buffer_head *bh = head;
978         int uptodate = PageUptodate(page);
979
980         do {
981                 if (!buffer_mapped(bh)) {
982                         init_buffer(bh, NULL, NULL);
983                         bh->b_bdev = bdev;
984                         bh->b_blocknr = block;
985                         if (uptodate)
986                                 set_buffer_uptodate(bh);
987                         set_buffer_mapped(bh);
988                 }
989                 block++;
990                 bh = bh->b_this_page;
991         } while (bh != head);
992 }
993
994 /*
995  * Create the page-cache page that contains the requested block.
996  *
997  * This is used purely for blockdev mappings.
998  */
999 static struct page *
1000 grow_dev_page(struct block_device *bdev, sector_t block,
1001                 pgoff_t index, int size)
1002 {
1003         struct inode *inode = bdev->bd_inode;
1004         struct page *page;
1005         struct buffer_head *bh;
1006
1007         page = find_or_create_page(inode->i_mapping, index,
1008                 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1009         if (!page)
1010                 return NULL;
1011
1012         BUG_ON(!PageLocked(page));
1013
1014         if (page_has_buffers(page)) {
1015                 bh = page_buffers(page);
1016                 if (bh->b_size == size) {
1017                         init_page_buffers(page, bdev, block, size);
1018                         return page;
1019                 }
1020                 if (!try_to_free_buffers(page))
1021                         goto failed;
1022         }
1023
1024         /*
1025          * Allocate some buffers for this page
1026          */
1027         bh = alloc_page_buffers(page, size, 0);
1028         if (!bh)
1029                 goto failed;
1030
1031         /*
1032          * Link the page to the buffers and initialise them.  Take the
1033          * lock to be atomic wrt __find_get_block(), which does not
1034          * run under the page lock.
1035          */
1036         spin_lock(&inode->i_mapping->private_lock);
1037         link_dev_buffers(page, bh);
1038         init_page_buffers(page, bdev, block, size);
1039         spin_unlock(&inode->i_mapping->private_lock);
1040         return page;
1041
1042 failed:
1043         BUG();
1044         unlock_page(page);
1045         page_cache_release(page);
1046         return NULL;
1047 }
1048
1049 /*
1050  * Create buffers for the specified block device block's page.  If
1051  * that page was dirty, the buffers are set dirty also.
1052  */
1053 static int
1054 grow_buffers(struct block_device *bdev, sector_t block, int size)
1055 {
1056         struct page *page;
1057         pgoff_t index;
1058         int sizebits;
1059
1060         sizebits = -1;
1061         do {
1062                 sizebits++;
1063         } while ((size << sizebits) < PAGE_SIZE);
1064
1065         index = block >> sizebits;
1066
1067         /*
1068          * Check for a block which wants to lie outside our maximum possible
1069          * pagecache index.  (this comparison is done using sector_t types).
1070          */
1071         if (unlikely(index != block >> sizebits)) {
1072                 char b[BDEVNAME_SIZE];
1073
1074                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1075                         "device %s\n",
1076                         __func__, (unsigned long long)block,
1077                         bdevname(bdev, b));
1078                 return -EIO;
1079         }
1080         block = index << sizebits;
1081         /* Create a page with the proper size buffers.. */
1082         page = grow_dev_page(bdev, block, index, size);
1083         if (!page)
1084                 return 0;
1085         unlock_page(page);
1086         page_cache_release(page);
1087         return 1;
1088 }
1089
1090 static struct buffer_head *
1091 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1092 {
1093         /* Size must be multiple of hard sectorsize */
1094         if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1095                         (size < 512 || size > PAGE_SIZE))) {
1096                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1097                                         size);
1098                 printk(KERN_ERR "logical block size: %d\n",
1099                                         bdev_logical_block_size(bdev));
1100
1101                 dump_stack();
1102                 return NULL;
1103         }
1104
1105         for (;;) {
1106                 struct buffer_head * bh;
1107                 int ret;
1108
1109                 bh = __find_get_block(bdev, block, size);
1110                 if (bh)
1111                         return bh;
1112
1113                 ret = grow_buffers(bdev, block, size);
1114                 if (ret < 0)
1115                         return NULL;
1116                 if (ret == 0)
1117                         free_more_memory();
1118         }
1119 }
1120
1121 /*
1122  * The relationship between dirty buffers and dirty pages:
1123  *
1124  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1125  * the page is tagged dirty in its radix tree.
1126  *
1127  * At all times, the dirtiness of the buffers represents the dirtiness of
1128  * subsections of the page.  If the page has buffers, the page dirty bit is
1129  * merely a hint about the true dirty state.
1130  *
1131  * When a page is set dirty in its entirety, all its buffers are marked dirty
1132  * (if the page has buffers).
1133  *
1134  * When a buffer is marked dirty, its page is dirtied, but the page's other
1135  * buffers are not.
1136  *
1137  * Also.  When blockdev buffers are explicitly read with bread(), they
1138  * individually become uptodate.  But their backing page remains not
1139  * uptodate - even if all of its buffers are uptodate.  A subsequent
1140  * block_read_full_page() against that page will discover all the uptodate
1141  * buffers, will set the page uptodate and will perform no I/O.
1142  */
1143
1144 /**
1145  * mark_buffer_dirty - mark a buffer_head as needing writeout
1146  * @bh: the buffer_head to mark dirty
1147  *
1148  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1149  * backing page dirty, then tag the page as dirty in its address_space's radix
1150  * tree and then attach the address_space's inode to its superblock's dirty
1151  * inode list.
1152  *
1153  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1154  * mapping->tree_lock and the global inode_lock.
1155  */
1156 void mark_buffer_dirty(struct buffer_head *bh)
1157 {
1158         WARN_ON_ONCE(!buffer_uptodate(bh));
1159
1160         /*
1161          * Very *carefully* optimize the it-is-already-dirty case.
1162          *
1163          * Don't let the final "is it dirty" escape to before we
1164          * perhaps modified the buffer.
1165          */
1166         if (buffer_dirty(bh)) {
1167                 smp_mb();
1168                 if (buffer_dirty(bh))
1169                         return;
1170         }
1171
1172         if (!test_set_buffer_dirty(bh)) {
1173                 struct page *page = bh->b_page;
1174                 if (!TestSetPageDirty(page)) {
1175                         struct address_space *mapping = page_mapping(page);
1176                         if (mapping)
1177                                 __set_page_dirty(page, mapping, 0);
1178                 }
1179         }
1180 }
1181 EXPORT_SYMBOL(mark_buffer_dirty);
1182
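/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of the
 * common read-modify-write sequence built on mark_buffer_dirty(): the flusher
 * threads write the block back later, or sync_dirty_buffer() can be used to
 * force it out immediately.
 */
static int __maybe_unused example_modify_block(struct super_block *sb,
						sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return -EIO;
	/* ... modify bh->b_data here ... */
	mark_buffer_dirty(bh);
	brelse(bh);
	return 0;
}
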
1183 /*
1184  * Decrement a buffer_head's reference count.  If all buffers against a page
1185  * have zero reference count, are clean and unlocked, and if the page is clean
1186  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1187  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1188  * a page but it ends up not being freed, and buffers may later be reattached).
1189  */
1190 void __brelse(struct buffer_head * buf)
1191 {
1192         if (atomic_read(&buf->b_count)) {
1193                 put_bh(buf);
1194                 return;
1195         }
1196         WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1197 }
1198 EXPORT_SYMBOL(__brelse);
1199
1200 /*
1201  * bforget() is like brelse(), except it discards any
1202  * potentially dirty data.
1203  */
1204 void __bforget(struct buffer_head *bh)
1205 {
1206         clear_buffer_dirty(bh);
1207         if (bh->b_assoc_map) {
1208                 struct address_space *buffer_mapping = bh->b_page->mapping;
1209
1210                 spin_lock(&buffer_mapping->private_lock);
1211                 list_del_init(&bh->b_assoc_buffers);
1212                 bh->b_assoc_map = NULL;
1213                 spin_unlock(&buffer_mapping->private_lock);
1214         }
1215         __brelse(bh);
1216 }
1217 EXPORT_SYMBOL(__bforget);
1218
1219 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1220 {
1221         lock_buffer(bh);
1222         if (buffer_uptodate(bh)) {
1223                 unlock_buffer(bh);
1224                 return bh;
1225         } else {
1226                 get_bh(bh);
1227                 bh->b_end_io = end_buffer_read_sync;
1228                 submit_bh(READ, bh);
1229                 wait_on_buffer(bh);
1230                 if (buffer_uptodate(bh))
1231                         return bh;
1232         }
1233         brelse(bh);
1234         return NULL;
1235 }
1236
1237 /*
1238  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1239  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1240  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1241  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1242  * CPU's LRUs at the same time.
1243  *
1244  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1245  * sb_find_get_block().
1246  *
1247  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1248  * a local interrupt disable for that.
1249  */
1250
1251 #define BH_LRU_SIZE     8
1252
1253 struct bh_lru {
1254         struct buffer_head *bhs[BH_LRU_SIZE];
1255 };
1256
1257 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1258
1259 #ifdef CONFIG_SMP
1260 #define bh_lru_lock()   local_irq_disable()
1261 #define bh_lru_unlock() local_irq_enable()
1262 #else
1263 #define bh_lru_lock()   preempt_disable()
1264 #define bh_lru_unlock() preempt_enable()
1265 #endif
1266
1267 static inline void check_irqs_on(void)
1268 {
1269 #ifdef irqs_disabled
1270         BUG_ON(irqs_disabled());
1271 #endif
1272 }
1273
1274 /*
1275  * The LRU management algorithm is dopey-but-simple.  Sorry.
1276  */
1277 static void bh_lru_install(struct buffer_head *bh)
1278 {
1279         struct buffer_head *evictee = NULL;
1280         struct bh_lru *lru;
1281
1282         check_irqs_on();
1283         bh_lru_lock();
1284         lru = &__get_cpu_var(bh_lrus);
1285         if (lru->bhs[0] != bh) {
1286                 struct buffer_head *bhs[BH_LRU_SIZE];
1287                 int in;
1288                 int out = 0;
1289
1290                 get_bh(bh);
1291                 bhs[out++] = bh;
1292                 for (in = 0; in < BH_LRU_SIZE; in++) {
1293                         struct buffer_head *bh2 = lru->bhs[in];
1294
1295                         if (bh2 == bh) {
1296                                 __brelse(bh2);
1297                         } else {
1298                                 if (out >= BH_LRU_SIZE) {
1299                                         BUG_ON(evictee != NULL);
1300                                         evictee = bh2;
1301                                 } else {
1302                                         bhs[out++] = bh2;
1303                                 }
1304                         }
1305                 }
1306                 while (out < BH_LRU_SIZE)
1307                         bhs[out++] = NULL;
1308                 memcpy(lru->bhs, bhs, sizeof(bhs));
1309         }
1310         bh_lru_unlock();
1311
1312         if (evictee)
1313                 __brelse(evictee);
1314 }
1315
1316 /*
1317  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1318  */
1319 static struct buffer_head *
1320 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1321 {
1322         struct buffer_head *ret = NULL;
1323         struct bh_lru *lru;
1324         unsigned int i;
1325
1326         check_irqs_on();
1327         bh_lru_lock();
1328         lru = &__get_cpu_var(bh_lrus);
1329         for (i = 0; i < BH_LRU_SIZE; i++) {
1330                 struct buffer_head *bh = lru->bhs[i];
1331
1332                 if (bh && bh->b_bdev == bdev &&
1333                                 bh->b_blocknr == block && bh->b_size == size) {
1334                         if (i) {
1335                                 while (i) {
1336                                         lru->bhs[i] = lru->bhs[i - 1];
1337                                         i--;
1338                                 }
1339                                 lru->bhs[0] = bh;
1340                         }
1341                         get_bh(bh);
1342                         ret = bh;
1343                         break;
1344                 }
1345         }
1346         bh_lru_unlock();
1347         return ret;
1348 }
1349
1350 /*
1351  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1352  * it in the LRU and mark it as accessed.  If it is not present then return
1353  * NULL
1354  */
1355 struct buffer_head *
1356 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1357 {
1358         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1359
1360         if (bh == NULL) {
1361                 bh = __find_get_block_slow(bdev, block);
1362                 if (bh)
1363                         bh_lru_install(bh);
1364         }
1365         if (bh)
1366                 touch_buffer(bh);
1367         return bh;
1368 }
1369 EXPORT_SYMBOL(__find_get_block);
1370
1371 /*
1372  * __getblk will locate (and, if necessary, create) the buffer_head
1373  * which corresponds to the passed block_device, block and size. The
1374  * returned buffer has its reference count incremented.
1375  *
1376  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1377  * illegal block number, __getblk() will happily return a buffer_head
1378  * which represents the non-existent block.  Very weird.
1379  *
1380  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1381  * attempt is failing.  FIXME, perhaps?
1382  */
1383 struct buffer_head *
1384 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1385 {
1386         struct buffer_head *bh = __find_get_block(bdev, block, size);
1387
1388         might_sleep();
1389         if (bh == NULL)
1390                 bh = __getblk_slow(bdev, block, size);
1391         return bh;
1392 }
1393 EXPORT_SYMBOL(__getblk);
1394
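/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline)
 * contrasting __getblk() with __bread(): for a freshly allocated block there
 * is nothing useful on disk, so get the buffer without reading, fill it and
 * mark it uptodate and dirty.
 */
static __maybe_unused struct buffer_head *
example_get_new_block(struct block_device *bdev, sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);
	/* ... initialise bh->b_data for the brand-new block ... */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	return bh;			/* caller must brelse() it */
}
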
1395 /*
1396  * Do async read-ahead on a buffer..
1397  */
1398 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1399 {
1400         struct buffer_head *bh = __getblk(bdev, block, size);
1401         if (likely(bh)) {
1402                 ll_rw_block(READA, 1, &bh);
1403                 brelse(bh);
1404         }
1405 }
1406 EXPORT_SYMBOL(__breadahead);
1407
1408 /**
1409  *  __bread() - reads a specified block and returns the bh
1410  *  @bdev: the block_device to read from
1411  *  @block: number of block
1412  *  @size: size (in bytes) to read
1413  * 
1414  *  Reads a specified block, and returns buffer head that contains it.
1415  *  It returns NULL if the block was unreadable.
1416  */
1417 struct buffer_head *
1418 __bread(struct block_device *bdev, sector_t block, unsigned size)
1419 {
1420         struct buffer_head *bh = __getblk(bdev, block, size);
1421
1422         if (likely(bh) && !buffer_uptodate(bh))
1423                 bh = __bread_slow(bh);
1424         return bh;
1425 }
1426 EXPORT_SYMBOL(__bread);
1427
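/*
 * Illustrative sketch (hypothetical example_* helper, not in mainline) of
 * minimal __bread() usage; most filesystems go through the sb_bread()
 * wrapper, which supplies the superblock's block size.
 */
static int __maybe_unused example_read_block(struct block_device *bdev,
					sector_t block, unsigned size)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	/* ... consume bh->b_data ... */
	brelse(bh);
	return 0;
}
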
1428 /*
1429  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1430  * This doesn't race because it runs in each cpu either in irq
1431  * or with preempt disabled.
1432  */
1433 static void invalidate_bh_lru(void *arg)
1434 {
1435         struct bh_lru *b = &get_cpu_var(bh_lrus);
1436         int i;
1437
1438         for (i = 0; i < BH_LRU_SIZE; i++) {
1439                 brelse(b->bhs[i]);
1440                 b->bhs[i] = NULL;
1441         }
1442         put_cpu_var(bh_lrus);
1443 }
1444         
1445 void invalidate_bh_lrus(void)
1446 {
1447         on_each_cpu(invalidate_bh_lru, NULL, 1);
1448 }
1449 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1450
1451 void set_bh_page(struct buffer_head *bh,
1452                 struct page *page, unsigned long offset)
1453 {
1454         bh->b_page = page;
1455         BUG_ON(offset >= PAGE_SIZE);
1456         if (PageHighMem(page))
1457                 /*
1458                  * This catches illegal uses and preserves the offset:
1459                  */
1460                 bh->b_data = (char *)(0 + offset);
1461         else
1462                 bh->b_data = page_address(page) + offset;
1463 }
1464 EXPORT_SYMBOL(set_bh_page);
1465
1466 /*
1467  * Called when truncating a buffer on a page completely.
1468  */
1469 static void discard_buffer(struct buffer_head * bh)
1470 {
1471         lock_buffer(bh);
1472         clear_buffer_dirty(bh);
1473         bh->b_bdev = NULL;
1474         clear_buffer_mapped(bh);
1475         clear_buffer_req(bh);
1476         clear_buffer_new(bh);
1477         clear_buffer_delay(bh);
1478         clear_buffer_unwritten(bh);
1479         unlock_buffer(bh);
1480 }
1481
1482 /**
1483  * block_invalidatepage - invalidate part or all of a buffer-backed page
1484  *
1485  * @page: the page which is affected
1486  * @offset: the index of the truncation point
1487  *
1488  * block_invalidatepage() is called when all or part of the page has become
1489  * invalidated by a truncate operation.
1490  *
1491  * block_invalidatepage() does not have to release all buffers, but it must
1492  * ensure that no dirty buffer is left outside @offset and that no I/O
1493  * is underway against any of the blocks which are outside the truncation
1494  * point.  Because the caller is about to free (and possibly reuse) those
1495  * blocks on-disk.
1496  */
1497 void block_invalidatepage(struct page *page, unsigned long offset)
1498 {
1499         struct buffer_head *head, *bh, *next;
1500         unsigned int curr_off = 0;
1501
1502         BUG_ON(!PageLocked(page));
1503         if (!page_has_buffers(page))
1504                 goto out;
1505
1506         head = page_buffers(page);
1507         bh = head;
1508         do {
1509                 unsigned int next_off = curr_off + bh->b_size;
1510                 next = bh->b_this_page;
1511
1512                 /*
1513                  * is this block fully invalidated?
1514                  */
1515                 if (offset <= curr_off)
1516                         discard_buffer(bh);
1517                 curr_off = next_off;
1518                 bh = next;
1519         } while (bh != head);
1520
1521         /*
1522          * We release buffers only if the entire page is being invalidated.
1523          * The get_block cached value has been unconditionally invalidated,
1524          * so real IO is not possible anymore.
1525          */
1526         if (offset == 0)
1527                 try_to_release_page(page, 0);
1528 out:
1529         return;
1530 }
1531 EXPORT_SYMBOL(block_invalidatepage);
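
/*
 * Illustrative sketch (not part of this file): buffer-backed filesystems
 * normally point ->invalidatepage straight at block_invalidatepage() in
 * their address_space_operations.  The myfs_aops name is hypothetical and
 * the other methods are omitted.
 */
static const struct address_space_operations myfs_aops = {
        .invalidatepage = block_invalidatepage,
        /* .readpage, .writepage, .write_begin, .write_end, ... */
};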
1532
1533 /*
1534  * We attach and possibly dirty the buffers atomically wrt
1535  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1536  * is already excluded via the page lock.
1537  */
1538 void create_empty_buffers(struct page *page,
1539                         unsigned long blocksize, unsigned long b_state)
1540 {
1541         struct buffer_head *bh, *head, *tail;
1542
1543         head = alloc_page_buffers(page, blocksize, 1);
1544         bh = head;
1545         do {
1546                 bh->b_state |= b_state;
1547                 tail = bh;
1548                 bh = bh->b_this_page;
1549         } while (bh);
1550         tail->b_this_page = head;
1551
1552         spin_lock(&page->mapping->private_lock);
1553         if (PageUptodate(page) || PageDirty(page)) {
1554                 bh = head;
1555                 do {
1556                         if (PageDirty(page))
1557                                 set_buffer_dirty(bh);
1558                         if (PageUptodate(page))
1559                                 set_buffer_uptodate(bh);
1560                         bh = bh->b_this_page;
1561                 } while (bh != head);
1562         }
1563         attach_page_buffers(page, head);
1564         spin_unlock(&page->mapping->private_lock);
1565 }
1566 EXPORT_SYMBOL(create_empty_buffers);
1567
1568 /*
1569  * We are taking a block for data and we don't want any output from any
1570  * buffer-cache aliases starting from the return of this function until
1571  * the moment when something explicitly marks the buffer dirty (hopefully
1572  * that won't happen until we free that block ;-)
1573  * We don't even need to mark it not-uptodate - nobody can expect
1574  * anything from a newly allocated buffer anyway. We used to use
1575  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1576  * don't want to mark the alias unmapped, for example - it would confuse
1577  * anyone who might pick it with bread() afterwards...
1578  *
1579  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1580  * be writeout I/O going on against recently-freed buffers.  We don't
1581  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1582  * only if we really need to.  That happens here.
1583  */
1584 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1585 {
1586         struct buffer_head *old_bh;
1587
1588         might_sleep();
1589
1590         old_bh = __find_get_block_slow(bdev, block);
1591         if (old_bh) {
1592                 clear_buffer_dirty(old_bh);
1593                 wait_on_buffer(old_bh);
1594                 clear_buffer_req(old_bh);
1595                 __brelse(old_bh);
1596         }
1597 }
1598 EXPORT_SYMBOL(unmap_underlying_metadata);
1599
1600 /*
1601  * NOTE! All mapped/uptodate combinations are valid:
1602  *
1603  *      Mapped  Uptodate        Meaning
1604  *
1605  *      No      No              "unknown" - must do get_block()
1606  *      No      Yes             "hole" - zero-filled
1607  *      Yes     No              "allocated" - allocated on disk, not read in
1608  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1609  *
1610  * "Dirty" is valid only with the last case (mapped+uptodate).
1611  */
1612
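/*
 * Illustrative sketch (not part of this file): how a caller might act on
 * the four mapped/uptodate combinations listed above.  The helper name is
 * hypothetical; buffer_mapped() and buffer_uptodate() are the real state
 * tests from <linux/buffer_head.h>.
 */
static int myfs_classify_bh(struct buffer_head *bh)
{
        if (!buffer_mapped(bh) && !buffer_uptodate(bh))
                return 0;       /* "unknown": must call get_block() */
        if (!buffer_mapped(bh) && buffer_uptodate(bh))
                return 1;       /* "hole": contents read as zeroes */
        if (buffer_mapped(bh) && !buffer_uptodate(bh))
                return 2;       /* "allocated": on disk, not read in yet */
        return 3;               /* "valid": allocated and up to date in memory */
}
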
1613 /*
1614  * While block_write_full_page is writing back the dirty buffers under
1615  * the page lock, whoever dirtied the buffers may decide to clean them
1616  * again at any time.  We handle that by only looking at the buffer
1617  * state inside lock_buffer().
1618  *
1619  * If block_write_full_page() is called for regular writeback
1620  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1621  * locked buffer.  This can only happen if someone has written the buffer
1622  * directly, with submit_bh().  At the address_space level PageWriteback
1623  * prevents this contention from occurring.
1624  *
1625  * If block_write_full_page() is called with wbc->sync_mode ==
1626  * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
1627  * causes the writes to be flagged as synchronous writes, but the
1628  * block device queue will NOT be unplugged, since usually many pages
1629  * will be pushed out before the higher-level caller actually
1630  * waits for the writes to be completed.  The various wait functions,
1631  * such as wait_on_writeback_range() will ultimately call sync_page()
1632  * which will ultimately call blk_run_backing_dev(), which will end up
1633  * unplugging the device queue.
1634  */
1635 static int __block_write_full_page(struct inode *inode, struct page *page,
1636                         get_block_t *get_block, struct writeback_control *wbc,
1637                         bh_end_io_t *handler)
1638 {
1639         int err;
1640         sector_t block;
1641         sector_t last_block;
1642         struct buffer_head *bh, *head;
1643         const unsigned blocksize = 1 << inode->i_blkbits;
1644         int nr_underway = 0;
1645         int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1646                         WRITE_SYNC_PLUG : WRITE);
1647
1648         BUG_ON(!PageLocked(page));
1649
1650         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1651
1652         if (!page_has_buffers(page)) {
1653                 create_empty_buffers(page, blocksize,
1654                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1655         }
1656
1657         /*
1658          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1659          * here, and the (potentially unmapped) buffers may become dirty at
1660          * any time.  If a buffer becomes dirty here after we've inspected it
1661          * then we just miss that fact, and the page stays dirty.
1662          *
1663          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1664          * handle that here by just cleaning them.
1665          */
1666
1667         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1668         head = page_buffers(page);
1669         bh = head;
1670
1671         /*
1672          * Get all the dirty buffers mapped to disk addresses and
1673          * handle any aliases from the underlying blockdev's mapping.
1674          */
1675         do {
1676                 if (block > last_block) {
1677                         /*
1678                          * mapped buffers outside i_size will occur, because
1679                          * this page can be outside i_size when there is a
1680                          * truncate in progress.
1681                          */
1682                         /*
1683                          * The buffer was zeroed by block_write_full_page()
1684                          */
1685                         clear_buffer_dirty(bh);
1686                         set_buffer_uptodate(bh);
1687                 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1688                            buffer_dirty(bh)) {
1689                         WARN_ON(bh->b_size != blocksize);
1690                         err = get_block(inode, block, bh, 1);
1691                         if (err)
1692                                 goto recover;
1693                         clear_buffer_delay(bh);
1694                         if (buffer_new(bh)) {
1695                                 /* blockdev mappings never come here */
1696                                 clear_buffer_new(bh);
1697                                 unmap_underlying_metadata(bh->b_bdev,
1698                                                         bh->b_blocknr);
1699                         }
1700                 }
1701                 bh = bh->b_this_page;
1702                 block++;
1703         } while (bh != head);
1704
1705         do {
1706                 if (!buffer_mapped(bh))
1707                         continue;
1708                 /*
1709                  * If it's a fully non-blocking write attempt and we cannot
1710                  * lock the buffer then redirty the page.  Note that this can
1711                  * potentially cause a busy-wait loop from writeback threads
1712                  * and kswapd activity, but those code paths have their own
1713                  * higher-level throttling.
1714                  */
1715                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1716                         lock_buffer(bh);
1717                 } else if (!trylock_buffer(bh)) {
1718                         redirty_page_for_writepage(wbc, page);
1719                         continue;
1720                 }
1721                 if (test_clear_buffer_dirty(bh)) {
1722                         mark_buffer_async_write_endio(bh, handler);
1723                 } else {
1724                         unlock_buffer(bh);
1725                 }
1726         } while ((bh = bh->b_this_page) != head);
1727
1728         /*
1729          * The page and its buffers are protected by PageWriteback(), so we can
1730          * drop the bh refcounts early.
1731          */
1732         BUG_ON(PageWriteback(page));
1733         set_page_writeback(page);
1734
1735         do {
1736                 struct buffer_head *next = bh->b_this_page;
1737                 if (buffer_async_write(bh)) {
1738                         submit_bh(write_op, bh);
1739                         nr_underway++;
1740                 }
1741                 bh = next;
1742         } while (bh != head);
1743         unlock_page(page);
1744
1745         err = 0;
1746 done:
1747         if (nr_underway == 0) {
1748                 /*
1749                  * The page was marked dirty, but the buffers were
1750                  * clean.  Someone wrote them back by hand with
1751                  * ll_rw_block/submit_bh.  A rare case.
1752                  */
1753                 end_page_writeback(page);
1754
1755                 /*
1756                  * The page and buffer_heads can be released at any time from
1757                  * here on.
1758                  */
1759         }
1760         return err;
1761
1762 recover:
1763         /*
1764          * ENOSPC, or some other error.  We may already have added some
1765          * blocks to the file, so we need to write these out to avoid
1766          * exposing stale data.
1767          * The page is currently locked and not marked for writeback
1768          */
1769         bh = head;
1770         /* Recovery: lock and submit the mapped buffers */
1771         do {
1772                 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1773                     !buffer_delay(bh)) {
1774                         lock_buffer(bh);
1775                         mark_buffer_async_write_endio(bh, handler);
1776                 } else {
1777                         /*
1778                          * The buffer may have been set dirty during
1779                          * attachment to a dirty page.
1780                          */
1781                         clear_buffer_dirty(bh);
1782                 }
1783         } while ((bh = bh->b_this_page) != head);
1784         SetPageError(page);
1785         BUG_ON(PageWriteback(page));
1786         mapping_set_error(page->mapping, err);
1787         set_page_writeback(page);
1788         do {
1789                 struct buffer_head *next = bh->b_this_page;
1790                 if (buffer_async_write(bh)) {
1791                         clear_buffer_dirty(bh);
1792                         submit_bh(write_op, bh);
1793                         nr_underway++;
1794                 }
1795                 bh = next;
1796         } while (bh != head);
1797         unlock_page(page);
1798         goto done;
1799 }
1800
1801 /*
1802  * If a page has any new buffers, zero them out here, and mark them uptodate
1803  * and dirty so they'll be written out (in order to prevent uninitialised
1804  * block data from leaking). And clear the new bit.
1805  */
1806 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1807 {
1808         unsigned int block_start, block_end;
1809         struct buffer_head *head, *bh;
1810
1811         BUG_ON(!PageLocked(page));
1812         if (!page_has_buffers(page))
1813                 return;
1814
1815         bh = head = page_buffers(page);
1816         block_start = 0;
1817         do {
1818                 block_end = block_start + bh->b_size;
1819
1820                 if (buffer_new(bh)) {
1821                         if (block_end > from && block_start < to) {
1822                                 if (!PageUptodate(page)) {
1823                                         unsigned start, size;
1824
1825                                         start = max(from, block_start);
1826                                         size = min(to, block_end) - start;
1827
1828                                         zero_user(page, start, size);
1829                                         set_buffer_uptodate(bh);
1830                                 }
1831
1832                                 clear_buffer_new(bh);
1833                                 mark_buffer_dirty(bh);
1834                         }
1835                 }
1836
1837                 block_start = block_end;
1838                 bh = bh->b_this_page;
1839         } while (bh != head);
1840 }
1841 EXPORT_SYMBOL(page_zero_new_buffers);
1842
1843 static int __block_prepare_write(struct inode *inode, struct page *page,
1844                 unsigned from, unsigned to, get_block_t *get_block)
1845 {
1846         unsigned block_start, block_end;
1847         sector_t block;
1848         int err = 0;
1849         unsigned blocksize, bbits;
1850         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1851
1852         BUG_ON(!PageLocked(page));
1853         BUG_ON(from > PAGE_CACHE_SIZE);
1854         BUG_ON(to > PAGE_CACHE_SIZE);
1855         BUG_ON(from > to);
1856
1857         blocksize = 1 << inode->i_blkbits;
1858         if (!page_has_buffers(page))
1859                 create_empty_buffers(page, blocksize, 0);
1860         head = page_buffers(page);
1861
1862         bbits = inode->i_blkbits;
1863         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1864
1865         for(bh = head, block_start = 0; bh != head || !block_start;
1866             block++, block_start=block_end, bh = bh->b_this_page) {
1867                 block_end = block_start + blocksize;
1868                 if (block_end <= from || block_start >= to) {
1869                         if (PageUptodate(page)) {
1870                                 if (!buffer_uptodate(bh))
1871                                         set_buffer_uptodate(bh);
1872                         }
1873                         continue;
1874                 }
1875                 if (buffer_new(bh))
1876                         clear_buffer_new(bh);
1877                 if (!buffer_mapped(bh)) {
1878                         WARN_ON(bh->b_size != blocksize);
1879                         err = get_block(inode, block, bh, 1);
1880                         if (err)
1881                                 break;
1882                         if (buffer_new(bh)) {
1883                                 unmap_underlying_metadata(bh->b_bdev,
1884                                                         bh->b_blocknr);
1885                                 if (PageUptodate(page)) {
1886                                         clear_buffer_new(bh);
1887                                         set_buffer_uptodate(bh);
1888                                         mark_buffer_dirty(bh);
1889                                         continue;
1890                                 }
1891                                 if (block_end > to || block_start < from)
1892                                         zero_user_segments(page,
1893                                                 to, block_end,
1894                                                 block_start, from);
1895                                 continue;
1896                         }
1897                 }
1898                 if (PageUptodate(page)) {
1899                         if (!buffer_uptodate(bh))
1900                                 set_buffer_uptodate(bh);
1901                         continue; 
1902                 }
1903                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1904                     !buffer_unwritten(bh) &&
1905                      (block_start < from || block_end > to)) {
1906                         ll_rw_block(READ, 1, &bh);
1907                         *wait_bh++=bh;
1908                 }
1909         }
1910         /*
1911          * If we issued read requests - let them complete.
1912          */
1913         while(wait_bh > wait) {
1914                 wait_on_buffer(*--wait_bh);
1915                 if (!buffer_uptodate(*wait_bh))
1916                         err = -EIO;
1917         }
1918         if (unlikely(err))
1919                 page_zero_new_buffers(page, from, to);
1920         return err;
1921 }
1922
1923 static int __block_commit_write(struct inode *inode, struct page *page,
1924                 unsigned from, unsigned to)
1925 {
1926         unsigned block_start, block_end;
1927         int partial = 0;
1928         unsigned blocksize;
1929         struct buffer_head *bh, *head;
1930
1931         blocksize = 1 << inode->i_blkbits;
1932
1933         for(bh = head = page_buffers(page), block_start = 0;
1934             bh != head || !block_start;
1935             block_start=block_end, bh = bh->b_this_page) {
1936                 block_end = block_start + blocksize;
1937                 if (block_end <= from || block_start >= to) {
1938                         if (!buffer_uptodate(bh))
1939                                 partial = 1;
1940                 } else {
1941                         set_buffer_uptodate(bh);
1942                         mark_buffer_dirty(bh);
1943                 }
1944                 clear_buffer_new(bh);
1945         }
1946
1947         /*
1948          * If this is a partial write which happened to make all buffers
1949          * uptodate then we can optimize away a bogus readpage() for
1950          * the next read(). Here we 'discover' whether the page went
1951          * uptodate as a result of this (potentially partial) write.
1952          */
1953         if (!partial)
1954                 SetPageUptodate(page);
1955         return 0;
1956 }
1957
1958 /*
1959  * block_write_begin takes care of the basic task of block allocation and
1960  * bringing partial write blocks uptodate first.
1961  *
1962  * If *pagep is not NULL, then block_write_begin uses the locked page
1963  * at *pagep rather than allocating its own. In this case, the page will
1964  * not be unlocked or deallocated on failure.
1965  */
1966 int block_write_begin(struct file *file, struct address_space *mapping,
1967                         loff_t pos, unsigned len, unsigned flags,
1968                         struct page **pagep, void **fsdata,
1969                         get_block_t *get_block)
1970 {
1971         struct inode *inode = mapping->host;
1972         int status = 0;
1973         struct page *page;
1974         pgoff_t index;
1975         unsigned start, end;
1976         int ownpage = 0;
1977
1978         index = pos >> PAGE_CACHE_SHIFT;
1979         start = pos & (PAGE_CACHE_SIZE - 1);
1980         end = start + len;
1981
1982         page = *pagep;
1983         if (page == NULL) {
1984                 ownpage = 1;
1985                 page = grab_cache_page_write_begin(mapping, index, flags);
1986                 if (!page) {
1987                         status = -ENOMEM;
1988                         goto out;
1989                 }
1990                 *pagep = page;
1991         } else
1992                 BUG_ON(!PageLocked(page));
1993
1994         status = __block_prepare_write(inode, page, start, end, get_block);
1995         if (unlikely(status)) {
1996                 ClearPageUptodate(page);
1997
1998                 if (ownpage) {
1999                         unlock_page(page);
2000                         page_cache_release(page);
2001                         *pagep = NULL;
2002
2003                         /*
2004                          * prepare_write() may have instantiated a few blocks
2005                          * outside i_size.  Trim these off again. Don't need
2006                          * i_size_read because we hold i_mutex.
2007                          */
2008                         if (pos + len > inode->i_size)
2009                                 vmtruncate(inode, inode->i_size);
2010                 }
2011         }
2012
2013 out:
2014         return status;
2015 }
2016 EXPORT_SYMBOL(block_write_begin);
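
/*
 * Illustrative sketch (not part of this file): a minimal ->write_begin
 * built on block_write_begin().  All myfs_* names are hypothetical;
 * myfs_get_block stands in for the filesystem's own get_block_t and is
 * shown here as a trivial 1:1 mapping just to make the sketch concrete.
 */
static int myfs_get_block(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create)
{
        /* hypothetical 1:1 block mapping */
        map_bh(bh_result, inode->i_sb, iblock);
        return 0;
}

static int myfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        *pagep = NULL;  /* let block_write_begin() grab and lock the page */
        return block_write_begin(file, mapping, pos, len, flags,
                                 pagep, fsdata, myfs_get_block);
}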
2017
2018 int block_write_end(struct file *file, struct address_space *mapping,
2019                         loff_t pos, unsigned len, unsigned copied,
2020                         struct page *page, void *fsdata)
2021 {
2022         struct inode *inode = mapping->host;
2023         unsigned start;
2024
2025         start = pos & (PAGE_CACHE_SIZE - 1);
2026
2027         if (unlikely(copied < len)) {
2028                 /*
2029                  * The buffers that were written will now be uptodate, so we
2030                  * don't have to worry about a readpage reading them and
2031                  * overwriting a partial write. However if we have encountered
2032                  * a short write and only partially written into a buffer, it
2033                  * will not be marked uptodate, so a readpage might come in and
2034                  * destroy our partial write.
2035                  *
2036                  * Do the simplest thing, and just treat any short write to a
2037                  * non uptodate page as a zero-length write, and force the
2038                  * caller to redo the whole thing.
2039                  */
2040                 if (!PageUptodate(page))
2041                         copied = 0;
2042
2043                 page_zero_new_buffers(page, start+copied, start+len);
2044         }
2045         flush_dcache_page(page);
2046
2047         /* This could be a short (even 0-length) commit */
2048         __block_commit_write(inode, page, start, start+copied);
2049
2050         return copied;
2051 }
2052 EXPORT_SYMBOL(block_write_end);
2053
2054 int generic_write_end(struct file *file, struct address_space *mapping,
2055                         loff_t pos, unsigned len, unsigned copied,
2056                         struct page *page, void *fsdata)
2057 {
2058         struct inode *inode = mapping->host;
2059         int i_size_changed = 0;
2060
2061         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2062
2063         /*
2064          * No need to use i_size_read() here, the i_size
2065          * cannot change under us because we hold i_mutex.
2066          *
2067          * But it's important to update i_size while still holding page lock:
2068          * page writeout could otherwise come in and zero beyond i_size.
2069          */
2070         if (pos+copied > inode->i_size) {
2071                 i_size_write(inode, pos+copied);
2072                 i_size_changed = 1;
2073         }
2074
2075         unlock_page(page);
2076         page_cache_release(page);
2077
2078         /*
2079          * Don't mark the inode dirty under page lock. First, it unnecessarily
2080          * makes the holding time of page lock longer. Second, it forces lock
2081          * ordering of page lock and transaction start for journaling
2082          * filesystems.
2083          */
2084         if (i_size_changed)
2085                 mark_inode_dirty(inode);
2086
2087         return copied;
2088 }
2089 EXPORT_SYMBOL(generic_write_end);
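
/*
 * Illustrative sketch (not part of this file): generic_write_end() (and
 * block_is_partially_uptodate() below) are usually wired directly into
 * the address_space_operations next to the filesystem's write_begin
 * wrapper.  The myfs_* names are hypothetical; myfs_write_begin is the
 * sketch shown after block_write_begin() above.
 */
static const struct address_space_operations myfs_file_aops = {
        .write_begin            = myfs_write_begin,
        .write_end              = generic_write_end,
        .is_partially_uptodate  = block_is_partially_uptodate,
};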
2090
2091 /*
2092  * block_is_partially_uptodate checks whether buffers within a page are
2093  * uptodate or not.
2094  *
2095  * Returns true if all buffers which correspond to a file portion
2096  * we want to read are uptodate.
2097  */
2098 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2099                                         unsigned long from)
2100 {
2101         struct inode *inode = page->mapping->host;
2102         unsigned block_start, block_end, blocksize;
2103         unsigned to;
2104         struct buffer_head *bh, *head;
2105         int ret = 1;
2106
2107         if (!page_has_buffers(page))
2108                 return 0;
2109
2110         blocksize = 1 << inode->i_blkbits;
2111         to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2112         to = from + to;
2113         if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2114                 return 0;
2115
2116         head = page_buffers(page);
2117         bh = head;
2118         block_start = 0;
2119         do {
2120                 block_end = block_start + blocksize;
2121                 if (block_end > from && block_start < to) {
2122                         if (!buffer_uptodate(bh)) {
2123                                 ret = 0;
2124                                 break;
2125                         }
2126                         if (block_end >= to)
2127                                 break;
2128                 }
2129                 block_start = block_end;
2130                 bh = bh->b_this_page;
2131         } while (bh != head);
2132
2133         return ret;
2134 }
2135 EXPORT_SYMBOL(block_is_partially_uptodate);
2136
2137 /*
2138  * Generic "read page" function for block devices that have the normal
2139  * get_block functionality. This covers most block-device-backed filesystems.
2140  * Reads the page asynchronously --- the unlock_buffer() and
2141  * set/clear_buffer_uptodate() functions propagate buffer state into the
2142  * page struct once IO has completed.
2143  */
2144 int block_read_full_page(struct page *page, get_block_t *get_block)
2145 {
2146         struct inode *inode = page->mapping->host;
2147         sector_t iblock, lblock;
2148         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2149         unsigned int blocksize;
2150         int nr, i;
2151         int fully_mapped = 1;
2152
2153         BUG_ON(!PageLocked(page));
2154         blocksize = 1 << inode->i_blkbits;
2155         if (!page_has_buffers(page))
2156                 create_empty_buffers(page, blocksize, 0);
2157         head = page_buffers(page);
2158
2159         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2160         lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2161         bh = head;
2162         nr = 0;
2163         i = 0;
2164
2165         do {
2166                 if (buffer_uptodate(bh))
2167                         continue;
2168
2169                 if (!buffer_mapped(bh)) {
2170                         int err = 0;
2171
2172                         fully_mapped = 0;
2173                         if (iblock < lblock) {
2174                                 WARN_ON(bh->b_size != blocksize);
2175                                 err = get_block(inode, iblock, bh, 0);
2176                                 if (err)
2177                                         SetPageError(page);
2178                         }
2179                         if (!buffer_mapped(bh)) {
2180                                 zero_user(page, i * blocksize, blocksize);
2181                                 if (!err)
2182                                         set_buffer_uptodate(bh);
2183                                 continue;
2184                         }
2185                         /*
2186                          * get_block() might have updated the buffer
2187                          * synchronously
2188                          */
2189                         if (buffer_uptodate(bh))
2190                                 continue;
2191                 }
2192                 arr[nr++] = bh;
2193         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2194
2195         if (fully_mapped)
2196                 SetPageMappedToDisk(page);
2197
2198         if (!nr) {
2199                 /*
2200                  * All buffers are uptodate - we can set the page uptodate
2201                  * as well. But not if get_block() returned an error.
2202                  */
2203                 if (!PageError(page))
2204                         SetPageUptodate(page);
2205                 unlock_page(page);
2206                 return 0;
2207         }
2208
2209         /* Stage two: lock the buffers */
2210         for (i = 0; i < nr; i++) {
2211                 bh = arr[i];
2212                 lock_buffer(bh);
2213                 mark_buffer_async_read(bh);
2214         }
2215
2216         /*
2217          * Stage 3: start the IO.  Check for uptodateness
2218          * inside the buffer lock in case another process reading
2219          * the underlying blockdev brought it uptodate (the sct fix).
2220          */
2221         for (i = 0; i < nr; i++) {
2222                 bh = arr[i];
2223                 if (buffer_uptodate(bh))
2224                         end_buffer_async_read(bh, 1);
2225                 else
2226                         submit_bh(READ, bh);
2227         }
2228         return 0;
2229 }
2230 EXPORT_SYMBOL(block_read_full_page);
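
/*
 * Illustrative sketch (not part of this file): the usual ->readpage
 * method is a one-line wrapper around block_read_full_page(); the block
 * device's own readpage in fs/block_dev.c has the same shape.  The
 * myfs_* names are hypothetical (myfs_get_block is the sketch above).
 */
static int myfs_readpage(struct file *file, struct page *page)
{
        return block_read_full_page(page, myfs_get_block);
}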
2231
2232 /* Utility function for filesystems that need to do work on expanding
2233  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2234  * deal with the hole.
2235  */
2236 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2237 {
2238         struct address_space *mapping = inode->i_mapping;
2239         struct page *page;
2240         void *fsdata;
2241         int err;
2242
2243         err = inode_newsize_ok(inode, size);
2244         if (err)
2245                 goto out;
2246
2247         err = pagecache_write_begin(NULL, mapping, size, 0,
2248                                 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2249                                 &page, &fsdata);
2250         if (err)
2251                 goto out;
2252
2253         err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2254         BUG_ON(err > 0);
2255
2256 out:
2257         return err;
2258 }
2259 EXPORT_SYMBOL(generic_cont_expand_simple);
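
/*
 * Illustrative sketch (not part of this file): a filesystem that cannot
 * represent holes might call generic_cont_expand_simple() from ->setattr
 * when a file is being grown, before committing the new attributes.  The
 * myfs_setattr name is hypothetical.
 */
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = dentry->d_inode;
        int err;

        if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
                err = generic_cont_expand_simple(inode, attr->ia_size);
                if (err)
                        return err;
        }
        return inode_setattr(inode, attr);
}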
2260
2261 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2262                             loff_t pos, loff_t *bytes)
2263 {
2264         struct inode *inode = mapping->host;
2265         unsigned blocksize = 1 << inode->i_blkbits;
2266         struct page *page;
2267         void *fsdata;
2268         pgoff_t index, curidx;
2269         loff_t curpos;
2270         unsigned zerofrom, offset, len;
2271         int err = 0;
2272
2273         index = pos >> PAGE_CACHE_SHIFT;
2274         offset = pos & ~PAGE_CACHE_MASK;
2275
2276         while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2277                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2278                 if (zerofrom & (blocksize-1)) {
2279                         *bytes |= (blocksize-1);
2280                         (*bytes)++;
2281                 }
2282                 len = PAGE_CACHE_SIZE - zerofrom;
2283
2284                 err = pagecache_write_begin(file, mapping, curpos, len,
2285                                                 AOP_FLAG_UNINTERRUPTIBLE,
2286                                                 &page, &fsdata);
2287                 if (err)
2288                         goto out;
2289                 zero_user(page, zerofrom, len);
2290                 err = pagecache_write_end(file, mapping, curpos, len, len,
2291                                                 page, fsdata);
2292                 if (err < 0)
2293                         goto out;
2294                 BUG_ON(err != len);
2295                 err = 0;
2296
2297                 balance_dirty_pages_ratelimited(mapping);
2298         }
2299
2300         /* page covers the boundary, find the boundary offset */
2301         if (index == curidx) {
2302                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2303                 /* if we are expanding the file, the last block will be filled */
2304                 if (offset <= zerofrom) {
2305                         goto out;
2306                 }
2307                 if (zerofrom & (blocksize-1)) {
2308                         *bytes |= (blocksize-1);
2309                         (*bytes)++;
2310                 }
2311                 len = offset - zerofrom;
2312
2313                 err = pagecache_write_begin(file, mapping, curpos, len,
2314                                                 AOP_FLAG_UNINTERRUPTIBLE,
2315                                                 &page, &fsdata);
2316                 if (err)
2317                         goto out;
2318                 zero_user(page, zerofrom, len);
2319                 err = pagecache_write_end(file, mapping, curpos, len, len,
2320                                                 page, fsdata);
2321                 if (err < 0)
2322                         goto out;
2323                 BUG_ON(err != len);
2324                 err = 0;
2325         }
2326 out:
2327         return err;
2328 }
2329
2330 /*
2331  * For moronic filesystems that do not allow holes in files.
2332  * We may have to extend the file.
2333  */
2334 int cont_write_begin(struct file *file, struct address_space *mapping,
2335                         loff_t pos, unsigned len, unsigned flags,
2336                         struct page **pagep, void **fsdata,
2337                         get_block_t *get_block, loff_t *bytes)
2338 {
2339         struct inode *inode = mapping->host;
2340         unsigned blocksize = 1 << inode->i_blkbits;
2341         unsigned zerofrom;
2342         int err;
2343
2344         err = cont_expand_zero(file, mapping, pos, bytes);
2345         if (err)
2346                 goto out;
2347
2348         zerofrom = *bytes & ~PAGE_CACHE_MASK;
2349         if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2350                 *bytes |= (blocksize-1);
2351                 (*bytes)++;
2352         }
2353
2354         *pagep = NULL;
2355         err = block_write_begin(file, mapping, pos, len,
2356                                 flags, pagep, fsdata, get_block);
2357 out:
2358         return err;
2359 }
2360 EXPORT_SYMBOL(cont_write_begin);
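
/*
 * Illustrative sketch (not part of this file): a hole-less filesystem's
 * ->write_begin forwards to cont_write_begin() with a pointer to its
 * "zeroed up to here" watermark (FAT keeps such a value in
 * MSDOS_I(inode)->mmu_private).  MYFS_I and i_zeroed_size are
 * hypothetical, as is myfs_get_block from the earlier sketch.
 */
static int myfs_cont_write_begin(struct file *file,
                        struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
{
        *pagep = NULL;
        return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
                                myfs_get_block,
                                &MYFS_I(mapping->host)->i_zeroed_size);
}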
2361
2362 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2363                         get_block_t *get_block)
2364 {
2365         struct inode *inode = page->mapping->host;
2366         int err = __block_prepare_write(inode, page, from, to, get_block);
2367         if (err)
2368                 ClearPageUptodate(page);
2369         return err;
2370 }
2371 EXPORT_SYMBOL(block_prepare_write);
2372
2373 int block_commit_write(struct page *page, unsigned from, unsigned to)
2374 {
2375         struct inode *inode = page->mapping->host;
2376         __block_commit_write(inode,page,from,to);
2377         return 0;
2378 }
2379 EXPORT_SYMBOL(block_commit_write);
2380
2381 /*
2382  * block_page_mkwrite() is not allowed to change the file size as it gets
2383  * called from a page fault handler when a page is first dirtied. Hence we must
2384  * be careful to check for EOF conditions here. We set the page up correctly
2385  * for a written page which means we get ENOSPC checking when writing into
2386  * holes and correct delalloc and unwritten extent mapping on filesystems that
2387  * support these features.
2388  *
2389  * We are not allowed to take the i_mutex here so we have to play games to
2390  * protect against truncate races as the page could now be beyond EOF.  Because
2391  * vmtruncate() writes the inode size before removing pages, once we have the
2392  * page lock we can determine safely if the page is beyond EOF. If it is not
2393  * beyond EOF, then the page is guaranteed safe against truncation until we
2394  * unlock the page.
2395  */
2396 int
2397 block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2398                    get_block_t get_block)
2399 {
2400         struct page *page = vmf->page;
2401         struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2402         unsigned long end;
2403         loff_t size;
2404         int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2405
2406         lock_page(page);
2407         size = i_size_read(inode);
2408         if ((page->mapping != inode->i_mapping) ||
2409             (page_offset(page) > size)) {
2410                 /* page got truncated out from underneath us */
2411                 unlock_page(page);
2412                 goto out;
2413         }
2414
2415         /* page is wholly or partially inside EOF */
2416         if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2417                 end = size & ~PAGE_CACHE_MASK;
2418         else
2419                 end = PAGE_CACHE_SIZE;
2420
2421         ret = block_prepare_write(page, 0, end, get_block);
2422         if (!ret)
2423                 ret = block_commit_write(page, 0, end);
2424
2425         if (unlikely(ret)) {
2426                 unlock_page(page);
2427                 if (ret == -ENOMEM)
2428                         ret = VM_FAULT_OOM;
2429                 else /* -ENOSPC, -EIO, etc */
2430                         ret = VM_FAULT_SIGBUS;
2431         } else
2432                 ret = VM_FAULT_LOCKED;
2433
2434 out:
2435         return ret;
2436 }
2437 EXPORT_SYMBOL(block_page_mkwrite);
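
/*
 * Illustrative sketch (not part of this file): a ->page_mkwrite handler
 * layered on block_page_mkwrite(), paired with the stock filemap_fault()
 * read-fault handler.  The myfs_* names are hypothetical.
 */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = myfs_page_mkwrite,
};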
2438
2439 /*
2440  * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2441  * immediately, while under the page lock.  So it needs a special end_io
2442  * handler which does not touch the bh after unlocking it.
2443  */
2444 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2445 {
2446         __end_buffer_read_notouch(bh, uptodate);
2447 }
2448
2449 /*
2450  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2451  * the page (converting it to circular linked list and taking care of page
2452  * dirty races).
2453  */
2454 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2455 {
2456         struct buffer_head *bh;
2457
2458         BUG_ON(!PageLocked(page));
2459
2460         spin_lock(&page->mapping->private_lock);
2461         bh = head;
2462         do {
2463                 if (PageDirty(page))
2464                         set_buffer_dirty(bh);
2465                 if (!bh->b_this_page)
2466                         bh->b_this_page = head;
2467                 bh = bh->b_this_page;
2468         } while (bh != head);
2469         attach_page_buffers(page, head);
2470         spin_unlock(&page->mapping->private_lock);
2471 }
2472
2473 /*
2474  * On entry, the page is fully not uptodate.
2475  * On exit the page is fully uptodate in the areas outside (from,to)
2476  */
2477 int nobh_write_begin(struct file *file, struct address_space *mapping,
2478                         loff_t pos, unsigned len, unsigned flags,
2479                         struct page **pagep, void **fsdata,
2480                         get_block_t *get_block)
2481 {
2482         struct inode *inode = mapping->host;
2483         const unsigned blkbits = inode->i_blkbits;
2484         const unsigned blocksize = 1 << blkbits;
2485         struct buffer_head *head, *bh;
2486         struct page *page;
2487         pgoff_t index;
2488         unsigned from, to;
2489         unsigned block_in_page;
2490         unsigned block_start, block_end;
2491         sector_t block_in_file;
2492         int nr_reads = 0;
2493         int ret = 0;
2494         int is_mapped_to_disk = 1;
2495
2496         index = pos >> PAGE_CACHE_SHIFT;
2497         from = pos & (PAGE_CACHE_SIZE - 1);
2498         to = from + len;
2499
2500         page = grab_cache_page_write_begin(mapping, index, flags);
2501         if (!page)
2502                 return -ENOMEM;
2503         *pagep = page;
2504         *fsdata = NULL;
2505
2506         if (page_has_buffers(page)) {
2507                 unlock_page(page);
2508                 page_cache_release(page);
2509                 *pagep = NULL;
2510                 return block_write_begin(file, mapping, pos, len, flags, pagep,
2511                                         fsdata, get_block);
2512         }
2513
2514         if (PageMappedToDisk(page))
2515                 return 0;
2516
2517         /*
2518          * Allocate buffers so that we can keep track of state, and potentially
2519          * attach them to the page if an error occurs. In the common case of
2520          * no error, they will just be freed again without ever being attached
2521          * to the page (which is all OK, because we're under the page lock).
2522          *
2523          * Be careful: the buffer linked list is a NULL terminated one, rather
2524          * than the circular one we're used to.
2525          */
2526         head = alloc_page_buffers(page, blocksize, 0);
2527         if (!head) {
2528                 ret = -ENOMEM;
2529                 goto out_release;
2530         }
2531
2532         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2533
2534         /*
2535          * We loop across all blocks in the page, whether or not they are
2536          * part of the affected region.  This is so we can discover if the
2537          * page is fully mapped-to-disk.
2538          */
2539         for (block_start = 0, block_in_page = 0, bh = head;
2540                   block_start < PAGE_CACHE_SIZE;
2541                   block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2542                 int create;
2543
2544                 block_end = block_start + blocksize;
2545                 bh->b_state = 0;
2546                 create = 1;
2547                 if (block_start >= to)
2548                         create = 0;
2549                 ret = get_block(inode, block_in_file + block_in_page,
2550                                         bh, create);
2551                 if (ret)
2552                         goto failed;
2553                 if (!buffer_mapped(bh))
2554                         is_mapped_to_disk = 0;
2555                 if (buffer_new(bh))
2556                         unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2557                 if (PageUptodate(page)) {
2558                         set_buffer_uptodate(bh);
2559                         continue;
2560                 }
2561                 if (buffer_new(bh) || !buffer_mapped(bh)) {
2562                         zero_user_segments(page, block_start, from,
2563                                                         to, block_end);
2564                         continue;
2565                 }
2566                 if (buffer_uptodate(bh))
2567                         continue;       /* reiserfs does this */
2568                 if (block_start < from || block_end > to) {
2569                         lock_buffer(bh);
2570                         bh->b_end_io = end_buffer_read_nobh;
2571                         submit_bh(READ, bh);
2572                         nr_reads++;
2573                 }
2574         }
2575
2576         if (nr_reads) {
2577                 /*
2578                  * The page is locked, so these buffers are protected from
2579                  * any VM or truncate activity.  Hence we don't need to care
2580                  * for the buffer_head refcounts.
2581                  */
2582                 for (bh = head; bh; bh = bh->b_this_page) {
2583                         wait_on_buffer(bh);
2584                         if (!buffer_uptodate(bh))
2585                                 ret = -EIO;
2586                 }
2587                 if (ret)
2588                         goto failed;
2589         }
2590
2591         if (is_mapped_to_disk)
2592                 SetPageMappedToDisk(page);
2593
2594         *fsdata = head; /* to be released by nobh_write_end */
2595
2596         return 0;
2597
2598 failed:
2599         BUG_ON(!ret);
2600         /*
2601          * Error recovery is a bit difficult. We need to zero out blocks that
2602          * were newly allocated, and dirty them to ensure they get written out.
2603          * Buffers need to be attached to the page at this point, otherwise
2604          * the handling of potential IO errors during writeout would be hard
2605          * (could try doing synchronous writeout, but what if that fails too?)
2606          */
2607         attach_nobh_buffers(page, head);
2608         page_zero_new_buffers(page, from, to);
2609
2610 out_release:
2611         unlock_page(page);
2612         page_cache_release(page);
2613         *pagep = NULL;
2614
2615         if (pos + len > inode->i_size)
2616                 vmtruncate(inode, inode->i_size);
2617
2618         return ret;
2619 }
2620 EXPORT_SYMBOL(nobh_write_begin);
2621
2622 int nobh_write_end(struct file *file, struct address_space *mapping,
2623                         loff_t pos, unsigned len, unsigned copied,
2624                         struct page *page, void *fsdata)
2625 {
2626         struct inode *inode = page->mapping->host;
2627         struct buffer_head *head = fsdata;
2628         struct buffer_head *bh;
2629         BUG_ON(fsdata != NULL && page_has_buffers(page));
2630
2631         if (unlikely(copied < len) && head)
2632                 attach_nobh_buffers(page, head);
2633         if (page_has_buffers(page))
2634                 return generic_write_end(file, mapping, pos, len,
2635                                         copied, page, fsdata);
2636
2637         SetPageUptodate(page);
2638         set_page_dirty(page);
2639         if (pos+copied > inode->i_size) {
2640                 i_size_write(inode, pos+copied);
2641                 mark_inode_dirty(inode);
2642         }
2643
2644         unlock_page(page);
2645         page_cache_release(page);
2646
2647         while (head) {
2648                 bh = head;
2649                 head = head->b_this_page;
2650                 free_buffer_head(bh);
2651         }
2652
2653         return copied;
2654 }
2655 EXPORT_SYMBOL(nobh_write_end);
2656
2657 /*
2658  * nobh_writepage() - based on block_write_full_page() except
2659  * that it tries to operate without attaching bufferheads to
2660  * the page.
2661  */
2662 int nobh_writepage(struct page *page, get_block_t *get_block,
2663                         struct writeback_control *wbc)
2664 {
2665         struct inode * const inode = page->mapping->host;
2666         loff_t i_size = i_size_read(inode);
2667         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2668         unsigned offset;
2669         int ret;
2670
2671         /* Is the page fully inside i_size? */
2672         if (page->index < end_index)
2673                 goto out;
2674
2675         /* Is the page fully outside i_size? (truncate in progress) */
2676         offset = i_size & (PAGE_CACHE_SIZE-1);
2677         if (page->index >= end_index+1 || !offset) {
2678                 /*
2679                  * The page may have dirty, unmapped buffers.  For example,
2680                  * they may have been added in ext3_writepage().  Make them
2681                  * freeable here, so the page does not leak.
2682                  */
2683 #if 0
2684                 /* Not really sure about this  - do we need this ? */
2685                 if (page->mapping->a_ops->invalidatepage)
2686                         page->mapping->a_ops->invalidatepage(page, offset);
2687 #endif
2688                 unlock_page(page);
2689                 return 0; /* don't care */
2690         }
2691
2692         /*
2693          * The page straddles i_size.  It must be zeroed out on each and every
2694          * writepage invocation because it may be mmapped.  "A file is mapped
2695          * in multiples of the page size.  For a file that is not a multiple of
2696          * the  page size, the remaining memory is zeroed when mapped, and
2697          * writes to that region are not written out to the file."
2698          */
2699         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2700 out:
2701         ret = mpage_writepage(page, get_block, wbc);
2702         if (ret == -EAGAIN)
2703                 ret = __block_write_full_page(inode, page, get_block, wbc,
2704                                               end_buffer_async_write);
2705         return ret;
2706 }
2707 EXPORT_SYMBOL(nobh_writepage);
2708
2709 int nobh_truncate_page(struct address_space *mapping,
2710                         loff_t from, get_block_t *get_block)
2711 {
2712         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2713         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2714         unsigned blocksize;
2715         sector_t iblock;
2716         unsigned length, pos;
2717         struct inode *inode = mapping->host;
2718         struct page *page;
2719         struct buffer_head map_bh;
2720         int err;
2721
2722         blocksize = 1 << inode->i_blkbits;
2723         length = offset & (blocksize - 1);
2724
2725         /* Block boundary? Nothing to do */
2726         if (!length)
2727                 return 0;
2728
2729         length = blocksize - length;
2730         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2731
2732         page = grab_cache_page(mapping, index);
2733         err = -ENOMEM;
2734         if (!page)
2735                 goto out;
2736
2737         if (page_has_buffers(page)) {
2738 has_buffers:
2739                 unlock_page(page);
2740                 page_cache_release(page);
2741                 return block_truncate_page(mapping, from, get_block);
2742         }
2743
2744         /* Find the buffer that contains "offset" */
2745         pos = blocksize;
2746         while (offset >= pos) {
2747                 iblock++;
2748                 pos += blocksize;
2749         }
2750
2751         map_bh.b_size = blocksize;
2752         map_bh.b_state = 0;
2753         err = get_block(inode, iblock, &map_bh, 0);
2754         if (err)
2755                 goto unlock;
2756         /* unmapped? It's a hole - nothing to do */
2757         if (!buffer_mapped(&map_bh))
2758                 goto unlock;
2759
2760         /* Ok, it's mapped. Make sure it's up-to-date */
2761         if (!PageUptodate(page)) {
2762                 err = mapping->a_ops->readpage(NULL, page);
2763                 if (err) {
2764                         page_cache_release(page);
2765                         goto out;
2766                 }
2767                 lock_page(page);
2768                 if (!PageUptodate(page)) {
2769                         err = -EIO;
2770                         goto unlock;
2771                 }
2772                 if (page_has_buffers(page))
2773                         goto has_buffers;
2774         }
2775         zero_user(page, offset, length);
2776         set_page_dirty(page);
2777         err = 0;
2778
2779 unlock:
2780         unlock_page(page);
2781         page_cache_release(page);
2782 out:
2783         return err;
2784 }
2785 EXPORT_SYMBOL(nobh_truncate_page);
2786
2787 int block_truncate_page(struct address_space *mapping,
2788                         loff_t from, get_block_t *get_block)
2789 {
2790         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2791         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2792         unsigned blocksize;
2793         sector_t iblock;
2794         unsigned length, pos;
2795         struct inode *inode = mapping->host;
2796         struct page *page;
2797         struct buffer_head *bh;
2798         int err;
2799
2800         blocksize = 1 << inode->i_blkbits;
2801         length = offset & (blocksize - 1);
2802
2803         /* Block boundary? Nothing to do */
2804         if (!length)
2805                 return 0;
2806
2807         length = blocksize - length;
2808         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2809         
2810         page = grab_cache_page(mapping, index);
2811         err = -ENOMEM;
2812         if (!page)
2813                 goto out;
2814
2815         if (!page_has_buffers(page))
2816                 create_empty_buffers(page, blocksize, 0);
2817
2818         /* Find the buffer that contains "offset" */
2819         bh = page_buffers(page);
2820         pos = blocksize;
2821         while (offset >= pos) {
2822                 bh = bh->b_this_page;
2823                 iblock++;
2824                 pos += blocksize;
2825         }
2826
2827         err = 0;
2828         if (!buffer_mapped(bh)) {
2829                 WARN_ON(bh->b_size != blocksize);
2830                 err = get_block(inode, iblock, bh, 0);
2831                 if (err)
2832                         goto unlock;
2833                 /* unmapped? It's a hole - nothing to do */
2834                 if (!buffer_mapped(bh))
2835                         goto unlock;
2836         }
2837
2838         /* Ok, it's mapped. Make sure it's up-to-date */
2839         if (PageUptodate(page))
2840                 set_buffer_uptodate(bh);
2841
2842         if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2843                 err = -EIO;
2844                 ll_rw_block(READ, 1, &bh);
2845                 wait_on_buffer(bh);
2846                 /* Uhhuh. Read error. Complain and punt. */
2847                 if (!buffer_uptodate(bh))
2848                         goto unlock;
2849         }
2850
2851         zero_user(page, offset, length);
2852         mark_buffer_dirty(bh);
2853         err = 0;
2854
2855 unlock:
2856         unlock_page(page);
2857         page_cache_release(page);
2858 out:
2859         return err;
2860 }
2861 EXPORT_SYMBOL(block_truncate_page);
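
/*
 * Illustrative sketch (not part of this file): block_truncate_page() is
 * typically called from the filesystem's truncate path to zero the tail
 * of a partial last block before the blocks beyond the new size are
 * freed.  The myfs_* names are hypothetical.
 */
static void myfs_truncate(struct inode *inode)
{
        /* a failure here means the partial last block could not be read */
        block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);

        /* ... free the on-disk blocks beyond the new i_size ... */
}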
2862
2863 /*
2864  * The generic ->writepage function for buffer-backed address_spaces
2865  * this form passes in the end_io handler used to finish the IO.
2866  */
2867 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2868                         struct writeback_control *wbc, bh_end_io_t *handler)
2869 {
2870         struct inode * const inode = page->mapping->host;
2871         loff_t i_size = i_size_read(inode);
2872         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2873         unsigned offset;
2874
2875         /* Is the page fully inside i_size? */
2876         if (page->index < end_index)
2877                 return __block_write_full_page(inode, page, get_block, wbc,
2878                                                handler);
2879
2880         /* Is the page fully outside i_size? (truncate in progress) */
2881         offset = i_size & (PAGE_CACHE_SIZE-1);
2882         if (page->index >= end_index+1 || !offset) {
2883                 /*
2884                  * The page may have dirty, unmapped buffers.  For example,
2885                  * they may have been added in ext3_writepage().  Make them
2886                  * freeable here, so the page does not leak.
2887                  */
2888                 do_invalidatepage(page, 0);
2889                 unlock_page(page);
2890                 return 0; /* don't care */
2891         }
2892
2893         /*
2894          * The page straddles i_size.  It must be zeroed out on each and every
2895          * writepage invocation because it may be mmapped.  "A file is mapped
2896          * in multiples of the page size.  For a file that is not a multiple of
2897          * the  page size, the remaining memory is zeroed when mapped, and
2898          * writes to that region are not written out to the file."
2899          */
2900         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2901         return __block_write_full_page(inode, page, get_block, wbc, handler);
2902 }
2903 EXPORT_SYMBOL(block_write_full_page_endio);
2904
2905 /*
2906  * The generic ->writepage function for buffer-backed address_spaces
2907  */
2908 int block_write_full_page(struct page *page, get_block_t *get_block,
2909                         struct writeback_control *wbc)
2910 {
2911         return block_write_full_page_endio(page, get_block, wbc,
2912                                            end_buffer_async_write);
2913 }
2914 EXPORT_SYMBOL(block_write_full_page);
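
/*
 * Illustrative sketch (editor's addition): the common case -- a filesystem
 * whose ->writepage is a thin wrapper around block_write_full_page() with
 * its own get_block, wired into its address_space_operations.  All myfs_*
 * names are hypothetical.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_aops = {
	.writepage	= myfs_writepage,
	.sync_page	= block_sync_page,
};
#endif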
2915
2916 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2917                             get_block_t *get_block)
2918 {
2919         struct buffer_head tmp;
2920         struct inode *inode = mapping->host;
2921         tmp.b_state = 0;
2922         tmp.b_blocknr = 0;
2923         tmp.b_size = 1 << inode->i_blkbits;
2924         get_block(inode, block, &tmp, 0);
2925         return tmp.b_blocknr;
2926 }
2927 EXPORT_SYMBOL(generic_block_bmap);
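
/*
 * Illustrative sketch (editor's addition): ->bmap implemented on top of
 * generic_block_bmap(), which is what most block-based filesystems do.
 * myfs_get_block is hypothetical.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif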
2928
2929 static void end_bio_bh_io_sync(struct bio *bio, int err)
2930 {
2931         struct buffer_head *bh = bio->bi_private;
2932
2933         if (err == -EOPNOTSUPP) {
2934                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2935                 set_bit(BH_Eopnotsupp, &bh->b_state);
2936         }
2937
2938         if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
2939                 set_bit(BH_Quiet, &bh->b_state);
2940
2941         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2942         bio_put(bio);
2943 }
2944
2945 int submit_bh(int rw, struct buffer_head * bh)
2946 {
2947         struct bio *bio;
2948         int ret = 0;
2949
2950         BUG_ON(!buffer_locked(bh));
2951         BUG_ON(!buffer_mapped(bh));
2952         BUG_ON(!bh->b_end_io);
2953         BUG_ON(buffer_delay(bh));
2954         BUG_ON(buffer_unwritten(bh));
2955
2956         /*
2957          * Mask in barrier bit for a write (could be either a WRITE or a
2958          * WRITE_SYNC).
2959          */
2960         if (buffer_ordered(bh) && (rw & WRITE))
2961                 rw |= WRITE_BARRIER;
2962
2963         /*
2964          * Only clear out a write error when rewriting
2965          */
2966         if (test_set_buffer_req(bh) && (rw & WRITE))
2967                 clear_buffer_write_io_error(bh);
2968
2969         /*
2970          * from here on down, it's all bio -- do the initial mapping,
2971          * submit_bio -> generic_make_request may further map this bio around
2972          */
2973         bio = bio_alloc(GFP_NOIO, 1);
2974
2975         bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2976         bio->bi_bdev = bh->b_bdev;
2977         bio->bi_io_vec[0].bv_page = bh->b_page;
2978         bio->bi_io_vec[0].bv_len = bh->b_size;
2979         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2980
2981         bio->bi_vcnt = 1;
2982         bio->bi_idx = 0;
2983         bio->bi_size = bh->b_size;
2984
2985         bio->bi_end_io = end_bio_bh_io_sync;
2986         bio->bi_private = bh;
2987
2988         bio_get(bio);
2989         submit_bio(rw, bio);
2990
2991         if (bio_flagged(bio, BIO_EOPNOTSUPP))
2992                 ret = -EOPNOTSUPP;
2993
2994         bio_put(bio);
2995         return ret;
2996 }
2997 EXPORT_SYMBOL(submit_bh);
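
/*
 * Illustrative sketch (editor's addition): writing one dirty metadata
 * buffer synchronously with submit_bh(), included to make the function's
 * preconditions concrete -- the buffer must be locked and mapped, b_end_io
 * must be set, and the caller holds a reference across the I/O.  This is
 * essentially what sync_dirty_buffer() below does (it uses WRITE_SYNC).
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_write_block_sync(struct buffer_head *bh)
{
	/* bh must already be mapped (b_bdev/b_blocknr set); see the BUG_ONs above. */
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);
		return 0;			/* nothing to write */
	}
	get_bh(bh);				/* reference held across the I/O */
	bh->b_end_io = end_buffer_write_sync;	/* unlocks and drops the ref */
	submit_bh(WRITE, bh);
	wait_on_buffer(bh);
	return buffer_uptodate(bh) ? 0 : -EIO;
}
#endif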
2998
2999 /**
3000  * ll_rw_block: low-level access to block devices (DEPRECATED)
3001  * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
3002  * @nr: number of &struct buffer_heads in the array
3003  * @bhs: array of pointers to &struct buffer_head
3004  *
3005  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3006  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3007  * option, %SWRITE, is like %WRITE except that it ensures the *current* data in
3008  * the buffers is sent to disk.  The fourth option, %READA, is described in the
3009  * documentation for generic_make_request(), which ll_rw_block() calls.
3010  *
3011  * This function drops any buffer that it cannot get a lock on (with the
3012  * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
3013  * clean when doing a write request, and any buffer that appears to be
3014  * up-to-date when doing a read request.  Further, it marks as clean buffers that
3015  * are processed for writing (the buffer cache won't assume that they are
3016  * actually clean until the buffer gets unlocked).
3017  *
3018  * ll_rw_block sets b_end_io to simple completion handler that marks
3019  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
3020  * any waiters. 
3021  *
3022  * All of the buffers must be for the same device, and must also be a
3023  * multiple of the current approved size for the device.
3024  */
3025 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3026 {
3027         int i;
3028
3029         for (i = 0; i < nr; i++) {
3030                 struct buffer_head *bh = bhs[i];
3031
3032                 if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
3033                         lock_buffer(bh);
3034                 else if (!trylock_buffer(bh))
3035                         continue;
3036
3037                 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
3038                     rw == SWRITE_SYNC_PLUG) {
3039                         if (test_clear_buffer_dirty(bh)) {
3040                                 bh->b_end_io = end_buffer_write_sync;
3041                                 get_bh(bh);
3042                                 if (rw == SWRITE_SYNC)
3043                                         submit_bh(WRITE_SYNC, bh);
3044                                 else
3045                                         submit_bh(WRITE, bh);
3046                                 continue;
3047                         }
3048                 } else {
3049                         if (!buffer_uptodate(bh)) {
3050                                 bh->b_end_io = end_buffer_read_sync;
3051                                 get_bh(bh);
3052                                 submit_bh(rw, bh);
3053                                 continue;
3054                         }
3055                 }
3056                 unlock_buffer(bh);
3057         }
3058 }
3059 EXPORT_SYMBOL(ll_rw_block);
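
/*
 * Illustrative sketch (editor's addition): the classic ll_rw_block()
 * pattern -- kick off reads for a batch of buffers, then wait on each one
 * and check the outcome.  myfs_read_buffers is a hypothetical helper.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i;

	/* Start reads; locked or already up-to-date buffers are skipped. */
	ll_rw_block(READ, nr, bhs);

	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}
#endif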
3060
3061 /*
3062  * For a data-integrity writeout, we need to wait upon any in-progress I/O
3063  * and then start new I/O and then wait upon it.  The caller must have a ref on
3064  * the buffer_head.
3065  */
3066 int sync_dirty_buffer(struct buffer_head *bh)
3067 {
3068         int ret = 0;
3069
3070         WARN_ON(atomic_read(&bh->b_count) < 1);
3071         lock_buffer(bh);
3072         if (test_clear_buffer_dirty(bh)) {
3073                 get_bh(bh);
3074                 bh->b_end_io = end_buffer_write_sync;
3075                 ret = submit_bh(WRITE_SYNC, bh);
3076                 wait_on_buffer(bh);
3077                 if (buffer_eopnotsupp(bh)) {
3078                         clear_buffer_eopnotsupp(bh);
3079                         ret = -EOPNOTSUPP;
3080                 }
3081                 if (!ret && !buffer_uptodate(bh))
3082                         ret = -EIO;
3083         } else {
3084                 unlock_buffer(bh);
3085         }
3086         return ret;
3087 }
3088 EXPORT_SYMBOL(sync_dirty_buffer);
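
/*
 * Illustrative sketch (editor's addition): a data-integrity write of a
 * single buffer, e.g. a superblock.  As the comment above requires, the
 * caller is assumed to already hold a reference on the buffer_head
 * (for instance from sb_bread()).
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_sync_super(struct buffer_head *sb_bh)
{
	mark_buffer_dirty(sb_bh);
	return sync_dirty_buffer(sb_bh);	/* waits for completion, 0 or -EIO */
}
#endif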
3089
3090 /*
3091  * try_to_free_buffers() checks if all the buffers on this particular page
3092  * are unused, and releases them if so.
3093  *
3094  * Exclusion against try_to_free_buffers may be obtained by either
3095  * locking the page or by holding its mapping's private_lock.
3096  *
3097  * If the page is dirty but all the buffers are clean then we need to
3098  * be sure to mark the page clean as well.  This is because the page
3099  * may be against a block device, and a later reattachment of buffers
3100  * to a dirty page will set *all* buffers dirty, which would corrupt
3101  * filesystem data on the same device.
3102  *
3103  * The same applies to regular filesystem pages: if all the buffers are
3104  * clean then we set the page clean and proceed.  To do that, we require
3105  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3106  * private_lock.
3107  *
3108  * try_to_free_buffers() is non-blocking.
3109  */
3110 static inline int buffer_busy(struct buffer_head *bh)
3111 {
3112         return atomic_read(&bh->b_count) |
3113                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3114 }
3115
3116 static int
3117 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3118 {
3119         struct buffer_head *head = page_buffers(page);
3120         struct buffer_head *bh;
3121
3122         bh = head;
3123         do {
3124                 if (buffer_write_io_error(bh) && page->mapping)
3125                         set_bit(AS_EIO, &page->mapping->flags);
3126                 if (buffer_busy(bh))
3127                         goto failed;
3128                 bh = bh->b_this_page;
3129         } while (bh != head);
3130
3131         do {
3132                 struct buffer_head *next = bh->b_this_page;
3133
3134                 if (bh->b_assoc_map)
3135                         __remove_assoc_queue(bh);
3136                 bh = next;
3137         } while (bh != head);
3138         *buffers_to_free = head;
3139         __clear_page_buffers(page);
3140         return 1;
3141 failed:
3142         return 0;
3143 }
3144
3145 int try_to_free_buffers(struct page *page)
3146 {
3147         struct address_space * const mapping = page->mapping;
3148         struct buffer_head *buffers_to_free = NULL;
3149         int ret = 0;
3150
3151         BUG_ON(!PageLocked(page));
3152         if (PageWriteback(page))
3153                 return 0;
3154
3155         if (mapping == NULL) {          /* can this still happen? */
3156                 ret = drop_buffers(page, &buffers_to_free);
3157                 goto out;
3158         }
3159
3160         spin_lock(&mapping->private_lock);
3161         ret = drop_buffers(page, &buffers_to_free);
3162
3163         /*
3164          * If the filesystem writes its buffers by hand (eg ext3)
3165          * then we can have clean buffers against a dirty page.  We
3166          * clean the page here; otherwise the VM will never notice
3167          * that the filesystem did any IO at all.
3168          *
3169          * Also, during truncate, discard_buffer will have marked all
3170          * the page's buffers clean.  We discover that here and clean
3171          * the page also.
3172          *
3173          * private_lock must be held over this entire operation in order
3174          * to synchronise against __set_page_dirty_buffers and prevent the
3175          * dirty bit from being lost.
3176          */
3177         if (ret)
3178                 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3179         spin_unlock(&mapping->private_lock);
3180 out:
3181         if (buffers_to_free) {
3182                 struct buffer_head *bh = buffers_to_free;
3183
3184                 do {
3185                         struct buffer_head *next = bh->b_this_page;
3186                         free_buffer_head(bh);
3187                         bh = next;
3188                 } while (bh != buffers_to_free);
3189         }
3190         return ret;
3191 }
3192 EXPORT_SYMBOL(try_to_free_buffers);
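
/*
 * Illustrative sketch (editor's addition): the usual way this function is
 * reached for buffer-backed pages -- a ->releasepage that falls back to
 * try_to_free_buffers().  myfs_releasepage is hypothetical.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
{
	/*
	 * try_to_free_buffers() returns 0 and frees nothing if any buffer
	 * is still busy (referenced, dirty or locked).
	 */
	return try_to_free_buffers(page);
}
#endif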
3193
3194 void block_sync_page(struct page *page)
3195 {
3196         struct address_space *mapping;
3197
3198         smp_mb();
3199         mapping = page_mapping(page);
3200         if (mapping)
3201                 blk_run_backing_dev(mapping->backing_dev_info, page);
3202 }
3203 EXPORT_SYMBOL(block_sync_page);
3204
3205 /*
3206  * There are no bdflush tunables left.  But distributions are
3207  * still running obsolete flush daemons, so we terminate them here.
3208  *
3209  * Use of bdflush() is deprecated and will be removed in a future kernel.
3210  * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3211  */
3212 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3213 {
3214         static int msg_count;
3215
3216         if (!capable(CAP_SYS_ADMIN))
3217                 return -EPERM;
3218
3219         if (msg_count < 5) {
3220                 msg_count++;
3221                 printk(KERN_INFO
3222                         "warning: process `%s' used the obsolete bdflush"
3223                         " system call\n", current->comm);
3224                 printk(KERN_INFO "Fix your initscripts?\n");
3225         }
3226
3227         if (func == 1)
3228                 do_exit(0);
3229         return 0;
3230 }
3231
3232 /*
3233  * Buffer-head allocation
3234  */
3235 static struct kmem_cache *bh_cachep;
3236
3237 /*
3238  * Once the number of bh's in the machine exceeds this level, we start
3239  * stripping them in writeback.
3240  */
3241 static int max_buffer_heads;
3242
3243 int buffer_heads_over_limit;
3244
3245 struct bh_accounting {
3246         int nr;                 /* Number of live bh's */
3247         int ratelimit;          /* Limit cacheline bouncing */
3248 };
3249
3250 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3251
3252 static void recalc_bh_state(void)
3253 {
3254         int i;
3255         int tot = 0;
3256
3257         if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3258                 return;
3259         __get_cpu_var(bh_accounting).ratelimit = 0;
3260         for_each_online_cpu(i)
3261                 tot += per_cpu(bh_accounting, i).nr;
3262         buffer_heads_over_limit = (tot > max_buffer_heads);
3263 }
3264
3265 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3266 {
3267         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3268         if (ret) {
3269                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3270                 get_cpu_var(bh_accounting).nr++;
3271                 recalc_bh_state();
3272                 put_cpu_var(bh_accounting);
3273         }
3274         return ret;
3275 }
3276 EXPORT_SYMBOL(alloc_buffer_head);
3277
3278 void free_buffer_head(struct buffer_head *bh)
3279 {
3280         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3281         kmem_cache_free(bh_cachep, bh);
3282         get_cpu_var(bh_accounting).nr--;
3283         recalc_bh_state();
3284         put_cpu_var(bh_accounting);
3285 }
3286 EXPORT_SYMBOL(free_buffer_head);
3287
3288 static void buffer_exit_cpu(int cpu)
3289 {
3290         int i;
3291         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3292
3293         for (i = 0; i < BH_LRU_SIZE; i++) {
3294                 brelse(b->bhs[i]);
3295                 b->bhs[i] = NULL;
3296         }
3297         get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3298         per_cpu(bh_accounting, cpu).nr = 0;
3299         put_cpu_var(bh_accounting);
3300 }
3301
3302 static int buffer_cpu_notify(struct notifier_block *self,
3303                               unsigned long action, void *hcpu)
3304 {
3305         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3306                 buffer_exit_cpu((unsigned long)hcpu);
3307         return NOTIFY_OK;
3308 }
3309
3310 /**
3311  * bh_uptodate_or_lock - Test whether the buffer is uptodate
3312  * @bh: struct buffer_head
3313  *
3314  * Return true if the buffer is up-to-date; otherwise return false
3315  * with the buffer locked.
3316  */
3317 int bh_uptodate_or_lock(struct buffer_head *bh)
3318 {
3319         if (!buffer_uptodate(bh)) {
3320                 lock_buffer(bh);
3321                 if (!buffer_uptodate(bh))
3322                         return 0;
3323                 unlock_buffer(bh);
3324         }
3325         return 1;
3326 }
3327 EXPORT_SYMBOL(bh_uptodate_or_lock);
3328
3329 /**
3330  * bh_submit_read - Submit a locked buffer for reading
3331  * @bh: struct buffer_head
3332  *
3333  * Returns zero on success and -EIO on error.
3334  */
3335 int bh_submit_read(struct buffer_head *bh)
3336 {
3337         BUG_ON(!buffer_locked(bh));
3338
3339         if (buffer_uptodate(bh)) {
3340                 unlock_buffer(bh);
3341                 return 0;
3342         }
3343
3344         get_bh(bh);
3345         bh->b_end_io = end_buffer_read_sync;
3346         submit_bh(READ, bh);
3347         wait_on_buffer(bh);
3348         if (buffer_uptodate(bh))
3349                 return 0;
3350         return -EIO;
3351 }
3352 EXPORT_SYMBOL(bh_submit_read);
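
/*
 * Illustrative sketch (editor's addition): the intended pairing of
 * bh_uptodate_or_lock() and bh_submit_read().  If the buffer is already
 * up to date we return without touching the lock; otherwise we are left
 * holding the lock, which bh_submit_read() consumes.
 */
#if 0	/* illustrative sketch only -- not compiled */
static int myfs_read_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;			/* fast path, no I/O needed */
	return bh_submit_read(bh);		/* reads and waits, 0 or -EIO */
}
#endif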
3353
3354 void __init buffer_init(void)
3355 {
3356         int nrpages;
3357
3358         bh_cachep = kmem_cache_create("buffer_head",
3359                         sizeof(struct buffer_head), 0,
3360                                 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3361                                 SLAB_MEM_SPREAD),
3362                                 NULL);
3363
3364         /*
3365          * Limit the bh occupancy to 10% of ZONE_NORMAL
3366          */
3367         nrpages = (nr_free_buffer_pages() * 10) / 100;
3368         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3369         hotcpu_notifier(buffer_cpu_notify, 0);
3370 }
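
/*
 * Worked example for the max_buffer_heads computation in buffer_init()
 * above (editor's addition; figures are approximate and configuration
 * dependent).  With 4KB pages, roughly 39 buffer_heads fit in a page
 * (sizeof(struct buffer_head) is a little over 100 bytes on 64-bit).
 * On a machine with 1,000,000 free buffer pages, nrpages is 100,000 and
 * max_buffer_heads works out to about 3.9 million bh's before
 * buffer_heads_over_limit trips.
 */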