[XFS] Flush the block device before closing it on unmount.
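xfs_flush_buftarg() only pushes XFS's own delayed-write buffers down to the
block layer; completed writes can still sit in the device's volatile write
cache when the buftarg is torn down, so xfs_free_buftarg() now calls
xfs_blkdev_issue_flush() before the block device is closed.  That helper is
not defined in this file; a minimal sketch of what it is assumed to look
like, wrapping the 2.6-era blkdev_issue_flush(bdev, error_sector) block
layer call:

	void
	xfs_blkdev_issue_flush(
		xfs_buftarg_t		*buftarg)
	{
		/* force the device's volatile write cache to stable storage */
		blkdev_issue_flush(buftarg->bt_bdev, NULL);
	}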
fs/xfs/linux-2.6/xfs_buf.c
index bfb4f29..2df6362 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
@@ -15,6 +15,7 @@
  * along with this program; if not, write the Free Software Foundation,
  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
+#include "xfs.h"
 #include <linux/stddef.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/hash.h>
 #include <linux/kthread.h>
-#include "xfs_linux.h"
+#include <linux/migrate.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
 
-STATIC kmem_zone_t *xfs_buf_zone;
-STATIC kmem_shaker_t xfs_buf_shake;
+static kmem_zone_t *xfs_buf_zone;
+static struct shrinker *xfs_buf_shake;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 
-STATIC struct workqueue_struct *xfslogd_workqueue;
+static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
 
 #ifdef XFS_BUF_TRACE
@@ -136,7 +139,7 @@ page_region_mask(
        return mask;
 }
 
-STATIC inline void
+STATIC_INLINE void
 set_page_region(
        struct page     *page,
        size_t          offset,
@@ -148,7 +151,7 @@ set_page_region(
                SetPageUptodate(page);
 }
 
-STATIC inline int
+STATIC_INLINE int
 test_page_region(
        struct page     *page,
        size_t          offset,
@@ -168,9 +171,9 @@ typedef struct a_list {
        struct a_list   *next;
 } a_list_t;
 
-STATIC a_list_t                *as_free_head;
-STATIC int             as_list_len;
-STATIC DEFINE_SPINLOCK(as_lock);
+static a_list_t                *as_free_head;
+static int             as_list_len;
+static DEFINE_SPINLOCK(as_lock);
 
 /*
  *     Try to batch vunmaps because they are costly.
@@ -181,7 +184,7 @@ free_address(
 {
        a_list_t        *aentry;
 
-       aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
+       aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
        if (likely(aentry)) {
                spin_lock(&as_lock);
                aentry->next = as_free_head;
@@ -311,22 +314,19 @@ xfs_buf_free(
 
        ASSERT(list_empty(&bp->b_hash_list));
 
-       if (bp->b_flags & _XBF_PAGE_CACHE) {
+       if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
                uint            i;
 
                if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
                        free_address(bp->b_addr - bp->b_offset);
 
-               for (i = 0; i < bp->b_page_count; i++)
-                       page_cache_release(bp->b_pages[i]);
-               _xfs_buf_free_pages(bp);
-       } else if (bp->b_flags & _XBF_KMEM_ALLOC) {
-                /*
-                 * XXX(hch): bp->b_count_desired might be incorrect (see
-                 * xfs_buf_associate_memory for details), but fortunately
-                 * the Linux version of kmem_free ignores the len argument..
-                 */
-               kmem_free(bp->b_addr, bp->b_count_desired);
+               for (i = 0; i < bp->b_page_count; i++) {
+                       struct page     *page = bp->b_pages[i];
+
+                       if (bp->b_flags & _XBF_PAGE_CACHE)
+                               ASSERT(!PagePrivate(page));
+                       page_cache_release(page);
+               }
                _xfs_buf_free_pages(bp);
        }
 
@@ -390,7 +390,7 @@ _xfs_buf_lookup_pages(
 
                        XFS_STATS_INC(xb_page_retries);
                        xfsbufd_wakeup(0, gfp_mask);
-                       blk_congestion_wait(WRITE, HZ/50);
+                       congestion_wait(WRITE, HZ/50);
                        goto retry;
                }
 
@@ -399,6 +399,7 @@ _xfs_buf_lookup_pages(
                nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
                size -= nbytes;
 
+               ASSERT(!PagePrivate(page));
                if (!PageUptodate(page)) {
                        page_count--;
                        if (blocksize >= PAGE_CACHE_SIZE) {
@@ -756,43 +757,44 @@ xfs_buf_get_noaddr(
        size_t                  len,
        xfs_buftarg_t           *target)
 {
-       size_t                  malloc_len = len;
+       unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+       int                     error, i;
        xfs_buf_t               *bp;
-       void                    *data;
-       int                     error;
 
        bp = xfs_buf_allocate(0);
        if (unlikely(bp == NULL))
                goto fail;
        _xfs_buf_initialize(bp, target, 0, len, 0);
 
- try_again:
-       data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
-       if (unlikely(data == NULL))
+       error = _xfs_buf_get_pages(bp, page_count, 0);
+       if (error)
                goto fail_free_buf;
 
-       /* check whether alignment matches.. */
-       if ((__psunsigned_t)data !=
-           ((__psunsigned_t)data & ~target->bt_smask)) {
-               /* .. else double the size and try again */
-               kmem_free(data, malloc_len);
-               malloc_len <<= 1;
-               goto try_again;
+       for (i = 0; i < page_count; i++) {
+               bp->b_pages[i] = alloc_page(GFP_KERNEL);
+               if (!bp->b_pages[i])
+                       goto fail_free_mem;
        }
+       bp->b_flags |= _XBF_PAGES;
 
-       error = xfs_buf_associate_memory(bp, data, len);
-       if (error)
+       error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+       if (unlikely(error)) {
+               printk(KERN_WARNING "%s: failed to map pages\n",
+                               __FUNCTION__);
                goto fail_free_mem;
-       bp->b_flags |= _XBF_KMEM_ALLOC;
+       }
 
        xfs_buf_unlock(bp);
 
-       XB_TRACE(bp, "no_daddr", data);
+       XB_TRACE(bp, "no_daddr", len);
        return bp;
+
  fail_free_mem:
-       kmem_free(data, malloc_len);
+       while (--i >= 0)
+               __free_page(bp->b_pages[i]);
+       _xfs_buf_free_pages(bp);
  fail_free_buf:
-       xfs_buf_free(bp);
+       xfs_buf_deallocate(bp);
  fail:
        return NULL;
 }
@@ -987,9 +989,10 @@ xfs_buf_wait_unpin(
 
 STATIC void
 xfs_buf_iodone_work(
-       void                    *v)
+       struct work_struct      *work)
 {
-       xfs_buf_t               *bp = (xfs_buf_t *)v;
+       xfs_buf_t               *bp =
+               container_of(work, xfs_buf_t, b_iodone_work);
 
        if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
@@ -1010,10 +1013,10 @@ xfs_buf_ioend(
 
        if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
                if (schedule) {
-                       INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp);
+                       INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
                        queue_work(xfslogd_workqueue, &bp->b_iodone_work);
                } else {
-                       xfs_buf_iodone_work(bp);
+                       xfs_buf_iodone_work(&bp->b_iodone_work);
                }
        } else {
                up(&bp->b_iodonesema);
@@ -1076,7 +1079,7 @@ xfs_buf_iostart(
        return status;
 }
 
-STATIC __inline__ int
+STATIC_INLINE int
 _xfs_buf_iolocked(
        xfs_buf_t               *bp)
 {
@@ -1086,7 +1089,7 @@ _xfs_buf_iolocked(
        return 0;
 }
 
-STATIC __inline__ void
+STATIC_INLINE void
 _xfs_buf_ioend(
        xfs_buf_t               *bp,
        int                     schedule)
@@ -1116,10 +1119,10 @@ xfs_buf_bio_end_io(
        do {
                struct page     *page = bvec->bv_page;
 
+               ASSERT(!PagePrivate(page));
                if (unlikely(bp->b_error)) {
                        if (bp->b_flags & XBF_READ)
                                ClearPageUptodate(page);
-                       SetPageError(page);
                } else if (blocksize >= PAGE_CACHE_SIZE) {
                        SetPageUptodate(page);
                } else if (!PagePrivate(page) &&
@@ -1155,16 +1158,16 @@ _xfs_buf_ioapply(
        total_nr_pages = bp->b_page_count;
        map_i = 0;
 
-       if (bp->b_flags & _XBF_RUN_QUEUES) {
-               bp->b_flags &= ~_XBF_RUN_QUEUES;
-               rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC;
-       } else {
-               rw = (bp->b_flags & XBF_READ) ? READ : WRITE;
-       }
-
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
                rw = WRITE_BARRIER;
+       } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+               ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+               bp->b_flags &= ~_XBF_RUN_QUEUES;
+               rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+       } else {
+               rw = (bp->b_flags & XBF_WRITE) ? WRITE :
+                    (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
        }
 
        /* Special code path for reading a sub page size buffer in --
@@ -1399,7 +1402,7 @@ xfs_alloc_bufhash(
        btp->bt_hashshift = external ? 3 : 8;   /* 8 or 256 buckets */
        btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
        btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-                                       sizeof(xfs_bufhash_t), KM_SLEEP);
+                                       sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
                spin_lock_init(&btp->bt_hash[i].bh_lock);
                INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1417,8 +1420,8 @@ xfs_free_bufhash(
 /*
  *     buftarg list for delwrite queue processing
  */
-STATIC LIST_HEAD(xfs_buftarg_list);
-STATIC DEFINE_SPINLOCK(xfs_buftarg_lock);
+static LIST_HEAD(xfs_buftarg_list);
+static DEFINE_SPINLOCK(xfs_buftarg_lock);
 
 STATIC void
 xfs_register_buftarg(
@@ -1444,6 +1447,7 @@ xfs_free_buftarg(
        int                     external)
 {
        xfs_flush_buftarg(btp, 1);
+       xfs_blkdev_issue_flush(btp);
        if (external)
                xfs_blkdev_put(btp->bt_bdev);
        xfs_free_bufhash(btp);
@@ -1519,7 +1523,7 @@ xfs_mapping_buftarg(
        struct backing_dev_info *bdi;
        struct inode            *inode;
        struct address_space    *mapping;
-       static struct address_space_operations mapping_aops = {
+       static const struct address_space_operations mapping_aops = {
                .sync_page = block_sync_page,
                .migratepage = fail_migrate_page,
        };
@@ -1670,20 +1674,60 @@ xfsbufd_wakeup(
        return 0;
 }
 
+/*
+ * Move as many buffers as specified to the supplied list,
+ * indicating if we skipped any buffers to prevent deadlocks.
+ */
+STATIC int
+xfs_buf_delwri_split(
+       xfs_buftarg_t   *target,
+       struct list_head *list,
+       unsigned long   age)
+{
+       xfs_buf_t       *bp, *n;
+       struct list_head *dwq = &target->bt_delwrite_queue;
+       spinlock_t      *dwlk = &target->bt_delwrite_lock;
+       int             skipped = 0;
+       int             force;
+
+       force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+       INIT_LIST_HEAD(list);
+       spin_lock(dwlk);
+       list_for_each_entry_safe(bp, n, dwq, b_list) {
+               XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
+               ASSERT(bp->b_flags & XBF_DELWRI);
+
+               if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
+                       if (!force &&
+                           time_before(jiffies, bp->b_queuetime + age)) {
+                               xfs_buf_unlock(bp);
+                               break;
+                       }
+
+                       bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
+                                        _XBF_RUN_QUEUES);
+                       bp->b_flags |= XBF_WRITE;
+                       list_move_tail(&bp->b_list, list);
+               } else
+                       skipped++;
+       }
+       spin_unlock(dwlk);
+
+       return skipped;
+
+}
+
 STATIC int
 xfsbufd(
-       void                    *data)
+       void            *data)
 {
-       struct list_head        tmp;
-       unsigned long           age;
-       xfs_buftarg_t           *target = (xfs_buftarg_t *)data;
-       xfs_buf_t               *bp, *n;
-       struct list_head        *dwq = &target->bt_delwrite_queue;
-       spinlock_t              *dwlk = &target->bt_delwrite_lock;
+       struct list_head tmp;
+       xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
+       int             count;
+       xfs_buf_t       *bp;
 
        current->flags |= PF_MEMALLOC;
 
-       INIT_LIST_HEAD(&tmp);
        do {
                if (unlikely(freezing(current))) {
                        set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1695,42 +1739,24 @@ xfsbufd(
                schedule_timeout_interruptible(
                        xfs_buf_timer_centisecs * msecs_to_jiffies(10));
 
-               age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
-               spin_lock(dwlk);
-               list_for_each_entry_safe(bp, n, dwq, b_list) {
-                       XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
-                       ASSERT(bp->b_flags & XBF_DELWRI);
-
-                       if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
-                               if (!test_bit(XBT_FORCE_FLUSH,
-                                               &target->bt_flags) &&
-                                   time_before(jiffies,
-                                               bp->b_queuetime + age)) {
-                                       xfs_buf_unlock(bp);
-                                       break;
-                               }
-
-                               bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-                               bp->b_flags |= XBF_WRITE;
-                               list_move(&bp->b_list, &tmp);
-                       }
-               }
-               spin_unlock(dwlk);
+               xfs_buf_delwri_split(target, &tmp,
+                               xfs_buf_age_centisecs * msecs_to_jiffies(10));
 
+               count = 0;
                while (!list_empty(&tmp)) {
                        bp = list_entry(tmp.next, xfs_buf_t, b_list);
                        ASSERT(target == bp->b_target);
 
                        list_del_init(&bp->b_list);
                        xfs_buf_iostrategy(bp);
-
-                       blk_run_address_space(target->bt_mapping);
+                       count++;
                }
 
                if (as_list_len > 0)
                        purge_addresses();
+               if (count)
+                       blk_run_address_space(target->bt_mapping);
 
-               clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
        } while (!kthread_should_stop());
 
        return 0;
@@ -1743,40 +1769,24 @@ xfsbufd(
  */
 int
 xfs_flush_buftarg(
-       xfs_buftarg_t           *target,
-       int                     wait)
+       xfs_buftarg_t   *target,
+       int             wait)
 {
-       struct list_head        tmp;
-       xfs_buf_t               *bp, *n;
-       int                     pincount = 0;
-       struct list_head        *dwq = &target->bt_delwrite_queue;
-       spinlock_t              *dwlk = &target->bt_delwrite_lock;
+       struct list_head tmp;
+       xfs_buf_t       *bp, *n;
+       int             pincount = 0;
 
        xfs_buf_runall_queues(xfsdatad_workqueue);
        xfs_buf_runall_queues(xfslogd_workqueue);
 
-       INIT_LIST_HEAD(&tmp);
-       spin_lock(dwlk);
-       list_for_each_entry_safe(bp, n, dwq, b_list) {
-               ASSERT(bp->b_target == target);
-               ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q));
-               XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp));
-               if (xfs_buf_ispin(bp)) {
-                       pincount++;
-                       continue;
-               }
-
-               list_move(&bp->b_list, &tmp);
-       }
-       spin_unlock(dwlk);
+       set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+       pincount = xfs_buf_delwri_split(target, &tmp, 0);
 
        /*
         * Dropped the delayed write list lock, now walk the temporary list
         */
        list_for_each_entry_safe(bp, n, &tmp, b_list) {
-               xfs_buf_lock(bp);
-               bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-               bp->b_flags |= XBF_WRITE;
+               ASSERT(target == bp->b_target);
                if (wait)
                        bp->b_flags &= ~XBF_ASYNC;
                else
@@ -1785,6 +1795,9 @@ xfs_flush_buftarg(
                xfs_buf_iostrategy(bp);
        }
 
+       if (wait)
+               blk_run_address_space(target->bt_mapping);
+
        /*
         * Remaining list items must be flushed before returning
         */
@@ -1796,22 +1809,18 @@ xfs_flush_buftarg(
                xfs_buf_relse(bp);
        }
 
-       if (wait)
-               blk_run_address_space(target->bt_mapping);
-
        return pincount;
 }
 
 int __init
 xfs_buf_init(void)
 {
-       int             error = -ENOMEM;
-
 #ifdef XFS_BUF_TRACE
        xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
 #endif
 
-       xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
+       xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+                                               KM_ZONE_HWALIGN, NULL);
        if (!xfs_buf_zone)
                goto out_free_trace_buf;
 
@@ -1823,7 +1832,7 @@ xfs_buf_init(void)
        if (!xfsdatad_workqueue)
                goto out_destroy_xfslogd_workqueue;
 
-       xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup);
+       xfs_buf_shake = set_shrinker(DEFAULT_SEEKS, xfsbufd_wakeup);
        if (!xfs_buf_shake)
                goto out_destroy_xfsdatad_workqueue;
 
@@ -1839,13 +1848,13 @@ xfs_buf_init(void)
 #ifdef XFS_BUF_TRACE
        ktrace_free(xfs_buf_trace_buf);
 #endif
-       return error;
+       return -ENOMEM;
 }
 
 void
 xfs_buf_terminate(void)
 {
-       kmem_shake_deregister(xfs_buf_shake);
+       remove_shrinker(xfs_buf_shake);
        destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
        kmem_zone_destroy(xfs_buf_zone);
@@ -1853,3 +1862,11 @@ xfs_buf_terminate(void)
        ktrace_free(xfs_buf_trace_buf);
 #endif
 }
+
+#ifdef CONFIG_KDB_MODULES
+struct list_head *
+xfs_get_buftarg_list(void)
+{
+       return &xfs_buftarg_list;
+}
+#endif