anon-inodes use open coded atomic_inc for the shared inode
[safe/jmp/linux-2.6] / fs / gfs2 / ops_address.c
index 38b702a..9679f8b 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -16,6 +16,8 @@
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
 
@@ -31,6 +33,7 @@
 #include "trans.h"
 #include "rgrp.h"
 #include "ops_file.h"
+#include "super.h"
 #include "util.h"
 #include "glops.h"
 
@@ -48,6 +51,8 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
                end = start + bsize;
                if (end <= from || start >= to)
                        continue;
+               if (gfs2_is_jdata(ip))
+                       set_buffer_uptodate(bh);
                gfs2_trans_add_bh(ip->i_gl, bh, 0);
        }
 }
@@ -86,7 +91,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
        error = gfs2_block_map(inode, lblock, 0, bh_result);
        if (error)
                return error;
-       if (bh_result->b_blocknr == 0)
+       if (!buffer_mapped(bh_result))
                return -EIO;
        return 0;
 }
@@ -133,7 +138,9 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
                return 0; /* don't care */
        }
 
-       if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) {
+       if ((sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) &&
+           PageChecked(page)) {
+               ClearPageChecked(page);
                error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
                if (error)
                        goto out_ignore;
@@ -156,17 +163,30 @@ out_ignore:
        return 0;
 }
 
-static int zero_readpage(struct page *page)
+/**
+ * gfs2_writepages - Write a bunch of dirty pages back to disk
+ * @mapping: The mapping to write
+ * @wbc: Write-back control
+ *
+ * For journaled files and/or ordered writes this just falls back to the
+ * kernel's default writepages path for now. We will probably want to change
+ * that eventually (i.e. when we look at allocate on flush).
+ *
+ * For the data=writeback case though we can already ignore buffer heads
+ * and write whole extents at once. This is a big reduction in the
+ * number of I/O requests we send and the bmap calls we make in this case.
+ */
+static int gfs2_writepages(struct address_space *mapping,
+                          struct writeback_control *wbc)
 {
-       void *kaddr;
-
-       kaddr = kmap_atomic(page, KM_USER0);
-       memset(kaddr, 0, PAGE_CACHE_SIZE);
-       kunmap_atomic(kaddr, KM_USER0);
+       struct inode *inode = mapping->host;
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
 
-       SetPageUptodate(page);
+       if (sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK && !gfs2_is_jdata(ip))
+               return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc);
 
-       return 0;
+       return generic_writepages(mapping, wbc);
 }
 
 /**
@@ -183,9 +203,15 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
        void *kaddr;
        int error;
 
-       /* Only the first page of a stuffed file might contain data */
-       if (unlikely(page->index))
-               return zero_readpage(page);
+       /*
+        * Due to the order of unstuffing files and ->nopage(), we can be
+        * asked for a zero page in the case of a stuffed file being extended,
+        * so we need to supply one here. It doesn't happen often.
+        */
+       if (unlikely(page->index)) {
+               zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0);
+               return 0;
+       }
 
        error = gfs2_meta_inode_buffer(ip, &dibh);
        if (error)
@@ -196,9 +222,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
               ip->i_di.di_size);
        memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
        kunmap_atomic(kaddr, KM_USER0);
-
+       flush_dcache_page(page);
        brelse(dibh);
-
        SetPageUptodate(page);
 
        return 0;
@@ -227,12 +252,12 @@ static int gfs2_readpage(struct file *file, struct page *page)
                if (file) {
                        gf = file->private_data;
                        if (test_bit(GFF_EXLOCK, &gf->f_flags))
-                               /* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+                               /* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
                                goto skip_lock;
                }
-               gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|GL_AOP, &gh);
+               gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
                do_unlock = 1;
-               error = gfs2_glock_nq_m_atime(1, &gh);
+               error = gfs2_glock_nq_atime(&gh);
                if (unlikely(error))
                        goto out_unlock;
        }
@@ -255,6 +280,10 @@ out:
        return error;
 out_unlock:
        unlock_page(page);
+       if (error == GLR_TRYFAILED) {
+               error = AOP_TRUNCATED_PAGE;
+               yield();
+       }
        if (do_unlock)
                gfs2_holder_uninit(&gh);
        goto out;
@@ -269,7 +298,7 @@ out_unlock:
  *    the page lock and the glock) and return having done no I/O. Its
  *    obviously not something we'd want to do on too regular a basis.
  *    Any I/O we ignore at this time will be done via readpage later.
- * 2. We have to handle stuffed files here too.
+ * 2. We don't handle stuffed files here; we let readpage do the honours.
  * 3. mpage_readpages() does most of the heavy lifting in the common case.
  * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
  * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
@@ -282,8 +311,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_holder gh;
-       unsigned page_idx;
-       int ret;
+       int ret = 0;
        int do_unlock = 0;
 
        if (likely(file != &gfs2_internal_file_sentinel)) {
@@ -293,38 +321,17 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
                                goto skip_lock;
                }
                gfs2_holder_init(ip->i_gl, LM_ST_SHARED,
-                                LM_FLAG_TRY_1CB|GL_ATIME|GL_AOP, &gh);
+                                LM_FLAG_TRY_1CB|GL_ATIME, &gh);
                do_unlock = 1;
-               ret = gfs2_glock_nq_m_atime(1, &gh);
+               ret = gfs2_glock_nq_atime(&gh);
                if (ret == GLR_TRYFAILED)
                        goto out_noerror;
                if (unlikely(ret))
                        goto out_unlock;
        }
 skip_lock:
-       if (gfs2_is_stuffed(ip)) {
-               struct pagevec lru_pvec;
-               pagevec_init(&lru_pvec, 0);
-               for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-                       struct page *page = list_entry(pages->prev, struct page, lru);
-                       prefetchw(&page->flags);
-                       list_del(&page->lru);
-                       if (!add_to_page_cache(page, mapping,
-                                              page->index, GFP_KERNEL)) {
-                               ret = stuffed_readpage(ip, page);
-                               unlock_page(page);
-                               if (!pagevec_add(&lru_pvec, page))
-                                        __pagevec_lru_add(&lru_pvec);
-                       } else {
-                               page_cache_release(page);
-                       }
-               }
-               pagevec_lru_add(&lru_pvec);
-               ret = 0;
-       } else {
-               /* What we really want to do .... */
+       if (!gfs2_is_stuffed(ip))
                ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-       }
 
        if (do_unlock) {
                gfs2_glock_dq_m(1, &gh);
@@ -343,39 +350,49 @@ out_unlock:
 }
 
 /**
- * gfs2_prepare_write - Prepare to write a page to a file
+ * gfs2_write_begin - Begin to write to a file
  * @file: The file to write to
- * @page: The page which is to be prepared for writing
- * @from: From (byte range within page)
- * @to: To (byte range within page)
+ * @mapping: The mapping in which to write
+ * @pos: The file offset at which to start writing
+ * @len: Length of the write
+ * @flags: Various flags
+ * @pagep: Pointer to return the page
+ * @fsdata: Pointer to return fs data (unused by GFS2)
  *
  * Returns: errno
  */
 
-static int gfs2_prepare_write(struct file *file, struct page *page,
-                             unsigned from, unsigned to)
+static int gfs2_write_begin(struct file *file, struct address_space *mapping,
+                           loff_t pos, unsigned len, unsigned flags,
+                           struct page **pagep, void **fsdata)
 {
-       struct gfs2_inode *ip = GFS2_I(page->mapping->host);
-       struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+       struct gfs2_inode *ip = GFS2_I(mapping->host);
+       struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
        unsigned int data_blocks, ind_blocks, rblocks;
        int alloc_required;
        int error = 0;
-       loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from;
-       loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
        struct gfs2_alloc *al;
-       unsigned int write_len = to - from;
-
-
-       gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|GL_AOP, &ip->i_gh);
-       error = gfs2_glock_nq_m_atime(1, &ip->i_gh);
-       if (error)
+       pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+       unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+       unsigned to = from + len;
+       struct page *page;
+
+       gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
+       error = gfs2_glock_nq_atime(&ip->i_gh);
+       if (unlikely(error))
                goto out_uninit;
 
-       gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks);
+       error = -ENOMEM;
+       page = __grab_cache_page(mapping, index);
+       *pagep = page;
+       if (!page)
+               goto out_unlock;
+
+       gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
 
-       error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required);
+       error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
        if (error)
-               goto out_unlock;
+               goto out_putpage;
 
 
        ip->i_alloc.al_requested = 0;
@@ -402,12 +419,13 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
        if (ind_blocks || data_blocks)
                rblocks += RES_STATFS + RES_QUOTA;
 
-       error = gfs2_trans_begin(sdp, rblocks, 0);
+       error = gfs2_trans_begin(sdp, rblocks,
+                                PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
        if (error)
-               goto out;
+               goto out_trans_fail;
 
        if (gfs2_is_stuffed(ip)) {
-               if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
+               if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
                        error = gfs2_unstuff_dinode(ip, page);
                        if (error == 0)
                                goto prepare_write;
@@ -422,6 +440,7 @@ prepare_write:
 out:
        if (error) {
                gfs2_trans_end(sdp);
+out_trans_fail:
                if (alloc_required) {
                        gfs2_inplace_release(ip);
 out_qunlock:
@@ -429,6 +448,10 @@ out_qunlock:
 out_alloc_put:
                        gfs2_alloc_put(ip);
                }
+out_putpage:
+               page_cache_release(page);
+               if (pos + len > ip->i_inode.i_size)
+                       vmtruncate(&ip->i_inode, ip->i_inode.i_size);
 out_unlock:
                gfs2_glock_dq_m(1, &ip->i_gh);
 out_uninit:
@@ -439,94 +462,175 @@ out_uninit:
 }
 
 /**
- * gfs2_commit_write - Commit write to a file
+ * adjust_fs_space - Adjusts the free space available due to gfs2_grow
+ * @inode: the rindex inode
+ */
+static void adjust_fs_space(struct inode *inode)
+{
+       struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+       struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+       struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+       u64 fs_total, new_free;
+
+       /* Total up the file system space, according to the latest rindex. */
+       fs_total = gfs2_ri_total(sdp);
+
+       spin_lock(&sdp->sd_statfs_spin);
+       if (fs_total > (m_sc->sc_total + l_sc->sc_total))
+               new_free = fs_total - (m_sc->sc_total + l_sc->sc_total);
+       else
+               new_free = 0;
+       spin_unlock(&sdp->sd_statfs_spin);
+       fs_warn(sdp, "File system extended by %llu blocks.\n",
+               (unsigned long long)new_free);
+       gfs2_statfs_change(sdp, new_free, new_free, 0);
+}
+
+/**
+ * gfs2_stuffed_write_end - Write end for stuffed files
+ * @inode: The inode
+ * @dibh: The buffer_head containing the on-disk inode
+ * @pos: The file position
+ * @len: The length of the write
+ * @copied: How much was actually copied by the VFS
+ * @page: The page
+ *
+ * This copies the data from the page into the inode block after
+ * the inode data structure itself.
+ *
+ * Returns: The number of bytes copied
+ */
+static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+                                 loff_t pos, unsigned len, unsigned copied,
+                                 struct page *page)
+{
+       struct gfs2_inode *ip = GFS2_I(inode);
+       struct gfs2_sbd *sdp = GFS2_SB(inode);
+       u64 to = pos + copied;
+       void *kaddr;
+       unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode);
+       struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+
+       BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode)));
+       kaddr = kmap_atomic(page, KM_USER0);
+       memcpy(buf + pos, kaddr + pos, copied);
+       memset(kaddr + pos + copied, 0, len - copied);
+       flush_dcache_page(page);
+       kunmap_atomic(kaddr, KM_USER0);
+
+       if (!PageUptodate(page))
+               SetPageUptodate(page);
+       unlock_page(page);
+       page_cache_release(page);
+
+       if (inode->i_size < to) {
+               i_size_write(inode, to);
+               ip->i_di.di_size = inode->i_size;
+               di->di_size = cpu_to_be64(inode->i_size);
+               mark_inode_dirty(inode);
+       }
+
+       if (inode == sdp->sd_rindex)
+               adjust_fs_space(inode);
+
+       brelse(dibh);
+       gfs2_trans_end(sdp);
+       gfs2_glock_dq(&ip->i_gh);
+       gfs2_holder_uninit(&ip->i_gh);
+       return copied;
+}
+
+/**
+ * gfs2_write_end
  * @file: The file to write to
- * @page: The page containing the data
- * @from: From (byte range within page)
- * @to: To (byte range within page)
+ * @mapping: The address space to write to
+ * @pos: The file position
+ * @len: The length of the data
+ * @copied:
+ * @page: The page that has been written
+ * @fsdata: The fsdata (unused in GFS2)
+ *
+ * The main write_end function for GFS2. We have a separate one for
+ * stuffed files as they are slightly different, otherwise we just
+ * put our locking around the VFS provided functions.
  *
  * Returns: errno
  */
 
-static int gfs2_commit_write(struct file *file, struct page *page,
-                            unsigned from, unsigned to)
+static int gfs2_write_end(struct file *file, struct address_space *mapping,
+                         loff_t pos, unsigned len, unsigned copied,
+                         struct page *page, void *fsdata)
 {
        struct inode *inode = page->mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
-       int error = -EOPNOTSUPP;
        struct buffer_head *dibh;
        struct gfs2_alloc *al = &ip->i_alloc;
        struct gfs2_dinode *di;
+       unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+       unsigned int to = from + len;
+       int ret;
 
-       if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl)))
-                goto fail_nounlock;
+       BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0);
 
-       error = gfs2_meta_inode_buffer(ip, &dibh);
-       if (error)
-               goto fail_endtrans;
+       ret = gfs2_meta_inode_buffer(ip, &dibh);
+       if (unlikely(ret)) {
+               unlock_page(page);
+               page_cache_release(page);
+               goto failed;
+       }
 
        gfs2_trans_add_bh(ip->i_gl, dibh, 1);
-       di = (struct gfs2_dinode *)dibh->b_data;
-
-       if (gfs2_is_stuffed(ip)) {
-               u64 file_size;
-               void *kaddr;
-
-               file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to;
 
-               kaddr = kmap_atomic(page, KM_USER0);
-               memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from,
-                      kaddr + from, to - from);
-               kunmap_atomic(kaddr, KM_USER0);
+       if (gfs2_is_stuffed(ip))
+               return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page);
 
-               SetPageUptodate(page);
+       if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+               gfs2_page_add_databufs(ip, page, from, to);
 
-               if (inode->i_size < file_size)
-                       i_size_write(inode, file_size);
-       } else {
-               if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED ||
-                   gfs2_is_jdata(ip))
-                       gfs2_page_add_databufs(ip, page, from, to);
-               error = generic_commit_write(file, page, from, to);
-               if (error)
-                       goto fail;
-       }
+       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 
-       if (ip->i_di.di_size < inode->i_size) {
-               ip->i_di.di_size = inode->i_size;
-               di->di_size = cpu_to_be64(inode->i_size);
+       if (likely(ret >= 0)) {
+               copied = ret;
+               if  ((pos + copied) > inode->i_size) {
+                       di = (struct gfs2_dinode *)dibh->b_data;
+                       ip->i_di.di_size = inode->i_size;
+                       di->di_size = cpu_to_be64(inode->i_size);
+                       mark_inode_dirty(inode);
+               }
        }
 
-       di->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
-       di->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
-       di->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+       if (inode == sdp->sd_rindex)
+               adjust_fs_space(inode);
 
        brelse(dibh);
        gfs2_trans_end(sdp);
+failed:
        if (al->al_requested) {
                gfs2_inplace_release(ip);
                gfs2_quota_unlock(ip);
                gfs2_alloc_put(ip);
        }
-       gfs2_glock_dq_m(1, &ip->i_gh);
+       gfs2_glock_dq(&ip->i_gh);
        gfs2_holder_uninit(&ip->i_gh);
-       return 0;
+       return ret;
+}
 
-fail:
-       brelse(dibh);
-fail_endtrans:
-       gfs2_trans_end(sdp);
-       if (al->al_requested) {
-               gfs2_inplace_release(ip);
-               gfs2_quota_unlock(ip);
-               gfs2_alloc_put(ip);
-       }
-       gfs2_glock_dq_m(1, &ip->i_gh);
-       gfs2_holder_uninit(&ip->i_gh);
-fail_nounlock:
-       ClearPageUptodate(page);
-       return error;
+/**
+ * gfs2_set_page_dirty - Page dirtying function
+ * @page: The page to dirty
+ *
+ * Returns: 1 if it dirtied the page, or 0 otherwise
+ */
+static int gfs2_set_page_dirty(struct page *page)
+{
+       struct gfs2_inode *ip = GFS2_I(page->mapping->host);
+       struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+
+       if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+               SetPageChecked(page);
+       return __set_page_dirty_buffers(page);
 }
 
 /**
@@ -556,56 +660,82 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
        return dblock;
 }
 
-static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
+static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
 {
        struct gfs2_bufdata *bd;
 
+       lock_buffer(bh);
        gfs2_log_lock(sdp);
+       clear_buffer_dirty(bh);
        bd = bh->b_private;
        if (bd) {
-               bd->bd_bh = NULL;
-               bh->b_private = NULL;
+               if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
+                       list_del_init(&bd->bd_le.le_list);
+               else
+                       gfs2_remove_from_journal(bh, current->journal_info, 0);
        }
-       gfs2_log_unlock(sdp);
-
-       lock_buffer(bh);
-       clear_buffer_dirty(bh);
        bh->b_bdev = NULL;
        clear_buffer_mapped(bh);
        clear_buffer_req(bh);
        clear_buffer_new(bh);
-       clear_buffer_delay(bh);
+       gfs2_log_unlock(sdp);
        unlock_buffer(bh);
 }
 
 static void gfs2_invalidatepage(struct page *page, unsigned long offset)
 {
        struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-       struct buffer_head *head, *bh, *next;
-       unsigned int curr_off = 0;
+       struct buffer_head *bh, *head;
+       unsigned long pos = 0;
 
        BUG_ON(!PageLocked(page));
+       if (offset == 0)
+               ClearPageChecked(page);
        if (!page_has_buffers(page))
-               return;
+               goto out;
 
        bh = head = page_buffers(page);
        do {
-               unsigned int next_off = curr_off + bh->b_size;
-               next = bh->b_this_page;
-
-               if (offset <= curr_off)
-                       discard_buffer(sdp, bh);
-
-               curr_off = next_off;
-               bh = next;
+               if (offset <= pos)
+                       gfs2_discard(sdp, bh);
+               pos += bh->b_size;
+               bh = bh->b_this_page;
        } while (bh != head);
-
-       if (!offset)
+out:
+       if (offset == 0)
                try_to_release_page(page, 0);
+}
 
-       return;
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+       /*
+        * Should we return an error here? I can't see that O_DIRECT for
+        * a journaled file makes any sense. For now we'll silently fall
+        * back to buffered I/O, likewise we do the same for stuffed
+        * files since they are (a) small and (b) unaligned.
+        */
+       if (gfs2_is_jdata(ip))
+               return 0;
+
+       if (gfs2_is_stuffed(ip))
+               return 0;
+
+       if (offset > i_size_read(&ip->i_inode))
+               return 0;
+       return 1;
 }
 
+
+
 static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
@@ -616,99 +746,32 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
        struct gfs2_holder gh;
        int rv;
 
-       if (rw == READ)
-               mutex_lock(&inode->i_mutex);
        /*
-        * Shared lock, even if its a write, since we do no allocation
-        * on this path. All we need change is atime.
+        * Deferred lock, even if it's a write, since we do no allocation
+        * on this path. All we need change is atime, and this lock mode
+        * ensures that other nodes have flushed their buffered read caches
+        * (i.e. their page cache entries for this inode). We do not,
+        * unfortunately have the option of only flushing a range like
+        * the VFS does.
         */
-       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
-       rv = gfs2_glock_nq_m_atime(1, &gh);
+       gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
+       rv = gfs2_glock_nq_atime(&gh);
        if (rv)
-               goto out;
-
-       if (offset > i_size_read(inode))
-               goto out;
-
-       /*
-        * Should we return an error here? I can't see that O_DIRECT for
-        * a journaled file makes any sense. For now we'll silently fall
-        * back to buffered I/O, likewise we do the same for stuffed
-        * files since they are (a) small and (b) unaligned.
-        */
-       if (gfs2_is_jdata(ip))
-               goto out;
-
-       if (gfs2_is_stuffed(ip))
-               goto out;
-
-       rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                                           inode->i_sb->s_bdev,
-                                           iov, offset, nr_segs,
-                                           gfs2_get_block_direct, NULL);
+               return rv;
+       rv = gfs2_ok_for_dio(ip, rw, offset);
+       if (rv != 1)
+               goto out; /* dio not valid, fall back to buffered i/o */
+
+       rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
+                                          iov, offset, nr_segs,
+                                          gfs2_get_block_direct, NULL);
 out:
        gfs2_glock_dq_m(1, &gh);
        gfs2_holder_uninit(&gh);
-       if (rw == READ)
-               mutex_unlock(&inode->i_mutex);
-
        return rv;
 }
 
 /**
- * stuck_releasepage - We're stuck in gfs2_releasepage().  Print stuff out.
- * @bh: the buffer we're stuck on
- *
- */
-
-static void stuck_releasepage(struct buffer_head *bh)
-{
-       struct inode *inode = bh->b_page->mapping->host;
-       struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-       struct gfs2_bufdata *bd = bh->b_private;
-       struct gfs2_glock *gl;
-static unsigned limit = 0;
-
-       if (limit > 3)
-               return;
-       limit++;
-
-       fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
-       fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
-               (unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
-       fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
-       fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
-
-       if (!bd)
-               return;
-
-       gl = bd->bd_gl;
-
-       fs_warn(sdp, "gl = (%u, %llu)\n",
-               gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
-
-       fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
-               (list_empty(&bd->bd_list_tr)) ? "no" : "yes",
-               (list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
-
-       if (gl->gl_ops == &gfs2_inode_glops) {
-               struct gfs2_inode *ip = gl->gl_object;
-               unsigned int x;
-
-               if (!ip)
-                       return;
-
-               fs_warn(sdp, "ip = %llu %llu\n",
-                       (unsigned long long)ip->i_num.no_formal_ino,
-                       (unsigned long long)ip->i_num.no_addr);
-
-               for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
-                       fs_warn(sdp, "ip->i_cache[%u] = %s\n",
-                               x, (ip->i_cache[x]) ? "!NULL" : "NULL");
-       }
-}
-
-/**
  * gfs2_releasepage - free the metadata associated with a page
  * @page: the page that's being released
  * @gfp_mask: passed from Linux VFS, ignored by us
@@ -725,38 +788,39 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
        struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
        struct buffer_head *bh, *head;
        struct gfs2_bufdata *bd;
-       unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
 
        if (!page_has_buffers(page))
-               goto out;
+               return 0;
 
+       gfs2_log_lock(sdp);
        head = bh = page_buffers(page);
        do {
-               while (atomic_read(&bh->b_count)) {
-                       if (!atomic_read(&aspace->i_writecount))
-                               return 0;
-
-                       if (time_after_eq(jiffies, t)) {
-                               stuck_releasepage(bh);
-                               /* should we withdraw here? */
-                               return 0;
-                       }
-
-                       yield();
-               }
-
+               if (atomic_read(&bh->b_count))
+                       goto cannot_release;
+               bd = bh->b_private;
+               if (bd && bd->bd_ail)
+                       goto cannot_release;
                gfs2_assert_warn(sdp, !buffer_pinned(bh));
                gfs2_assert_warn(sdp, !buffer_dirty(bh));
+               bh = bh->b_this_page;
+       } while(bh != head);
+       gfs2_log_unlock(sdp);
 
+       head = bh = page_buffers(page);
+       do {
                gfs2_log_lock(sdp);
                bd = bh->b_private;
                if (bd) {
                        gfs2_assert_warn(sdp, bd->bd_bh == bh);
                        gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
-                       gfs2_assert_warn(sdp, !bd->bd_ail);
-                       bd->bd_bh = NULL;
-                       if (!list_empty(&bd->bd_le.le_list))
-                               bd = NULL;
+                       if (!list_empty(&bd->bd_le.le_list)) {
+                               if (!buffer_pinned(bh))
+                                       list_del_init(&bd->bd_le.le_list);
+                               else
+                                       bd = NULL;
+                       }
+                       if (bd)
+                               bd->bd_bh = NULL;
                        bh->b_private = NULL;
                }
                gfs2_log_unlock(sdp);
@@ -766,17 +830,21 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
                bh = bh->b_this_page;
        } while (bh != head);
 
-out:
        return try_to_free_buffers(page);
+cannot_release:
+       gfs2_log_unlock(sdp);
+       return 0;
 }
 
 const struct address_space_operations gfs2_file_aops = {
        .writepage = gfs2_writepage,
+       .writepages = gfs2_writepages,
        .readpage = gfs2_readpage,
        .readpages = gfs2_readpages,
        .sync_page = block_sync_page,
-       .prepare_write = gfs2_prepare_write,
-       .commit_write = gfs2_commit_write,
+       .write_begin = gfs2_write_begin,
+       .write_end = gfs2_write_end,
+       .set_page_dirty = gfs2_set_page_dirty,
        .bmap = gfs2_bmap,
        .invalidatepage = gfs2_invalidatepage,
        .releasepage = gfs2_releasepage,