ocfs2/trivial: Use proper mask for 2 places in hearbeat.c

[safe/jmp/linux-2.6] / fs / ocfs2 / buffer_head_io.c
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c

index 9a24adf..d43d34a 100644 (file)
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,13 +39,25 @@
  
  #include "buffer_head_io.h"
  
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+       BH_NeedsValidate = BH_JBDPrivateStart,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
  int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
-                     struct inode *inode)
+                     struct ocfs2_caching_info *ci)
  {
         int ret = 0;
  
-       mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n",
-                  (unsigned long long)bh->b_blocknr, inode);
+       mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
+                  (unsigned long long)bh->b_blocknr, ci);
  
         BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
         BUG_ON(buffer_jbd(bh));
@@ -58,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
                 goto out;
         }
  
-       mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+       ocfs2_metadata_cache_io_lock(ci);
  
         lock_buffer(bh);
         set_buffer_uptodate(bh);
@@ -66,41 +78,123 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
         /* remove from dirty list before I/O. */
         clear_buffer_dirty(bh);
  
-       get_bh(bh); /* for end_buffer_write_sync() */                   
+       get_bh(bh); /* for end_buffer_write_sync() */
         bh->b_end_io = end_buffer_write_sync;
         submit_bh(WRITE, bh);
  
         wait_on_buffer(bh);
  
         if (buffer_uptodate(bh)) {
-               ocfs2_set_buffer_uptodate(inode, bh);
+               ocfs2_set_buffer_uptodate(ci, bh);
         } else {
                 /* We don't need to remove the clustered uptodate
                  * information for this bh as it's not marked locally
                  * uptodate. */
                 ret = -EIO;
-               brelse(bh);
+               put_bh(bh);
         }
  
-       mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+       ocfs2_metadata_cache_io_unlock(ci);
  out:
         mlog_exit(ret);
         return ret;
  }
  
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+                          unsigned int nr, struct buffer_head *bhs[])
+{
+       int status = 0;
+       unsigned int i;
+       struct buffer_head *bh;
+
+       if (!nr) {
+               mlog(ML_BH_IO, "No buffers will be read!\n");
+               goto bail;
+       }
+
+       for (i = 0 ; i < nr ; i++) {
+               if (bhs[i] == NULL) {
+                       bhs[i] = sb_getblk(osb->sb, block++);
+                       if (bhs[i] == NULL) {
+                               status = -EIO;
+                               mlog_errno(status);
+                               goto bail;
+                       }
+               }
+               bh = bhs[i];
+
+               if (buffer_jbd(bh)) {
+                       mlog(ML_BH_IO,
+                            "trying to sync read a jbd "
+                            "managed bh (blocknr = %llu), skipping\n",
+                            (unsigned long long)bh->b_blocknr);
+                       continue;
+               }
+
+               if (buffer_dirty(bh)) {
+                       /* This should probably be a BUG, or
+                        * at least return an error. */
+                       mlog(ML_ERROR,
+                            "trying to sync read a dirty "
+                            "buffer! (blocknr = %llu), skipping\n",
+                            (unsigned long long)bh->b_blocknr);
+                       continue;
+               }
+
+               lock_buffer(bh);
+               if (buffer_jbd(bh)) {
+                       mlog(ML_ERROR,
+                            "block %llu had the JBD bit set "
+                            "while I was in lock_buffer!",
+                            (unsigned long long)bh->b_blocknr);
+                       BUG();
+               }
+
+               clear_buffer_uptodate(bh);
+               get_bh(bh); /* for end_buffer_read_sync() */
+               bh->b_end_io = end_buffer_read_sync;
+               submit_bh(READ, bh);
+       }
+
+       for (i = nr; i > 0; i--) {
+               bh = bhs[i - 1];
+
+               /* No need to wait on the buffer if it's managed by JBD. */
+               if (!buffer_jbd(bh))
+                       wait_on_buffer(bh);
+
+               if (!buffer_uptodate(bh)) {
+                       /* Status won't be cleared from here on out,
+                        * so we can safely record this and loop back
+                        * to cleanup the other buffers. */
+                       status = -EIO;
+                       put_bh(bh);
+                       bhs[i - 1] = NULL;
+               }
+       }
+
+bail:
+       return status;
+}
+
+int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
                       struct buffer_head *bhs[], int flags,
-                     struct inode *inode)
+                     int (*validate)(struct super_block *sb,
+                                     struct buffer_head *bh))
  {
         int status = 0;
-       struct super_block *sb;
         int i, ignore_cache = 0;
         struct buffer_head *bh;
+       struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+
+       mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
+                  ci, (unsigned long long)block, nr, flags);
  
-       mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
-                  (unsigned long long)block, nr, flags, inode);
+       BUG_ON(!ci);
+       BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+              (flags & OCFS2_BH_IGNORE_CACHE));
  
-       if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+       if (bhs == NULL) {
                 status = -EINVAL;
                 mlog_errno(status);
                 goto bail;
@@ -119,47 +213,63 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                 goto bail;
         }
  
-       sb = osb->sb;
-
-       if (flags & OCFS2_BH_CACHED && !inode)
-               flags &= ~OCFS2_BH_CACHED;
-
-       if (inode)
-               mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+       ocfs2_metadata_cache_io_lock(ci);
         for (i = 0 ; i < nr ; i++) {
                 if (bhs[i] == NULL) {
                         bhs[i] = sb_getblk(sb, block++);
                         if (bhs[i] == NULL) {
-                               if (inode)
-                                       mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+                               ocfs2_metadata_cache_io_unlock(ci);
                                 status = -EIO;
                                 mlog_errno(status);
                                 goto bail;
                         }
                 }
                 bh = bhs[i];
-               ignore_cache = 0;
-
-               if (flags & OCFS2_BH_CACHED &&
-                   !ocfs2_buffer_uptodate(inode, bh)) {
+               ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
+
+               /* There are three read-ahead cases here which we need to
+                * be concerned with. All three assume a buffer has
+                * previously been submitted with OCFS2_BH_READAHEAD
+                * and it hasn't yet completed I/O.
+                *
+                * 1) The current request is sync to disk. This rarely
+                *    happens these days, and never when performance
+                *    matters - the code can just wait on the buffer
+                *    lock and re-submit.
+                *
+                * 2) The current request is cached, but not
+                *    readahead. ocfs2_buffer_uptodate() will return
+                *    false anyway, so we'll wind up waiting on the
+                *    buffer lock to do I/O. We re-check the request
+                *    with after getting the lock to avoid a re-submit.
+                *
+                * 3) The current request is readahead (and so must
+                *    also be a caching one). We short circuit if the
+                *    buffer is locked (under I/O) and if it's in the
+                *    uptodate cache. The re-check from #2 catches the
+                *    case that the previous read-ahead completes just
+                *    before our is-it-in-flight check.
+                */
+
+               if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
                         mlog(ML_UPTODATE,
-                            "bh (%llu), inode %llu not uptodate\n",
+                            "bh (%llu), owner %llu not uptodate\n",
                              (unsigned long long)bh->b_blocknr,
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno);
+                            (unsigned long long)ocfs2_metadata_cache_owner(ci));
+                       /* We're using ignore_cache here to say
+                        * "go to disk" */
                         ignore_cache = 1;
                 }
  
-               /* XXX: Can we ever get this and *not* have the cached
-                * flag set? */
                 if (buffer_jbd(bh)) {
-                       if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+                       if (ignore_cache)
                                 mlog(ML_BH_IO, "trying to sync read a jbd "
                                                "managed bh (blocknr = %llu)\n",
                                      (unsigned long long)bh->b_blocknr);
                         continue;
                 }
  
-               if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+               if (ignore_cache) {
                         if (buffer_dirty(bh)) {
                                 /* This should probably be a BUG, or
                                  * at least return an error. */
@@ -169,6 +279,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                 continue;
                         }
  
+                       /* A read-ahead request was made - if the
+                        * buffer is already under read-ahead from a
+                        * previously submitted request than we are
+                        * done here. */
+                       if ((flags & OCFS2_BH_READAHEAD)
+                           && ocfs2_buffer_read_ahead(ci, bh))
+                               continue;
+
                         lock_buffer(bh);
                         if (buffer_jbd(bh)) {
  #ifdef CATCH_BH_JBD_RACES
@@ -181,13 +299,24 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                 continue;
  #endif
                         }
+
+                       /* Re-check ocfs2_buffer_uptodate() as a
+                        * previously read-ahead buffer may have
+                        * completed I/O while we were waiting for the
+                        * buffer lock. */
+                       if (!(flags & OCFS2_BH_IGNORE_CACHE)
+                           && !(flags & OCFS2_BH_READAHEAD)
+                           && ocfs2_buffer_uptodate(ci, bh)) {
+                               unlock_buffer(bh);
+                               continue;
+                       }
+
                         clear_buffer_uptodate(bh);
                         get_bh(bh); /* for end_buffer_read_sync() */
+                       if (validate)
+                               set_buffer_needs_validate(bh);
                         bh->b_end_io = end_buffer_read_sync;
-                       if (flags & OCFS2_BH_READAHEAD)
-                               submit_bh(READA, bh);
-                       else
-                               submit_bh(READ, bh);
+                       submit_bh(READ, bh);
                         continue;
                 }
         }
@@ -197,37 +326,116 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
         for (i = (nr - 1); i >= 0; i--) {
                 bh = bhs[i];
  
-               /* We know this can't have changed as we hold the
-                * inode sem. Avoid doing any work on the bh if the
-                * journal has it. */
-               if (!buffer_jbd(bh))
-                       wait_on_buffer(bh);
+               if (!(flags & OCFS2_BH_READAHEAD)) {
+                       /* We know this can't have changed as we hold the
+                        * owner sem. Avoid doing any work on the bh if the
+                        * journal has it. */
+                       if (!buffer_jbd(bh))
+                               wait_on_buffer(bh);
+
+                       if (!buffer_uptodate(bh)) {
+                               /* Status won't be cleared from here on out,
+                                * so we can safely record this and loop back
+                                * to cleanup the other buffers. Don't need to
+                                * remove the clustered uptodate information
+                                * for this bh as it's not marked locally
+                                * uptodate. */
+                               status = -EIO;
+                               put_bh(bh);
+                               bhs[i] = NULL;
+                               continue;
+                       }
  
-               if (!buffer_uptodate(bh)) {
-                       /* Status won't be cleared from here on out,
-                        * so we can safely record this and loop back
-                        * to cleanup the other buffers. Don't need to
-                        * remove the clustered uptodate information
-                        * for this bh as it's not marked locally
-                        * uptodate. */
-                       status = -EIO;
-                       brelse(bh);
-                       bhs[i] = NULL;
-                       continue;
+                       if (buffer_needs_validate(bh)) {
+                               /* We never set NeedsValidate if the
+                                * buffer was held by the journal, so
+                                * that better not have changed */
+                               BUG_ON(buffer_jbd(bh));
+                               clear_buffer_needs_validate(bh);
+                               status = validate(sb, bh);
+                               if (status) {
+                                       put_bh(bh);
+                                       bhs[i] = NULL;
+                                       continue;
+                               }
+                       }
                 }
  
-               if (inode)
-                       ocfs2_set_buffer_uptodate(inode, bh);
+               /* Always set the buffer in the cache, even if it was
+                * a forced read, or read-ahead which hasn't yet
+                * completed. */
+               ocfs2_set_buffer_uptodate(ci, bh);
         }
-       if (inode)
-               mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+       ocfs2_metadata_cache_io_unlock(ci);
  
-       mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", 
+       mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
              (unsigned long long)block, nr,
-            (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+            ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+            flags);
  
  bail:
  
         mlog_exit(status);
         return status;
  }
+
+/* Check whether the blkno is the super block or one of the backups. */
+static void ocfs2_check_super_or_backup(struct super_block *sb,
+                                       sector_t blkno)
+{
+       int i;
+       u64 backup_blkno;
+
+       if (blkno == OCFS2_SUPER_BLOCK_BLKNO)
+               return;
+
+       for (i = 0; i < OCFS2_MAX_BACKUP_SUPERBLOCKS; i++) {
+               backup_blkno = ocfs2_backup_super_blkno(sb, i);
+               if (backup_blkno == blkno)
+                       return;
+       }
+
+       BUG();
+}
+
+/*
+ * Write super block and backups doesn't need to collaborate with journal,
+ * so we don't need to lock ip_io_mutex and ci doesn't need to bea passed
+ * into this function.
+ */
+int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
+                               struct buffer_head *bh)
+{
+       int ret = 0;
+
+       mlog_entry_void();
+
+       BUG_ON(buffer_jbd(bh));
+       ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
+
+       if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
+               ret = -EROFS;
+               goto out;
+       }
+
+       lock_buffer(bh);
+       set_buffer_uptodate(bh);
+
+       /* remove from dirty list before I/O. */
+       clear_buffer_dirty(bh);
+
+       get_bh(bh); /* for end_buffer_write_sync() */
+       bh->b_end_io = end_buffer_write_sync;
+       submit_bh(WRITE, bh);
+
+       wait_on_buffer(bh);
+
+       if (!buffer_uptodate(bh)) {
+               ret = -EIO;
+               put_bh(bh);
+       }
+
+out:
+       mlog_exit(ret);
+       return ret;
+}