xfs: add log item recovery tracing
[safe/jmp/linux-2.6] / fs / xfs / xfs_log_recover.c
index bf8573b..f21eb8a 100644 (file)
@@ -36,7 +36,6 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
-#include "xfs_imap.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_log_priv.h"
 #include "xfs_quota.h"
 #include "xfs_rw.h"
 #include "xfs_utils.h"
+#include "xfs_trace.h"
 
 STATIC int     xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
 STATIC int     xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
-STATIC void    xlog_recover_insert_item_backq(xlog_recover_item_t **q,
-                                              xlog_recover_item_t *item);
 #if defined(DEBUG)
 STATIC void    xlog_recover_check_summary(xlog_t *);
 #else
@@ -68,34 +66,57 @@ STATIC void xlog_recover_check_summary(xlog_t *);
        ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno)  ((bno) & ~(log)->l_sectbb_mask)
 
-xfs_buf_t *
+STATIC xfs_buf_t *
 xlog_get_bp(
        xlog_t          *log,
-       int             num_bblks)
+       int             nbblks)
 {
-       ASSERT(num_bblks > 0);
+       if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+               xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+               XFS_ERROR_REPORT("xlog_get_bp(1)",
+                                XFS_ERRLEVEL_HIGH, log->l_mp);
+               return NULL;
+       }
 
        if (log->l_sectbb_log) {
-               if (num_bblks > 1)
-                       num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-               num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
+               if (nbblks > 1)
+                       nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+               nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
        }
-       return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
+       return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
-void
+STATIC void
 xlog_put_bp(
        xfs_buf_t       *bp)
 {
        xfs_buf_free(bp);
 }
 
+STATIC xfs_caddr_t
+xlog_align(
+       xlog_t          *log,
+       xfs_daddr_t     blk_no,
+       int             nbblks,
+       xfs_buf_t       *bp)
+{
+       xfs_caddr_t     ptr;
+
+       if (!log->l_sectbb_log)
+               return XFS_BUF_PTR(bp);
+
+       ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
+       ASSERT(XFS_BUF_SIZE(bp) >=
+               BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
+       return ptr;
+}
+
 
 /*
  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
  */
-int
-xlog_bread(
+STATIC int
+xlog_bread_noalign(
        xlog_t          *log,
        xfs_daddr_t     blk_no,
        int             nbblks,
@@ -103,6 +124,13 @@ xlog_bread(
 {
        int             error;
 
+       if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+               xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+               XFS_ERROR_REPORT("xlog_bread(1)",
+                                XFS_ERRLEVEL_HIGH, log->l_mp);
+               return EFSCORRUPTED;
+       }
+
        if (log->l_sectbb_log) {
                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -126,6 +154,24 @@ xlog_bread(
        return error;
 }
 
+STATIC int
+xlog_bread(
+       xlog_t          *log,
+       xfs_daddr_t     blk_no,
+       int             nbblks,
+       xfs_buf_t       *bp,
+       xfs_caddr_t     *offset)
+{
+       int             error;
+
+       error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+       if (error)
+               return error;
+
+       *offset = xlog_align(log, blk_no, nbblks, bp);
+       return 0;
+}
+
 /*
  * Write out the buffer at the given block for the given number of blocks.
  * The buffer is kept locked across the write and is returned locked.
@@ -140,6 +186,13 @@ xlog_bwrite(
 {
        int             error;
 
+       if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+               xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+               XFS_ERROR_REPORT("xlog_bwrite(1)",
+                                XFS_ERRLEVEL_HIGH, log->l_mp);
+               return EFSCORRUPTED;
+       }
+
        if (log->l_sectbb_log) {
                blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
                nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -162,24 +215,6 @@ xlog_bwrite(
        return error;
 }
 
-STATIC xfs_caddr_t
-xlog_align(
-       xlog_t          *log,
-       xfs_daddr_t     blk_no,
-       int             nbblks,
-       xfs_buf_t       *bp)
-{
-       xfs_caddr_t     ptr;
-
-       if (!log->l_sectbb_log)
-               return XFS_BUF_PTR(bp);
-
-       ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
-       ASSERT(XFS_BUF_SIZE(bp) >=
-               BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
-       return ptr;
-}
-
 #ifdef DEBUG
 /*
  * dump debug superblock and log record information
@@ -189,16 +224,10 @@ xlog_header_check_dump(
        xfs_mount_t             *mp,
        xlog_rec_header_t       *head)
 {
-       int                     b;
-
-       cmn_err(CE_DEBUG, "%s:  SB : uuid = ", __func__);
-       for (b = 0; b < 16; b++)
-               cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
-       cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
-       cmn_err(CE_DEBUG, "    log : uuid = ");
-       for (b = 0; b < 16; b++)
-               cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
-       cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
+       cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
+               __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
+       cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
+               &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 }
 #else
 #define xlog_header_check_dump(mp, head)
@@ -268,21 +297,16 @@ STATIC void
 xlog_recover_iodone(
        struct xfs_buf  *bp)
 {
-       xfs_mount_t     *mp;
-
-       ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
-
        if (XFS_BUF_GETERROR(bp)) {
                /*
                 * We're not going to bother about retrying
                 * this during recovery. One strike!
                 */
-               mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
                xfs_ioerror_alert("xlog_recover_iodone",
-                                 mp, bp, XFS_BUF_ADDR(bp));
-               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+                                 bp->b_mount, bp, XFS_BUF_ADDR(bp));
+               xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
        }
-       XFS_BUF_SET_FSPRIVATE(bp, NULL);
+       bp->b_mount = NULL;
        XFS_BUF_CLR_IODONE_FUNC(bp);
        xfs_biodone(bp);
 }
@@ -308,9 +332,9 @@ xlog_find_cycle_start(
 
        mid_blk = BLK_AVG(first_blk, *last_blk);
        while (mid_blk != first_blk && mid_blk != *last_blk) {
-               if ((error = xlog_bread(log, mid_blk, 1, bp)))
+               error = xlog_bread(log, mid_blk, 1, bp, &offset);
+               if (error)
                        return error;
-               offset = xlog_align(log, mid_blk, 1, bp);
                mid_cycle = xlog_get_cycle(offset);
                if (mid_cycle == cycle) {
                        *last_blk = mid_blk;
@@ -366,10 +390,10 @@ xlog_find_verify_cycle(
 
                bcount = min(bufblks, (start_blk + nbblks - i));
 
-               if ((error = xlog_bread(log, i, bcount, bp)))
+               error = xlog_bread(log, i, bcount, bp, &buf);
+               if (error)
                        goto out;
 
-               buf = xlog_align(log, i, bcount, bp);
                for (j = 0; j < bcount; j++) {
                        cycle = xlog_get_cycle(buf);
                        if (cycle == stop_on_cycle_no) {
@@ -423,9 +447,9 @@ xlog_find_verify_log_record(
                        return ENOMEM;
                smallmem = 1;
        } else {
-               if ((error = xlog_bread(log, start_blk, num_blks, bp)))
+               error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+               if (error)
                        goto out;
-               offset = xlog_align(log, start_blk, num_blks, bp);
                offset += ((num_blks - 1) << BBSHIFT);
        }
 
@@ -440,9 +464,9 @@ xlog_find_verify_log_record(
                }
 
                if (smallmem) {
-                       if ((error = xlog_bread(log, i, 1, bp)))
+                       error = xlog_bread(log, i, 1, bp, &offset);
+                       if (error)
                                goto out;
-                       offset = xlog_align(log, i, 1, bp);
                }
 
                head = (xlog_rec_header_t *)offset;
@@ -546,15 +570,18 @@ xlog_find_head(
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
-       if ((error = xlog_bread(log, 0, 1, bp)))
+
+       error = xlog_bread(log, 0, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, 0, 1, bp);
+
        first_half_cycle = xlog_get_cycle(offset);
 
        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
-       if ((error = xlog_bread(log, last_blk, 1, bp)))
+       error = xlog_bread(log, last_blk, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, last_blk, 1, bp);
+
        last_half_cycle = xlog_get_cycle(offset);
        ASSERT(last_half_cycle != 0);
 
@@ -776,7 +803,7 @@ xlog_find_head(
  * We could speed up search by using current head_blk buffer, but it is not
  * available.
  */
-int
+STATIC int
 xlog_find_tail(
        xlog_t                  *log,
        xfs_daddr_t             *head_blk,
@@ -804,9 +831,10 @@ xlog_find_tail(
        if (!bp)
                return ENOMEM;
        if (*head_blk == 0) {                           /* special case */
-               if ((error = xlog_bread(log, 0, 1, bp)))
+               error = xlog_bread(log, 0, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               offset = xlog_align(log, 0, 1, bp);
+
                if (xlog_get_cycle(offset) == 0) {
                        *tail_blk = 0;
                        /* leave all other log inited values alone */
@@ -819,9 +847,10 @@ xlog_find_tail(
         */
        ASSERT(*head_blk < INT_MAX);
        for (i = (int)(*head_blk) - 1; i >= 0; i--) {
-               if ((error = xlog_bread(log, i, 1, bp)))
+               error = xlog_bread(log, i, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               offset = xlog_align(log, i, 1, bp);
+
                if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
                        found = 1;
                        break;
@@ -835,9 +864,10 @@ xlog_find_tail(
         */
        if (!found) {
                for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
-                       if ((error = xlog_bread(log, i, 1, bp)))
+                       error = xlog_bread(log, i, 1, bp, &offset);
+                       if (error)
                                goto bread_err;
-                       offset = xlog_align(log, i, 1, bp);
+
                        if (XLOG_HEADER_MAGIC_NUM ==
                            be32_to_cpu(*(__be32 *)offset)) {
                                found = 2;
@@ -909,10 +939,10 @@ xlog_find_tail(
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
-               if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
+               error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+               if (error)
                        goto bread_err;
-               }
-               offset = xlog_align(log, umount_data_blk, 1, bp);
+
                op_head = (xlog_op_header_t *)offset;
                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
                        /*
@@ -1004,9 +1034,10 @@ xlog_find_zeroed(
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
-       if ((error = xlog_bread(log, 0, 1, bp)))
+       error = xlog_bread(log, 0, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, 0, 1, bp);
+
        first_cycle = xlog_get_cycle(offset);
        if (first_cycle == 0) {         /* completely zeroed log */
                *blk_no = 0;
@@ -1015,9 +1046,10 @@ xlog_find_zeroed(
        }
 
        /* check partially zeroed log */
-       if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
+       error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+       if (error)
                goto bp_err;
-       offset = xlog_align(log, log_bbnum-1, 1, bp);
+
        last_cycle = xlog_get_cycle(offset);
        if (last_cycle != 0) {          /* log completely written to */
                xlog_put_bp(bp);
@@ -1139,10 +1171,10 @@ xlog_write_log_records(
         */
        balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
        if (balign != start_block) {
-               if ((error = xlog_bread(log, start_block, 1, bp))) {
-                       xlog_put_bp(bp);
-                       return error;
-               }
+               error = xlog_bread_noalign(log, start_block, 1, bp);
+               if (error)
+                       goto out_put_bp;
+
                j = start_block - balign;
        }
 
@@ -1162,10 +1194,14 @@ xlog_write_log_records(
                        balign = BBTOB(ealign - start_block);
                        error = XFS_BUF_SET_PTR(bp, offset + balign,
                                                BBTOB(sectbb));
-                       if (!error)
-                               error = xlog_bread(log, ealign, sectbb, bp);
-                       if (!error)
-                               error = XFS_BUF_SET_PTR(bp, offset, bufblks);
+                       if (error)
+                               break;
+
+                       error = xlog_bread_noalign(log, ealign, sectbb, bp);
+                       if (error)
+                               break;
+
+                       error = XFS_BUF_SET_PTR(bp, offset, bufblks);
                        if (error)
                                break;
                }
@@ -1182,6 +1218,8 @@ xlog_write_log_records(
                start_block += endcount;
                j = 0;
        }
+
+ out_put_bp:
        xlog_put_bp(bp);
        return error;
 }
@@ -1327,40 +1365,50 @@ xlog_clear_stale_blocks(
 
 STATIC xlog_recover_t *
 xlog_recover_find_tid(
-       xlog_recover_t          *q,
+       struct hlist_head       *head,
        xlog_tid_t              tid)
 {
-       xlog_recover_t          *p = q;
+       xlog_recover_t          *trans;
+       struct hlist_node       *n;
 
-       while (p != NULL) {
-               if (p->r_log_tid == tid)
-                   break;
-               p = p->r_next;
+       hlist_for_each_entry(trans, n, head, r_list) {
+               if (trans->r_log_tid == tid)
+                       return trans;
        }
-       return p;
+       return NULL;
 }
 
 STATIC void
-xlog_recover_put_hashq(
-       xlog_recover_t          **q,
-       xlog_recover_t          *trans)
+xlog_recover_new_tid(
+       struct hlist_head       *head,
+       xlog_tid_t              tid,
+       xfs_lsn_t               lsn)
 {
-       trans->r_next = *q;
-       *q = trans;
+       xlog_recover_t          *trans;
+
+       trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
+       trans->r_log_tid   = tid;
+       trans->r_lsn       = lsn;
+       INIT_LIST_HEAD(&trans->r_itemq);
+
+       INIT_HLIST_NODE(&trans->r_list);
+       hlist_add_head(&trans->r_list, head);
 }
 
 STATIC void
 xlog_recover_add_item(
-       xlog_recover_item_t     **itemq)
+       struct list_head        *head)
 {
        xlog_recover_item_t     *item;
 
        item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
-       xlog_recover_insert_item_backq(itemq, item);
+       INIT_LIST_HEAD(&item->ri_list);
+       list_add_tail(&item->ri_list, head);
 }
 
 STATIC int
 xlog_recover_add_to_cont_trans(
+       struct log              *log,
        xlog_recover_t          *trans,
        xfs_caddr_t             dp,
        int                     len)
@@ -1369,8 +1417,7 @@ xlog_recover_add_to_cont_trans(
        xfs_caddr_t             ptr, old_ptr;
        int                     old_len;
 
-       item = trans->r_itemq;
-       if (item == NULL) {
+       if (list_empty(&trans->r_itemq)) {
                /* finish copying rest of trans header */
                xlog_recover_add_item(&trans->r_itemq);
                ptr = (xfs_caddr_t) &trans->r_theader +
@@ -1378,7 +1425,8 @@ xlog_recover_add_to_cont_trans(
                memcpy(ptr, dp, len); /* d, s, l */
                return 0;
        }
-       item = item->ri_prev;
+       /* take the tail entry */
+       item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
 
        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;
@@ -1387,6 +1435,7 @@ xlog_recover_add_to_cont_trans(
        memcpy(&ptr[old_len], dp, len); /* d, s, l */
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+       trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
        return 0;
 }
 
@@ -1405,6 +1454,7 @@ xlog_recover_add_to_cont_trans(
  */
 STATIC int
 xlog_recover_add_to_trans(
+       struct log              *log,
        xlog_recover_t          *trans,
        xfs_caddr_t             dp,
        int                     len)
@@ -1415,8 +1465,7 @@ xlog_recover_add_to_trans(
 
        if (!len)
                return 0;
-       item = trans->r_itemq;
-       if (item == NULL) {
+       if (list_empty(&trans->r_itemq)) {
                /* we need to catch log corruptions here */
                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
                        xlog_warn("XFS: xlog_recover_add_to_trans: "
@@ -1434,117 +1483,67 @@ xlog_recover_add_to_trans(
        memcpy(ptr, dp, len);
        in_f = (xfs_inode_log_format_t *)ptr;
 
-       if (item->ri_prev->ri_total != 0 &&
-            item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
+       /* take the tail entry */
+       item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
+       if (item->ri_total != 0 &&
+            item->ri_total == item->ri_cnt) {
+               /* tail item is in use, get a new one */
                xlog_recover_add_item(&trans->r_itemq);
+               item = list_entry(trans->r_itemq.prev,
+                                       xlog_recover_item_t, ri_list);
        }
-       item = trans->r_itemq;
-       item = item->ri_prev;
 
        if (item->ri_total == 0) {              /* first region to be added */
-               item->ri_total  = in_f->ilf_size;
-               ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
-               item->ri_buf = kmem_zalloc((item->ri_total *
-                                           sizeof(xfs_log_iovec_t)), KM_SLEEP);
+               if (in_f->ilf_size == 0 ||
+                   in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+                       xlog_warn(
+       "XFS: bad number of regions (%d) in inode log format",
+                                 in_f->ilf_size);
+                       ASSERT(0);
+                       return XFS_ERROR(EIO);
+               }
+
+               item->ri_total = in_f->ilf_size;
+               item->ri_buf =
+                       kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+                                   KM_SLEEP);
        }
        ASSERT(item->ri_total > item->ri_cnt);
        /* Description region is ri_buf[0] */
        item->ri_buf[item->ri_cnt].i_addr = ptr;
        item->ri_buf[item->ri_cnt].i_len  = len;
        item->ri_cnt++;
+       trace_xfs_log_recover_item_add(log, trans, item, 0);
        return 0;
 }
 
-STATIC void
-xlog_recover_new_tid(
-       xlog_recover_t          **q,
-       xlog_tid_t              tid,
-       xfs_lsn_t               lsn)
-{
-       xlog_recover_t          *trans;
-
-       trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
-       trans->r_log_tid   = tid;
-       trans->r_lsn       = lsn;
-       xlog_recover_put_hashq(q, trans);
-}
-
-STATIC int
-xlog_recover_unlink_tid(
-       xlog_recover_t          **q,
-       xlog_recover_t          *trans)
-{
-       xlog_recover_t          *tp;
-       int                     found = 0;
-
-       ASSERT(trans != NULL);
-       if (trans == *q) {
-               *q = (*q)->r_next;
-       } else {
-               tp = *q;
-               while (tp) {
-                       if (tp->r_next == trans) {
-                               found = 1;
-                               break;
-                       }
-                       tp = tp->r_next;
-               }
-               if (!found) {
-                       xlog_warn(
-                            "XFS: xlog_recover_unlink_tid: trans not found");
-                       ASSERT(0);
-                       return XFS_ERROR(EIO);
-               }
-               tp->r_next = tp->r_next->r_next;
-       }
-       return 0;
-}
-
-STATIC void
-xlog_recover_insert_item_backq(
-       xlog_recover_item_t     **q,
-       xlog_recover_item_t     *item)
-{
-       if (*q == NULL) {
-               item->ri_prev = item->ri_next = item;
-               *q = item;
-       } else {
-               item->ri_next           = *q;
-               item->ri_prev           = (*q)->ri_prev;
-               (*q)->ri_prev           = item;
-               item->ri_prev->ri_next  = item;
-       }
-}
-
-STATIC void
-xlog_recover_insert_item_frontq(
-       xlog_recover_item_t     **q,
-       xlog_recover_item_t     *item)
-{
-       xlog_recover_insert_item_backq(q, item);
-       *q = item;
-}
-
+/*
+ * Sort the log items in the transaction. Buffer items that are not
+ * cancelled are moved to the head of the item list, ahead of everything
+ * else; cancelled buffers and all other recognised item types are moved
+ * to the tail. An unrecognised item type fails the sort with EIO.
+ */
 STATIC int
 xlog_recover_reorder_trans(
-       xlog_recover_t          *trans)
+       struct log              *log,
+       xlog_recover_t          *trans,
+       int                     pass)
 {
-       xlog_recover_item_t     *first_item, *itemq, *itemq_next;
-       xfs_buf_log_format_t    *buf_f;
-       ushort                  flags = 0;
+       xlog_recover_item_t     *item, *n;
+       LIST_HEAD(sort_list);
+
+       list_splice_init(&trans->r_itemq, &sort_list);
+       list_for_each_entry_safe(item, n, &sort_list, ri_list) {
+               xfs_buf_log_format_t    *buf_f;
 
-       first_item = itemq = trans->r_itemq;
-       trans->r_itemq = NULL;
-       do {
-               itemq_next = itemq->ri_next;
-               buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
+               buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
 
-               switch (ITEM_TYPE(itemq)) {
+               switch (ITEM_TYPE(item)) {
                case XFS_LI_BUF:
-                       flags = buf_f->blf_flags;
-                       if (!(flags & XFS_BLI_CANCEL)) {
-                               xlog_recover_insert_item_frontq(&trans->r_itemq,
-                                                               itemq);
+                       if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+                               trace_xfs_log_recover_item_reorder_head(log,
+                                                       trans, item, pass);
+                               list_move(&item->ri_list, &trans->r_itemq);
                                break;
                        }
                case XFS_LI_INODE:
@@ -1552,7 +1551,9 @@ xlog_recover_reorder_trans(
                case XFS_LI_QUOTAOFF:
                case XFS_LI_EFD:
                case XFS_LI_EFI:
-                       xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
+                       trace_xfs_log_recover_item_reorder_tail(log,
+                                                       trans, item, pass);
+                       list_move_tail(&item->ri_list, &trans->r_itemq);
                        break;
                default:
                        xlog_warn(
@@ -1560,8 +1561,8 @@ xlog_recover_reorder_trans(
                        ASSERT(0);
                        return XFS_ERROR(EIO);
                }
-               itemq = itemq_next;
-       } while (first_item != itemq);
+       }
+       ASSERT(list_empty(&sort_list));
        return 0;
 }
 
@@ -1601,8 +1602,10 @@ xlog_recover_do_buffer_pass1(
        /*
         * If this isn't a cancel buffer item, then just return.
         */
-       if (!(flags & XFS_BLI_CANCEL))
+       if (!(flags & XFS_BLI_CANCEL)) {
+               trace_xfs_log_recover_buf_not_cancel(log, buf_f);
                return;
+       }
 
        /*
         * Insert an xfs_buf_cancel record into the hash table of
@@ -1636,6 +1639,7 @@ xlog_recover_do_buffer_pass1(
        while (nextp != NULL) {
                if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
                        nextp->bc_refcount++;
+                       trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
                        return;
                }
                prevp = nextp;
@@ -1649,6 +1653,7 @@ xlog_recover_do_buffer_pass1(
        bcp->bc_refcount = 1;
        bcp->bc_next = NULL;
        prevp->bc_next = bcp;
+       trace_xfs_log_recover_buf_cancel_add(log, buf_f);
 }
 
 /*
@@ -1788,6 +1793,8 @@ xlog_recover_do_inode_buffer(
        unsigned int            *data_map = NULL;
        unsigned int            map_size = 0;
 
+       trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
+
        switch (buf_f->blf_type) {
        case XFS_LI_BUF:
                data_map = buf_f->blf_data_map;
@@ -1883,6 +1890,7 @@ xlog_recover_do_inode_buffer(
 /*ARGSUSED*/
 STATIC void
 xlog_recover_do_reg_buffer(
+       struct xfs_mount        *mp,
        xlog_recover_item_t     *item,
        xfs_buf_t               *bp,
        xfs_buf_log_format_t    *buf_f)
@@ -1894,6 +1902,8 @@ xlog_recover_do_reg_buffer(
        unsigned int            map_size = 0;
        int                     error;
 
+       trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
+
        switch (buf_f->blf_type) {
        case XFS_LI_BUF:
                data_map = buf_f->blf_data_map;
@@ -1921,16 +1931,30 @@ xlog_recover_do_reg_buffer(
                error = 0;
                if (buf_f->blf_flags &
                   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+                       if (item->ri_buf[i].i_addr == NULL) {
+                               cmn_err(CE_ALERT,
+                                       "XFS: NULL dquot in %s.", __func__);
+                               goto next;
+                       }
+                       if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
+                               cmn_err(CE_ALERT,
+                                       "XFS: dquot too small (%d) in %s.",
+                                       item->ri_buf[i].i_len, __func__);
+                               goto next;
+                       }
                        error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
                                               item->ri_buf[i].i_addr,
                                               -1, 0, XFS_QMOPT_DOWARN,
                                               "dquot_buf_recover");
+                       if (error)
+                               goto next;
                }
-               if (!error)
-                       memcpy(xfs_buf_offset(bp,
-                               (uint)bit << XFS_BLI_SHIFT),    /* dest */
-                               item->ri_buf[i].i_addr,         /* source */
-                               nbits<<XFS_BLI_SHIFT);          /* length */
+
+               memcpy(xfs_buf_offset(bp,
+                       (uint)bit << XFS_BLI_SHIFT),    /* dest */
+                       item->ri_buf[i].i_addr,         /* source */
+                       nbits<<XFS_BLI_SHIFT);          /* length */
+ next:
                i++;
                bit += nbits;
        }
@@ -2078,6 +2102,8 @@ xlog_recover_do_dquot_buffer(
 {
        uint                    type;
 
+       trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
+
        /*
         * Filesystems are required to send in quota flags at mount time.
         */
@@ -2098,7 +2124,7 @@ xlog_recover_do_dquot_buffer(
        if (log->l_quotaoffs_flag & type)
                return;
 
-       xlog_recover_do_reg_buffer(item, bp, buf_f);
+       xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
 }
 
 /*
@@ -2138,6 +2164,7 @@ xlog_recover_do_buffer_trans(
        xfs_daddr_t             blkno;
        int                     len;
        ushort                  flags;
+       uint                    buf_flags;
 
        buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
 
@@ -2158,9 +2185,11 @@ xlog_recover_do_buffer_trans(
                 */
                cancel = xlog_recover_do_buffer_pass2(log, buf_f);
                if (cancel) {
+                       trace_xfs_log_recover_buf_cancel(log, buf_f);
                        return 0;
                }
        }
+       trace_xfs_log_recover_buf_recover(log, buf_f);
        switch (buf_f->blf_type) {
        case XFS_LI_BUF:
                blkno = buf_f->blf_blkno;
@@ -2178,12 +2207,11 @@ xlog_recover_do_buffer_trans(
        }
 
        mp = log->l_mp;
-       if (flags & XFS_BLI_INODE_BUF) {
-               bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
-                                                               XFS_BUF_LOCK);
-       } else {
-               bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
-       }
+       buf_flags = XBF_LOCK;
+       if (!(flags & XFS_BLI_INODE_BUF))
+               buf_flags |= XBF_MAPPED;
+
+       bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
        if (XFS_BUF_ISERROR(bp)) {
                xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
                                  bp, blkno);
@@ -2199,7 +2227,7 @@ xlog_recover_do_buffer_trans(
                  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
        } else {
-               xlog_recover_do_reg_buffer(item, bp, buf_f);
+               xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
        }
        if (error)
                return XFS_ERROR(error);
@@ -2226,9 +2254,8 @@ xlog_recover_do_buffer_trans(
                XFS_BUF_STALE(bp);
                error = xfs_bwrite(mp, bp);
        } else {
-               ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-                      XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-               XFS_BUF_SET_FSPRIVATE(bp, mp);
+               ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+               bp->b_mount = mp;
                XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
                xfs_bdwrite(mp, bp);
        }
@@ -2280,11 +2307,13 @@ xlog_recover_do_inode_trans(
        if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
                                        in_f->ilf_len, 0)) {
                error = 0;
+               trace_xfs_log_recover_inode_cancel(log, in_f);
                goto error;
        }
+       trace_xfs_log_recover_inode_recover(log, in_f);
 
-       bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
-                               in_f->ilf_len, XFS_BUF_LOCK);
+       bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
+                         XBF_LOCK);
        if (XFS_BUF_ISERROR(bp)) {
                xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
                                  bp, in_f->ilf_blkno);
@@ -2333,6 +2362,7 @@ xlog_recover_do_inode_trans(
                        /* do nothing */
                } else {
                        xfs_buf_relse(bp);
+                       trace_xfs_log_recover_inode_skip(log, in_f);
                        error = 0;
                        goto error;
                }
@@ -2490,17 +2520,10 @@ xlog_recover_do_inode_trans(
        }
 
 write_inode_buffer:
-       if (ITEM_TYPE(item) == XFS_LI_INODE) {
-               ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-                      XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-               XFS_BUF_SET_FSPRIVATE(bp, mp);
-               XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
-               xfs_bdwrite(mp, bp);
-       } else {
-               XFS_BUF_STALE(bp);
-               error = xfs_bwrite(mp, bp);
-       }
-
+       ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+       bp->b_mount = mp;
+       XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
+       xfs_bdwrite(mp, bp);
 error:
        if (need_free)
                kmem_free(in_f);
@@ -2569,7 +2592,19 @@ xlog_recover_do_dquot_trans(
                return (0);
 
        recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
-       ASSERT(recddq);
+
+       if (item->ri_buf[1].i_addr == NULL) {
+               cmn_err(CE_ALERT,
+                       "XFS: NULL dquot in %s.", __func__);
+               return XFS_ERROR(EIO);
+       }
+       if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
+               cmn_err(CE_ALERT,
+                       "XFS: dquot too small (%d) in %s.",
+                       item->ri_buf[1].i_len, __func__);
+               return XFS_ERROR(EIO);
+       }
+
        /*
         * This type of quotas was turned off, so ignore this record.
         */
@@ -2624,9 +2659,8 @@ xlog_recover_do_dquot_trans(
        memcpy(ddq, recddq, item->ri_buf[1].i_len);
 
        ASSERT(dq_f->qlf_size == 2);
-       ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
-              XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
-       XFS_BUF_SET_FSPRIVATE(bp, mp);
+       ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+       bp->b_mount = mp;
        XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
        xfs_bdwrite(mp, bp);
 
@@ -2748,53 +2782,49 @@ xlog_recover_do_trans(
        int                     pass)
 {
        int                     error = 0;
-       xlog_recover_item_t     *item, *first_item;
+       xlog_recover_item_t     *item;
 
-       if ((error = xlog_recover_reorder_trans(trans)))
+       error = xlog_recover_reorder_trans(log, trans, pass);
+       if (error)
                return error;
-       first_item = item = trans->r_itemq;
-       do {
-               /*
-                * we don't need to worry about the block number being
-                * truncated in > 1 TB buffers because in user-land,
-                * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so
-                * the blknos will get through the user-mode buffer
-                * cache properly.  The only bad case is o32 kernels
-                * where xfs_daddr_t is 32-bits but mount will warn us
-                * off a > 1 TB filesystem before we get here.
-                */
-               if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
-                       if  ((error = xlog_recover_do_buffer_trans(log, item,
-                                                                pass)))
-                               break;
-               } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
-                       if ((error = xlog_recover_do_inode_trans(log, item,
-                                                               pass)))
-                               break;
-               } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
-                       if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
-                                                 pass)))
-                               break;
-               } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
+
+       list_for_each_entry(item, &trans->r_itemq, ri_list) {
+               trace_xfs_log_recover_item_recover(log, trans, item, pass);
+               switch (ITEM_TYPE(item)) {
+               case XFS_LI_BUF:
+                       error = xlog_recover_do_buffer_trans(log, item, pass);
+                       break;
+               case XFS_LI_INODE:
+                       error = xlog_recover_do_inode_trans(log, item, pass);
+                       break;
+               case XFS_LI_EFI:
+                       error = xlog_recover_do_efi_trans(log, item,
+                                                         trans->r_lsn, pass);
+                       break;
+               case XFS_LI_EFD:
                        xlog_recover_do_efd_trans(log, item, pass);
-               } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
-                       if ((error = xlog_recover_do_dquot_trans(log, item,
-                                                                  pass)))
-                                       break;
-               } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
-                       if ((error = xlog_recover_do_quotaoff_trans(log, item,
-                                                                  pass)))
-                                       break;
-               } else {
-                       xlog_warn("XFS: xlog_recover_do_trans");
+                       error = 0;
+                       break;
+               case XFS_LI_DQUOT:
+                       error = xlog_recover_do_dquot_trans(log, item, pass);
+                       break;
+               case XFS_LI_QUOTAOFF:
+                       error = xlog_recover_do_quotaoff_trans(log, item,
+                                                              pass);
+                       break;
+               default:
+                       xlog_warn(
+       "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
                        ASSERT(0);
                        error = XFS_ERROR(EIO);
                        break;
                }
-               item = item->ri_next;
-       } while (first_item != item);
 
-       return error;
+               if (error)
+                       return error;
+       }
+
+       return 0;
 }
 
 /*
@@ -2806,21 +2836,18 @@ STATIC void
 xlog_recover_free_trans(
        xlog_recover_t          *trans)
 {
-       xlog_recover_item_t     *first_item, *item, *free_item;
+       xlog_recover_item_t     *item, *n;
        int                     i;
 
-       item = first_item = trans->r_itemq;
-       do {
-               free_item = item;
-               item = item->ri_next;
-                /* Free the regions in the item. */
-               for (i = 0; i < free_item->ri_cnt; i++) {
-                       kmem_free(free_item->ri_buf[i].i_addr);
-               }
+       list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
+               /* Free the regions in the item. */
+               list_del(&item->ri_list);
+               for (i = 0; i < item->ri_cnt; i++)
+                       kmem_free(item->ri_buf[i].i_addr);
                /* Free the item itself */
-               kmem_free(free_item->ri_buf);
-               kmem_free(free_item);
-       } while (first_item != item);
+               kmem_free(item->ri_buf);
+               kmem_free(item);
+       }
        /* Free the transaction recover structure */
        kmem_free(trans);
 }
@@ -2828,14 +2855,12 @@ xlog_recover_free_trans(
 STATIC int
 xlog_recover_commit_trans(
        xlog_t                  *log,
-       xlog_recover_t          **q,
        xlog_recover_t          *trans,
        int                     pass)
 {
        int                     error;
 
-       if ((error = xlog_recover_unlink_tid(q, trans)))
-               return error;
+       hlist_del(&trans->r_list);
        if ((error = xlog_recover_do_trans(log, trans, pass)))
                return error;
        xlog_recover_free_trans(trans);                 /* no error */
@@ -2863,7 +2888,7 @@ xlog_recover_unmount_trans(
 STATIC int
 xlog_recover_process_data(
        xlog_t                  *log,
-       xlog_recover_t          *rhash[],
+       struct hlist_head       rhash[],
        xlog_rec_header_t       *rhead,
        xfs_caddr_t             dp,
        int                     pass)
@@ -2897,7 +2922,7 @@ xlog_recover_process_data(
                }
                tid = be32_to_cpu(ohead->oh_tid);
                hash = XLOG_RHASH(tid);
-               trans = xlog_recover_find_tid(rhash[hash], tid);
+               trans = xlog_recover_find_tid(&rhash[hash], tid);
                if (trans == NULL) {               /* not found; add new tid */
                        if (ohead->oh_flags & XLOG_START_TRANS)
                                xlog_recover_new_tid(&rhash[hash], tid,
@@ -2915,14 +2940,15 @@ xlog_recover_process_data(
                        switch (flags) {
                        case XLOG_COMMIT_TRANS:
                                error = xlog_recover_commit_trans(log,
-                                               &rhash[hash], trans, pass);
+                                                               trans, pass);
                                break;
                        case XLOG_UNMOUNT_TRANS:
                                error = xlog_recover_unmount_trans(trans);
                                break;
                        case XLOG_WAS_CONT_TRANS:
-                               error = xlog_recover_add_to_cont_trans(trans,
-                                               dp, be32_to_cpu(ohead->oh_len));
+                               error = xlog_recover_add_to_cont_trans(log,
+                                               trans, dp,
+                                               be32_to_cpu(ohead->oh_len));
                                break;
                        case XLOG_START_TRANS:
                                xlog_warn(
@@ -2932,7 +2958,7 @@ xlog_recover_process_data(
                                break;
                        case 0:
                        case XLOG_CONTINUE_TRANS:
-                               error = xlog_recover_add_to_trans(trans,
+                               error = xlog_recover_add_to_trans(log, trans,
                                                dp, be32_to_cpu(ohead->oh_len));
                                break;
                        default:
@@ -3148,13 +3174,12 @@ xlog_recover_process_one_iunlink(
        /*
         * Get the on disk inode to find the next inode in the bucket.
         */
-       ASSERT(ip != NULL);
-       error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+       error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
        if (error)
-               goto fail;
+               goto fail_iput;
 
-       ASSERT(dip != NULL);
        ASSERT(ip->i_d.di_nlink == 0);
+       ASSERT(ip->i_d.di_mode != 0);
 
        /* setup for the next pass */
        agino = be32_to_cpu(dip->di_next_unlinked);
@@ -3166,18 +3191,11 @@ xlog_recover_process_one_iunlink(
         */
        ip->i_d.di_dmevmask = 0;
 
-       /*
-        * If this is a new inode, handle it specially.  Otherwise, just
-        * drop our reference to the inode.  If there are no other
-        * references, this will send the inode to xfs_inactive() which
-        * will truncate the file and free the inode.
-        */
-       if (ip->i_d.di_mode == 0)
-               xfs_iput_new(ip, 0);
-       else
-               IRELE(ip);
+       IRELE(ip);
        return agino;
 
+ fail_iput:
+       IRELE(ip);
  fail:
        /*
         * We can't read in the inode this bucket points to, or this inode
@@ -3203,7 +3221,7 @@ xlog_recover_process_one_iunlink(
  * freeing of the inode and its removal from the list must be
  * atomic.
  */
-void
+STATIC void
 xlog_recover_process_iunlinks(
        xlog_t          *log)
 {
@@ -3457,12 +3475,12 @@ xlog_do_recovery_pass(
 {
        xlog_rec_header_t       *rhead;
        xfs_daddr_t             blk_no;
-       xfs_caddr_t             bufaddr, offset;
+       xfs_caddr_t             offset;
        xfs_buf_t               *hbp, *dbp;
        int                     error = 0, h_size;
        int                     bblks, split_bblks;
        int                     hblks, split_hblks, wrapped_hblks;
-       xlog_recover_t          *rhash[XLOG_RHASH_SIZE];
+       struct hlist_head       rhash[XLOG_RHASH_SIZE];
 
        ASSERT(head_blk != tail_blk);
 
@@ -3479,9 +3497,11 @@ xlog_do_recovery_pass(
                hbp = xlog_get_bp(log, 1);
                if (!hbp)
                        return ENOMEM;
-               if ((error = xlog_bread(log, tail_blk, 1, hbp)))
+
+               error = xlog_bread(log, tail_blk, 1, hbp, &offset);
+               if (error)
                        goto bread_err1;
-               offset = xlog_align(log, tail_blk, 1, hbp);
+
                rhead = (xlog_rec_header_t *)offset;
                error = xlog_valid_rec_header(log, rhead, tail_blk);
                if (error)
@@ -3515,9 +3535,10 @@ xlog_do_recovery_pass(
        memset(rhash, 0, sizeof(rhash));
        if (tail_blk <= head_blk) {
                for (blk_no = tail_blk; blk_no < head_blk; ) {
-                       if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+                       error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no, hblks, hbp);
+
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
@@ -3525,10 +3546,11 @@ xlog_do_recovery_pass(
 
                        /* blocks in data section */
                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-                       error = xlog_bread(log, blk_no + hblks, bblks, dbp);
+                       error = xlog_bread(log, blk_no + hblks, bblks, dbp,
+                                          &offset);
                        if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no + hblks, bblks, dbp);
+
                        xlog_unpack_data(rhead, offset, log);
                        if ((error = xlog_recover_process_data(log,
                                                rhash, rhead, offset, pass)))
@@ -3546,15 +3568,15 @@ xlog_do_recovery_pass(
                        /*
                         * Check for header wrapping around physical end-of-log
                         */
-                       offset = NULL;
+                       offset = XFS_BUF_PTR(hbp);
                        split_hblks = 0;
                        wrapped_hblks = 0;
                        if (blk_no + hblks <= log->l_logBBsize) {
                                /* Read header in one read */
-                               error = xlog_bread(log, blk_no, hblks, hbp);
+                               error = xlog_bread(log, blk_no, hblks, hbp,
+                                                  &offset);
                                if (error)
                                        goto bread_err2;
-                               offset = xlog_align(log, blk_no, hblks, hbp);
                        } else {
                                /* This LR is split across physical log end */
                                if (blk_no != log->l_logBBsize) {
@@ -3562,12 +3584,13 @@ xlog_do_recovery_pass(
                                        ASSERT(blk_no <= INT_MAX);
                                        split_hblks = log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_hblks > 0);
-                                       if ((error = xlog_bread(log, blk_no,
-                                                       split_hblks, hbp)))
+                                       error = xlog_bread(log, blk_no,
+                                                          split_hblks, hbp,
+                                                          &offset);
+                                       if (error)
                                                goto bread_err2;
-                                       offset = xlog_align(log, blk_no,
-                                                       split_hblks, hbp);
                                }
+
                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
@@ -3581,21 +3604,21 @@ xlog_do_recovery_pass(
                                 *   - order is important.
                                 */
                                wrapped_hblks = hblks - split_hblks;
-                               bufaddr = XFS_BUF_PTR(hbp);
                                error = XFS_BUF_SET_PTR(hbp,
-                                               bufaddr + BBTOB(split_hblks),
+                                               offset + BBTOB(split_hblks),
                                                BBTOB(hblks - split_hblks));
-                               if (!error)
-                                       error = xlog_bread(log, 0,
-                                                       wrapped_hblks, hbp);
-                               if (!error)
-                                       error = XFS_BUF_SET_PTR(hbp, bufaddr,
+                               if (error)
+                                       goto bread_err2;
+
+                               error = xlog_bread_noalign(log, 0,
+                                                          wrapped_hblks, hbp);
+                               if (error)
+                                       goto bread_err2;
+
+                               error = XFS_BUF_SET_PTR(hbp, offset,
                                                        BBTOB(hblks));
                                if (error)
                                        goto bread_err2;
-                               if (!offset)
-                                       offset = xlog_align(log, 0,
-                                                       wrapped_hblks, hbp);
                        }
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead,
@@ -3608,14 +3631,14 @@ xlog_do_recovery_pass(
 
                        /* Read in data for log record */
                        if (blk_no + bblks <= log->l_logBBsize) {
-                               error = xlog_bread(log, blk_no, bblks, dbp);
+                               error = xlog_bread(log, blk_no, bblks, dbp,
+                                                  &offset);
                                if (error)
                                        goto bread_err2;
-                               offset = xlog_align(log, blk_no, bblks, dbp);
                        } else {
                                /* This log record is split across the
                                 * physical end of log */
-                               offset = NULL;
+                               offset = XFS_BUF_PTR(dbp);
                                split_bblks = 0;
                                if (blk_no != log->l_logBBsize) {
                                        /* some data is before the physical
@@ -3625,12 +3648,13 @@ xlog_do_recovery_pass(
                                        split_bblks =
                                                log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_bblks > 0);
-                                       if ((error = xlog_bread(log, blk_no,
-                                                       split_bblks, dbp)))
+                                       error = xlog_bread(log, blk_no,
+                                                       split_bblks, dbp,
+                                                       &offset);
+                                       if (error)
                                                goto bread_err2;
-                                       offset = xlog_align(log, blk_no,
-                                                       split_bblks, dbp);
                                }
+
                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
@@ -3643,22 +3667,21 @@ xlog_do_recovery_pass(
                                 *   _first_, then the log start (LR header end)
                                 *   - order is important.
                                 */
-                               bufaddr = XFS_BUF_PTR(dbp);
                                error = XFS_BUF_SET_PTR(dbp,
-                                               bufaddr + BBTOB(split_bblks),
+                                               offset + BBTOB(split_bblks),
                                                BBTOB(bblks - split_bblks));
-                               if (!error)
-                                       error = xlog_bread(log, wrapped_hblks,
-                                                       bblks - split_bblks,
-                                                       dbp);
-                               if (!error)
-                                       error = XFS_BUF_SET_PTR(dbp, bufaddr,
-                                                       h_size);
                                if (error)
                                        goto bread_err2;
-                               if (!offset)
-                                       offset = xlog_align(log, wrapped_hblks,
-                                               bblks - split_bblks, dbp);
+
+                               error = xlog_bread_noalign(log, wrapped_hblks,
+                                               bblks - split_bblks,
+                                               dbp);
+                               if (error)
+                                       goto bread_err2;
+
+                               error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+                               if (error)
+                                       goto bread_err2;
                        }
                        xlog_unpack_data(rhead, offset, log);
                        if ((error = xlog_recover_process_data(log, rhash,
@@ -3672,17 +3695,21 @@ xlog_do_recovery_pass(
 
                /* read first part of physical log */
                while (blk_no < head_blk) {
-                       if ((error = xlog_bread(log, blk_no, hblks, hbp)))
+                       error = xlog_bread(log, blk_no, hblks, hbp, &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no, hblks, hbp);
+
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
                                goto bread_err2;
+
                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
-                       if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
+                       error = xlog_bread(log, blk_no+hblks, bblks, dbp,
+                                          &offset);
+                       if (error)
                                goto bread_err2;
-                       offset = xlog_align(log, blk_no+hblks, bblks, dbp);
+
                        xlog_unpack_data(rhead, offset, log);
                        if ((error = xlog_recover_process_data(log, rhash,
                                                        rhead, offset, pass)))
@@ -3914,8 +3941,7 @@ xlog_recover_finish(
                 * case the unlink transactions would have problems
                 * pushing the EFIs out of the way.
                 */
-               xfs_log_force(log->l_mp, (xfs_lsn_t)0,
-                             (XFS_LOG_FORCE | XFS_LOG_SYNC));
+               xfs_log_force(log->l_mp, XFS_LOG_SYNC);
 
                xlog_recover_process_iunlinks(log);