ocfs2: Remove the dentry vote
[safe/jmp/linux-2.6] / fs / ocfs2 / dlm / dlmrecovery.c
index 59c8976..9d950d7 100644 (file)
@@ -95,11 +95,14 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
 static void dlm_request_all_locks_worker(struct dlm_work_item *item,
                                         void *data);
 static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+                                     struct dlm_lock_resource *res,
+                                     u8 *real_master);
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
@@ -134,12 +137,18 @@ static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
        dlm->reco.new_master = master;
 }
 
-static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
 {
-       spin_lock(&dlm->spinlock);
+       assert_spin_locked(&dlm->spinlock);
        clear_bit(dlm->reco.dead_node, dlm->recovery_map);
        dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
        dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+       spin_lock(&dlm->spinlock);
+       __dlm_reset_recovery(dlm);
        spin_unlock(&dlm->spinlock);
 }
 
@@ -151,12 +160,21 @@ void dlm_dispatch_work(void *data)
        struct list_head *iter, *iter2;
        struct dlm_work_item *item;
        dlm_workfunc_t *workfunc;
+       int tot=0;
+
+       if (!dlm_joined(dlm))
+               return;
 
        spin_lock(&dlm->work_lock);
        list_splice_init(&dlm->work_list, &tmp_list);
        spin_unlock(&dlm->work_lock);
 
        list_for_each_safe(iter, iter2, &tmp_list) {
+               tot++;
+       }
+       mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
+       list_for_each_safe(iter, iter2, &tmp_list) {
                item = list_entry(iter, struct dlm_work_item, list);
                workfunc = item->func;
                list_del_init(&item->list);
@@ -239,6 +257,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
  *
  */
 
+static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
+{
+       struct dlm_reco_node_data *ndata;
+       struct dlm_lock_resource *res;
+
+       mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
+            dlm->name, dlm->dlm_reco_thread_task->pid,
+            dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
+            dlm->reco.dead_node, dlm->reco.new_master);
+
+       list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+               char *st = "unknown";
+               switch (ndata->state) {
+                       case DLM_RECO_NODE_DATA_INIT:
+                               st = "init";
+                               break;
+                       case DLM_RECO_NODE_DATA_REQUESTING:
+                               st = "requesting";
+                               break;
+                       case DLM_RECO_NODE_DATA_DEAD:
+                               st = "dead";
+                               break;
+                       case DLM_RECO_NODE_DATA_RECEIVING:
+                               st = "receiving";
+                               break;
+                       case DLM_RECO_NODE_DATA_REQUESTED:
+                               st = "requested";
+                               break;
+                       case DLM_RECO_NODE_DATA_DONE:
+                               st = "done";
+                               break;
+                       case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                               st = "finalize-sent";
+                               break;
+                       default:
+                               st = "bad";
+                               break;
+               }
+               mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
+                    dlm->name, ndata->node_num, st);
+       }
+       list_for_each_entry(res, &dlm->reco.resources, recovering) {
+               mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
+                    dlm->name, res->lockname.len, res->lockname.name);
+       }
+}
 
 #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
 
@@ -291,6 +355,18 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
        return dead;
 }
 
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+       int recovered;
+       spin_lock(&dlm->spinlock);
+       recovered = !test_bit(node, dlm->recovery_map);
+       spin_unlock(&dlm->spinlock);
+       return recovered;
+}
+
+
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
 {
        if (timeout) {
@@ -309,6 +385,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
        return 0;
 }
 
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+       if (timeout) {
+               mlog(0, "%s: waiting %dms for notification of "
+                    "recovery of node %u\n", dlm->name, timeout, node);
+               wait_event_timeout(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_recovered(dlm, node),
+                          msecs_to_jiffies(timeout));
+       } else {
+               mlog(0, "%s: waiting indefinitely for notification "
+                    "of recovery of node %u\n", dlm->name, node);
+               wait_event(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_recovered(dlm, node));
+       }
+       /* for now, return 0 */
+       return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
@@ -327,6 +421,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
 
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 {
+       if (dlm_in_recovery(dlm)) {
+               mlog(0, "%s: reco thread %d in recovery: "
+                    "state=%d, master=%u, dead=%u\n",
+                    dlm->name, dlm->dlm_reco_thread_task->pid,
+                    dlm->reco.state, dlm->reco.new_master,
+                    dlm->reco.dead_node);
+       }
        wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
 }
 
@@ -385,7 +486,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                /* return to main thread loop and sleep. */
                return 0;
        }
-       mlog(0, "recovery thread found node %u in the recovery map!\n",
+       mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+            dlm->name, dlm->dlm_reco_thread_task->pid,
             dlm->reco.dead_node);
        spin_unlock(&dlm->spinlock);
 
@@ -408,8 +510,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                }
                mlog(0, "another node will master this recovery session.\n");
        }
-       mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
-            dlm->name, dlm->reco.new_master,
+       mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
+            dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
             dlm->node_num, dlm->reco.dead_node);
 
        /* it is safe to start everything back up here
@@ -421,11 +523,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
        return 0;
 
 master_here:
-       mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+       mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+            dlm->dlm_reco_thread_task->pid,
             dlm->name, dlm->reco.dead_node, dlm->node_num);
 
        status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
        if (status < 0) {
+               /* we should never hit this anymore */
                mlog(ML_ERROR, "error %d remastering locks for node %u, "
                     "retrying.\n", status, dlm->reco.dead_node);
                /* yield a bit to allow any final network messages
@@ -452,9 +556,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        int destroy = 0;
        int pass = 0;
 
-       status = dlm_init_recovery_area(dlm, dead_node);
-       if (status < 0)
-               goto leave;
+       do {
+               /* we have become recovery master.  there is no escaping
+                * this, so just keep trying until we get it. */
+               status = dlm_init_recovery_area(dlm, dead_node);
+               if (status < 0) {
+                       mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+                            "retrying\n", dlm->name);
+                       msleep(1000);
+               }
+       } while (status != 0);
 
        /* safe to access the node data list without a lock, since this
         * process is the only one to change the list */
@@ -471,16 +582,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                        continue;
                }
 
-               status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
-               if (status < 0) {
-                       mlog_errno(status);
-                       if (dlm_is_host_down(status))
-                               ndata->state = DLM_RECO_NODE_DATA_DEAD;
-                       else {
-                               destroy = 1;
-                               goto leave;
+               do {
+                       status = dlm_request_all_locks(dlm, ndata->node_num,
+                                                      dead_node);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               if (dlm_is_host_down(status)) {
+                                       /* node died, ignore it for recovery */
+                                       status = 0;
+                                       ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                       /* wait for the domain map to catch up
+                                        * with the network state. */
+                                       wait_event_timeout(dlm->dlm_reco_thread_wq,
+                                                          dlm_is_node_dead(dlm,
+                                                               ndata->node_num),
+                                                          msecs_to_jiffies(1000));
+                                       mlog(0, "waited 1 sec for %u, "
+                                            "dead? %s\n", ndata->node_num,
+                                            dlm_is_node_dead(dlm, ndata->node_num) ?
+                                            "yes" : "no");
+                               } else {
+                                       /* -ENOMEM on the other node */
+                                       mlog(0, "%s: node %u returned "
+                                            "%d during recovery, retrying "
+                                            "after a short wait\n",
+                                            dlm->name, ndata->node_num,
+                                            status);
+                                       msleep(100);
+                               }
                        }
-               }
+               } while (status != 0);
 
                switch (ndata->state) {
                        case DLM_RECO_NODE_DATA_INIT:
@@ -492,10 +623,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                mlog(0, "node %u died after requesting "
                                     "recovery info for node %u\n",
                                     ndata->node_num, dead_node);
-                               // start all over
-                               destroy = 1;
-                               status = -EAGAIN;
-                               goto leave;
+                               /* fine.  don't need this node's info.
+                                * continue without it. */
+                               break;
                        case DLM_RECO_NODE_DATA_REQUESTING:
                                ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
                                mlog(0, "now receiving recovery data from "
@@ -539,35 +669,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                        BUG();
                                        break;
                                case DLM_RECO_NODE_DATA_DEAD:
-                                       mlog(ML_NOTICE, "node %u died after "
+                                       mlog(0, "node %u died after "
                                             "requesting recovery info for "
                                             "node %u\n", ndata->node_num,
                                             dead_node);
-                                       spin_unlock(&dlm_reco_state_lock);
-                                       // start all over
-                                       destroy = 1;
-                                       status = -EAGAIN;
-                                       /* instead of spinning like crazy here,
-                                        * wait for the domain map to catch up
-                                        * with the network state.  otherwise this
-                                        * can be hit hundreds of times before
-                                        * the node is really seen as dead. */
-                                       wait_event_timeout(dlm->dlm_reco_thread_wq,
-                                                          dlm_is_node_dead(dlm,
-                                                               ndata->node_num),
-                                                          msecs_to_jiffies(1000));
-                                       mlog(0, "waited 1 sec for %u, "
-                                            "dead? %s\n", ndata->node_num,
-                                            dlm_is_node_dead(dlm, ndata->node_num) ?
-                                            "yes" : "no");
-                                       goto leave;
+                                       break;
                                case DLM_RECO_NODE_DATA_RECEIVING:
                                case DLM_RECO_NODE_DATA_REQUESTED:
+                                       mlog(0, "%s: node %u still in state %s\n",
+                                            dlm->name, ndata->node_num,
+                                            ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+                                            "receiving" : "requested");
                                        all_nodes_done = 0;
                                        break;
                                case DLM_RECO_NODE_DATA_DONE:
+                                       mlog(0, "%s: node %u state is done\n",
+                                            dlm->name, ndata->node_num);
                                        break;
                                case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+                                       mlog(0, "%s: node %u state is finalize\n",
+                                            dlm->name, ndata->node_num);
                                        break;
                        }
                }
@@ -597,7 +718,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                             jiffies, dlm->reco.dead_node,
                             dlm->node_num, dlm->reco.new_master);
                        destroy = 1;
-                       status = ret;
+                       status = 0;
                        /* rescan everything marked dirty along the way */
                        dlm_kick_thread(dlm, NULL);
                        break;
@@ -610,7 +731,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
        }
 
-leave:
        if (destroy)
                dlm_destroy_recovery_area(dlm, dead_node);
 
@@ -636,7 +756,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
                }
                BUG_ON(num == dead_node);
 
-               ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+               ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
                if (!ndata) {
                        dlm_destroy_recovery_area(dlm, dead_node);
                        return -ENOMEM;
@@ -714,20 +834,21 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
                mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
                     "dead_node is %u\n", dlm->name, lr->node_idx,
                     lr->dead_node, dlm->reco.dead_node);
+               dlm_print_reco_node_status(dlm);
                /* this is a hack */
                dlm_put(dlm);
                return -ENOMEM;
        }
        BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
-       item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+       item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!item) {
                dlm_put(dlm);
                return -ENOMEM;
        }
 
        /* this will get freed by dlm_request_all_locks_worker */
-       buf = (char *) __get_free_page(GFP_KERNEL);
+       buf = (char *) __get_free_page(GFP_NOFS);
        if (!buf) {
                kfree(item);
                dlm_put(dlm);
@@ -742,7 +863,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-       schedule_work(&dlm->dispatched_work);
+       queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
        dlm_put(dlm);
        return 0;
@@ -764,26 +885,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        reco_master = item->u.ral.reco_master;
        mres = (struct dlm_migratable_lockres *)data;
 
+       mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+            dlm->name, dead_node, reco_master);
+
        if (dead_node != dlm->reco.dead_node ||
            reco_master != dlm->reco.new_master) {
-               /* show extra debug info if the recovery state is messed */
-               mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
-                    "request(dead=%u, master=%u)\n",
-                    dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
-                    dead_node, reco_master);
-               mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-                    "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
-                    dlm->name, mres->lockname_len, mres->lockname, mres->master,
-                    mres->num_locks, mres->total_locks, mres->flags,
-                    dlm_get_lock_cookie_node(mres->ml[0].cookie),
-                    dlm_get_lock_cookie_seq(mres->ml[0].cookie),
-                    mres->ml[0].list, mres->ml[0].flags,
-                    mres->ml[0].type, mres->ml[0].convert_type,
-                    mres->ml[0].highest_blocked, mres->ml[0].node);
-               BUG();
+               /* worker could have been created before the recovery master
+                * died.  if so, do not continue, but do not error. */
+               if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+                       mlog(ML_NOTICE, "%s: will not send recovery state, "
+                            "recovery master %u died, thread=(dead=%u,mas=%u)"
+                            " current=(dead=%u,mas=%u)\n", dlm->name,
+                            reco_master, dead_node, reco_master,
+                            dlm->reco.dead_node, dlm->reco.new_master);
+               } else {
+                       mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+                            "master=%u), request(dead=%u, master=%u)\n",
+                            dlm->name, dlm->reco.dead_node,
+                            dlm->reco.new_master, dead_node, reco_master);
+               }
+               goto leave;
        }
-       BUG_ON(dead_node != dlm->reco.dead_node);
-       BUG_ON(reco_master != dlm->reco.new_master);
 
        /* lock resources should have already been moved to the
         * dlm->reco.resources list.  now move items from that list
@@ -802,7 +924,9 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
                ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
                                        DLM_MRES_RECOVERY);
                if (ret < 0) {
-                       mlog_errno(ret);
+                       mlog(ML_ERROR, "%s: node %u went down while sending "
+                            "recovery state for dead node %u, ret=%d\n", dlm->name,
+                            reco_master, dead_node, ret);
                        skip_all_done = 1;
                        break;
                }
@@ -816,10 +940,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
        if (!skip_all_done) {
                ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
                if (ret < 0) {
-                       mlog_errno(ret);
+                       mlog(ML_ERROR, "%s: node %u went down while sending "
+                            "recovery all-done for dead node %u, ret=%d\n",
+                            dlm->name, reco_master, dead_node, ret);
                }
        }
-
+leave:
        free_page((unsigned long)data);
 }
 
@@ -865,7 +991,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
        mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
             "node_idx=%u, this node=%u\n", done->dead_node,
             dlm->reco.dead_node, done->node_idx, dlm->node_num);
-       BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+       mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+                       "Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+                       "node_idx=%u, this node=%u\n", done->dead_node,
+                       dlm->reco.dead_node, done->node_idx, dlm->node_num);
 
        spin_lock(&dlm_reco_state_lock);
        list_for_each(iter, &dlm->reco.node_data) {
@@ -1191,8 +1321,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
                mlog(0, "all done flag.  all lockres data received!\n");
 
        ret = -ENOMEM;
-       buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
-       item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+       buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+       item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!buf || !item)
                goto leave;
 
@@ -1283,7 +1413,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-       schedule_work(&dlm->dispatched_work);
+       queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 leave:
        dlm_put(dlm);
@@ -1357,8 +1487,9 @@ leave:
 
 
 
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res, u8 *real_master)
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+                                     struct dlm_lock_resource *res,
+                                     u8 *real_master)
 {
        struct dlm_node_iter iter;
        int nodenum;
@@ -1600,29 +1731,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                }
                lksb->flags |= (ml->flags &
                                (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
-                       
+
+               if (ml->type == LKM_NLMODE)
+                       goto skip_lvb;
+
                if (!dlm_lvb_is_empty(mres->lvb)) {
                        if (lksb->flags & DLM_LKSB_PUT_LVB) {
                                /* other node was trying to update
                                 * lvb when node died.  recreate the
                                 * lksb with the updated lvb. */
                                memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+                               /* the lock resource lvb update must happen
+                                * NOW, before the spinlock is dropped.
+                                * we no longer wait for the AST to update
+                                * the lvb. */
+                               memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
                        } else {
                                /* otherwise, the node is sending its 
                                 * most recent valid lvb info */
                                BUG_ON(ml->type != LKM_EXMODE &&
                                       ml->type != LKM_PRMODE);
                                if (!dlm_lvb_is_empty(res->lvb) &&
-                                   (ml->type == LKM_EXMODE ||
-                                    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
-                                       mlog(ML_ERROR, "received bad lvb!\n");
-                                       __dlm_print_one_lock_resource(res);
-                                       BUG();
+                                   (ml->type == LKM_EXMODE ||
+                                    memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+                                       int i;
+                                       mlog(ML_ERROR, "%s:%.*s: received bad "
+                                            "lvb! type=%d\n", dlm->name,
+                                            res->lockname.len,
+                                            res->lockname.name, ml->type);
+                                       printk("lockres lvb=[");
+                                       for (i=0; i<DLM_LVB_LEN; i++)
+                                               printk("%02x", res->lvb[i]);
+                                       printk("]\nmigrated lvb=[");
+                                       for (i=0; i<DLM_LVB_LEN; i++)
+                                               printk("%02x", mres->lvb[i]);
+                                       printk("]\n");
+                                       dlm_print_one_lock_resource(res);
+                                       BUG();
                                }
                                memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
                        }
                }
-
+skip_lvb:
 
                /* NOTE:
                 * wrt lock queue ordering and recovery:
@@ -1690,8 +1840,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
        struct dlm_lock *lock;
 
        res->state |= DLM_LOCK_RES_RECOVERING;
-       if (!list_empty(&res->recovering))
+       if (!list_empty(&res->recovering)) {
+               mlog(0,
+                    "Recovering res %s:%.*s, is already on recovery list!\n",
+                    dlm->name, res->lockname.len, res->lockname.name);
                list_del_init(&res->recovering);
+       }
+       /* We need to hold a reference while on the recovery list */
+       dlm_lockres_get(res);
        list_add_tail(&res->recovering, &dlm->reco.resources);
 
        /* find any pending locks and put them back on proper list */
@@ -1780,9 +1936,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                        spin_lock(&res->spinlock);
                        dlm_change_lockres_owner(dlm, res, new_master);
                        res->state &= ~DLM_LOCK_RES_RECOVERING;
-                       __dlm_dirty_lockres(dlm, res);
+                       if (!__dlm_lockres_unused(res))
+                               __dlm_dirty_lockres(dlm, res);
                        spin_unlock(&res->spinlock);
                        wake_up(&res->wq);
+                       dlm_lockres_put(res);
                }
        }
 
@@ -1815,11 +1973,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
                                             dlm->name, res->lockname.len,
                                             res->lockname.name, res->owner);
                                        list_del_init(&res->recovering);
+                                       dlm_lockres_put(res);
                                }
                                spin_lock(&res->spinlock);
                                dlm_change_lockres_owner(dlm, res, new_master);
                                res->state &= ~DLM_LOCK_RES_RECOVERING;
-                               __dlm_dirty_lockres(dlm, res);
+                               if (!__dlm_lockres_unused(res))
+                                       __dlm_dirty_lockres(dlm, res);
                                spin_unlock(&res->spinlock);
                                wake_up(&res->wq);
                        }
@@ -1996,6 +2156,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 {
        assert_spin_locked(&dlm->spinlock);
 
+       if (dlm->reco.new_master == idx) {
+               mlog(0, "%s: recovery master %d just died\n",
+                    dlm->name, idx);
+               if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+                       /* finalize1 was reached, so it is safe to clear
+                        * the new_master and dead_node.  that recovery
+                        * is complete. */
+                       mlog(0, "%s: dead master %d had reached "
+                            "finalize1 state, clearing\n", dlm->name, idx);
+                       dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+                       __dlm_reset_recovery(dlm);
+               }
+       }
+
        /* check to see if the node is already considered dead */
        if (!test_bit(idx, dlm->live_nodes_map)) {
                mlog(0, "for domain %s, node %d is already dead. "
@@ -2111,7 +2285,8 @@ again:
        memset(&lksb, 0, sizeof(lksb));
 
        ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
-                     DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+                     DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
+                     dlm_reco_ast, dlm, dlm_reco_bast);
 
        mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
             dlm->name, ret, lksb.status);
@@ -2197,6 +2372,10 @@ again:
                mlog(0, "%s: reco master %u is ready to recover %u\n",
                     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
                status = -EEXIST;
+       } else if (ret == DLM_RECOVERING) {
+               mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+                    dlm->name, dlm->node_num);
+               goto again;
        } else {
                struct dlm_lock_resource *res;
 
@@ -2228,7 +2407,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 
        mlog_entry("%u\n", dead_node);
 
-       mlog(0, "dead node is %u\n", dead_node);
+       mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
        spin_lock(&dlm->spinlock);
        dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2286,6 +2465,14 @@ retry:
                         * another ENOMEM */
                        msleep(100);
                        goto retry;
+               } else if (ret == EAGAIN) {
+                       mlog(0, "%s: trying to start recovery of node "
+                            "%u, but node %u is waiting for last recovery "
+                            "to complete, backoff for a bit\n", dlm->name,
+                            dead_node, nodenum);
+                       /* TODO Look into replacing msleep with cond_resched() */
+                       msleep(100);
+                       goto retry;
                }
        }
 
@@ -2301,8 +2488,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
        if (!dlm_grab(dlm))
                return 0;
 
-       mlog(0, "node %u wants to recover node %u\n",
-                 br->node_idx, br->dead_node);
+       spin_lock(&dlm->spinlock);
+       if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+               mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+                    "but this node is in finalize state, waiting on finalize2\n",
+                    dlm->name, br->node_idx, br->dead_node,
+                    dlm->reco.dead_node, dlm->reco.new_master);
+               spin_unlock(&dlm->spinlock);
+               return EAGAIN;
+       }
+       spin_unlock(&dlm->spinlock);
+
+       mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+            dlm->name, br->node_idx, br->dead_node,
+            dlm->reco.dead_node, dlm->reco.new_master);
 
        dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
 
@@ -2344,10 +2543,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_unlock(&dlm->spinlock);
 
        dlm_kick_recovery_thread(dlm);
+
+       mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+            dlm->name, br->node_idx, br->dead_node,
+            dlm->reco.dead_node, dlm->reco.new_master);
+
        dlm_put(dlm);
        return 0;
 }
 
+#define DLM_FINALIZE_STAGE2  0x01
 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
 {
        int ret = 0;
@@ -2355,25 +2560,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
        struct dlm_node_iter iter;
        int nodenum;
        int status;
+       int stage = 1;
 
-       mlog(0, "finishing recovery for node %s:%u\n",
-            dlm->name, dlm->reco.dead_node);
+       mlog(0, "finishing recovery for node %s:%u, "
+            "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
 
        spin_lock(&dlm->spinlock);
        dlm_node_iter_init(dlm->domain_map, &iter);
        spin_unlock(&dlm->spinlock);
 
+stage2:
        memset(&fr, 0, sizeof(fr));
        fr.node_idx = dlm->node_num;
        fr.dead_node = dlm->reco.dead_node;
+       if (stage == 2)
+               fr.flags |= DLM_FINALIZE_STAGE2;
 
        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
                if (nodenum == dlm->node_num)
                        continue;
                ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
                                         &fr, sizeof(fr), nodenum, &status);
-               if (ret >= 0) {
+               if (ret >= 0)
                        ret = status;
+               if (ret < 0) {
+                       mlog_errno(ret);
                        if (dlm_is_host_down(ret)) {
                                /* this has no effect on this recovery 
                                 * session, so set the status to zero to 
@@ -2381,13 +2592,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
                                mlog(ML_ERROR, "node %u went down after this "
                                     "node finished recovery.\n", nodenum);
                                ret = 0;
+                               continue;
                        }
-               }
-               if (ret < 0) {
-                       mlog_errno(ret);
                        break;
                }
        }
+       if (stage == 1) {
+               /* reset the node_iter back to the top and send finalize2 */
+               iter.curnode = -1;
+               stage = 2;
+               goto stage2;
+       }
 
        return ret;
 }
@@ -2396,14 +2611,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 {
        struct dlm_ctxt *dlm = data;
        struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+       int stage = 1;
 
        /* ok to return 0, domain has gone away */
        if (!dlm_grab(dlm))
                return 0;
 
-       mlog(0, "node %u finalizing recovery of node %u\n",
-            fr->node_idx, fr->dead_node);
+       if (fr->flags & DLM_FINALIZE_STAGE2)
+               stage = 2;
 
+       mlog(0, "%s: node %u finalizing recovery stage%d of "
+            "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+            fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
        spin_lock(&dlm->spinlock);
 
        if (dlm->reco.new_master != fr->node_idx) {
@@ -2419,13 +2639,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
                BUG();
        }
 
-       dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
-       spin_unlock(&dlm->spinlock);
+       switch (stage) {
+               case 1:
+                       dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+                       if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+                               mlog(ML_ERROR, "%s: received finalize1 from "
+                                    "new master %u for dead node %u, but "
+                                    "this node has already received it!\n",
+                                    dlm->name, fr->node_idx, fr->dead_node);
+                               dlm_print_reco_node_status(dlm);
+                               BUG();
+                       }
+                       dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+                       spin_unlock(&dlm->spinlock);
+                       break;
+               case 2:
+                       if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+                               mlog(ML_ERROR, "%s: received finalize2 from "
+                                    "new master %u for dead node %u, but "
+                                    "this node did not have finalize1!\n",
+                                    dlm->name, fr->node_idx, fr->dead_node);
+                               dlm_print_reco_node_status(dlm);
+                               BUG();
+                       }
+                       dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+                       spin_unlock(&dlm->spinlock);
+                       dlm_reset_recovery(dlm);
+                       dlm_kick_recovery_thread(dlm);
+                       break;
+               default:
+                       BUG();
+       }
 
-       dlm_reset_recovery(dlm);
+       mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+            dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
 
-       dlm_kick_recovery_thread(dlm);
        dlm_put(dlm);
        return 0;
 }