ocfs2: Remove the dentry vote
[safe/jmp/linux-2.6] / fs / ocfs2 / dlm / dlmrecovery.c
index 86199f6..9d950d7 100644 (file)
@@ -95,11 +95,14 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
 static void dlm_request_all_locks_worker(struct dlm_work_item *item,
                                         void *data);
 static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+                                     struct dlm_lock_resource *res,
+                                     u8 *real_master);
 
 static u64 dlm_get_next_mig_cookie(void);
 
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
 static u64 dlm_mig_cookie = 1;
 
 static u64 dlm_get_next_mig_cookie(void)
@@ -157,12 +160,21 @@ void dlm_dispatch_work(void *data)
        struct list_head *iter, *iter2;
        struct dlm_work_item *item;
        dlm_workfunc_t *workfunc;
+       int tot=0;
+
+       if (!dlm_joined(dlm))
+               return;
 
        spin_lock(&dlm->work_lock);
        list_splice_init(&dlm->work_list, &tmp_list);
        spin_unlock(&dlm->work_lock);
 
        list_for_each_safe(iter, iter2, &tmp_list) {
+               tot++;
+       }
+       mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
+       list_for_each_safe(iter, iter2, &tmp_list) {
                item = list_entry(iter, struct dlm_work_item, list);
                workfunc = item->func;
                list_del_init(&item->list);
@@ -343,6 +355,18 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
        return dead;
 }
 
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+       int recovered;
+       spin_lock(&dlm->spinlock);
+       recovered = !test_bit(node, dlm->recovery_map);
+       spin_unlock(&dlm->spinlock);
+       return recovered;
+}
+
+
 int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
 {
        if (timeout) {
@@ -361,6 +385,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
        return 0;
 }
 
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+       if (timeout) {
+               mlog(0, "%s: waiting %dms for notification of "
+                    "recovery of node %u\n", dlm->name, timeout, node);
+               wait_event_timeout(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_recovered(dlm, node),
+                          msecs_to_jiffies(timeout));
+       } else {
+               mlog(0, "%s: waiting indefinitely for notification "
+                    "of recovery of node %u\n", dlm->name, node);
+               wait_event(dlm->dlm_reco_thread_wq,
+                          dlm_is_node_recovered(dlm, node));
+       }
+       /* for now, return 0 */
+       return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
@@ -379,6 +421,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
 
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 {
+       if (dlm_in_recovery(dlm)) {
+               mlog(0, "%s: reco thread %d in recovery: "
+                    "state=%d, master=%u, dead=%u\n",
+                    dlm->name, dlm->dlm_reco_thread_task->pid,
+                    dlm->reco.state, dlm->reco.new_master,
+                    dlm->reco.dead_node);
+       }
        wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
 }
 
@@ -480,6 +529,7 @@ master_here:
 
        status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
        if (status < 0) {
+               /* we should never hit this anymore */
                mlog(ML_ERROR, "error %d remastering locks for node %u, "
                     "retrying.\n", status, dlm->reco.dead_node);
                /* yield a bit to allow any final network messages
@@ -506,9 +556,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
        int destroy = 0;
        int pass = 0;
 
-       status = dlm_init_recovery_area(dlm, dead_node);
-       if (status < 0)
-               goto leave;
+       do {
+               /* we have become recovery master.  there is no escaping
+                * this, so just keep trying until we get it. */
+               status = dlm_init_recovery_area(dlm, dead_node);
+               if (status < 0) {
+                       mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+                            "retrying\n", dlm->name);
+                       msleep(1000);
+               }
+       } while (status != 0);
 
        /* safe to access the node data list without a lock, since this
         * process is the only one to change the list */
@@ -525,16 +582,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                        continue;
                }
 
-               status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
-               if (status < 0) {
-                       mlog_errno(status);
-                       if (dlm_is_host_down(status))
-                               ndata->state = DLM_RECO_NODE_DATA_DEAD;
-                       else {
-                               destroy = 1;
-                               goto leave;
+               do {
+                       status = dlm_request_all_locks(dlm, ndata->node_num,
+                                                      dead_node);
+                       if (status < 0) {
+                               mlog_errno(status);
+                               if (dlm_is_host_down(status)) {
+                                       /* node died, ignore it for recovery */
+                                       status = 0;
+                                       ndata->state = DLM_RECO_NODE_DATA_DEAD;
+                                       /* wait for the domain map to catch up
+                                        * with the network state. */
+                                       wait_event_timeout(dlm->dlm_reco_thread_wq,
+                                                          dlm_is_node_dead(dlm,
+                                                               ndata->node_num),
+                                                          msecs_to_jiffies(1000));
+                                       mlog(0, "waited 1 sec for %u, "
+                                            "dead? %s\n", ndata->node_num,
+                                            dlm_is_node_dead(dlm, ndata->node_num) ?
+                                            "yes" : "no");
+                               } else {
+                                       /* -ENOMEM on the other node */
+                                       mlog(0, "%s: node %u returned "
+                                            "%d during recovery, retrying "
+                                            "after a short wait\n",
+                                            dlm->name, ndata->node_num,
+                                            status);
+                                       msleep(100);
+                               }
                        }
-               }
+               } while (status != 0);
 
                switch (ndata->state) {
                        case DLM_RECO_NODE_DATA_INIT:
@@ -546,10 +623,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                mlog(0, "node %u died after requesting "
                                     "recovery info for node %u\n",
                                     ndata->node_num, dead_node);
-                               // start all over
-                               destroy = 1;
-                               status = -EAGAIN;
-                               goto leave;
+                               /* fine.  don't need this node's info.
+                                * continue without it. */
+                               break;
                        case DLM_RECO_NODE_DATA_REQUESTING:
                                ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
                                mlog(0, "now receiving recovery data from "
@@ -593,28 +669,11 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                                        BUG();
                                        break;
                                case DLM_RECO_NODE_DATA_DEAD:
-                                       mlog(ML_NOTICE, "node %u died after "
+                                       mlog(0, "node %u died after "
                                             "requesting recovery info for "
                                             "node %u\n", ndata->node_num,
                                             dead_node);
-                                       spin_unlock(&dlm_reco_state_lock);
-                                       // start all over
-                                       destroy = 1;
-                                       status = -EAGAIN;
-                                       /* instead of spinning like crazy here,
-                                        * wait for the domain map to catch up
-                                        * with the network state.  otherwise this
-                                        * can be hit hundreds of times before
-                                        * the node is really seen as dead. */
-                                       wait_event_timeout(dlm->dlm_reco_thread_wq,
-                                                          dlm_is_node_dead(dlm,
-                                                               ndata->node_num),
-                                                          msecs_to_jiffies(1000));
-                                       mlog(0, "waited 1 sec for %u, "
-                                            "dead? %s\n", ndata->node_num,
-                                            dlm_is_node_dead(dlm, ndata->node_num) ?
-                                            "yes" : "no");
-                                       goto leave;
+                                       break;
                                case DLM_RECO_NODE_DATA_RECEIVING:
                                case DLM_RECO_NODE_DATA_REQUESTED:
                                        mlog(0, "%s: node %u still in state %s\n",
@@ -659,7 +718,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                             jiffies, dlm->reco.dead_node,
                             dlm->node_num, dlm->reco.new_master);
                        destroy = 1;
-                       status = ret;
+                       status = 0;
                        /* rescan everything marked dirty along the way */
                        dlm_kick_thread(dlm, NULL);
                        break;
@@ -672,7 +731,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 
        }
 
-leave:
        if (destroy)
                dlm_destroy_recovery_area(dlm, dead_node);
 
@@ -698,7 +756,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
                }
                BUG_ON(num == dead_node);
 
-               ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+               ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
                if (!ndata) {
                        dlm_destroy_recovery_area(dlm, dead_node);
                        return -ENOMEM;
@@ -783,14 +841,14 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
        }
        BUG_ON(lr->dead_node != dlm->reco.dead_node);
 
-       item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+       item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!item) {
                dlm_put(dlm);
                return -ENOMEM;
        }
 
        /* this will get freed by dlm_request_all_locks_worker */
-       buf = (char *) __get_free_page(GFP_KERNEL);
+       buf = (char *) __get_free_page(GFP_NOFS);
        if (!buf) {
                kfree(item);
                dlm_put(dlm);
@@ -805,7 +863,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-       schedule_work(&dlm->dispatched_work);
+       queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
        dlm_put(dlm);
        return 0;
@@ -832,24 +890,22 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 
        if (dead_node != dlm->reco.dead_node ||
            reco_master != dlm->reco.new_master) {
-               /* show extra debug info if the recovery state is messed */
-               mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
-                    "request(dead=%u, master=%u)\n",
-                    dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
-                    dead_node, reco_master);
-               mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
-                    "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
-                    dlm->name, mres->lockname_len, mres->lockname, mres->master,
-                    mres->num_locks, mres->total_locks, mres->flags,
-                    dlm_get_lock_cookie_node(mres->ml[0].cookie),
-                    dlm_get_lock_cookie_seq(mres->ml[0].cookie),
-                    mres->ml[0].list, mres->ml[0].flags,
-                    mres->ml[0].type, mres->ml[0].convert_type,
-                    mres->ml[0].highest_blocked, mres->ml[0].node);
-               BUG();
+               /* worker could have been created before the recovery master
+                * died.  if so, do not continue, but do not error. */
+               if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+                       mlog(ML_NOTICE, "%s: will not send recovery state, "
+                            "recovery master %u died, thread=(dead=%u,mas=%u)"
+                            " current=(dead=%u,mas=%u)\n", dlm->name,
+                            reco_master, dead_node, reco_master,
+                            dlm->reco.dead_node, dlm->reco.new_master);
+               } else {
+                       mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+                            "master=%u), request(dead=%u, master=%u)\n",
+                            dlm->name, dlm->reco.dead_node,
+                            dlm->reco.new_master, dead_node, reco_master);
+               }
+               goto leave;
        }
-       BUG_ON(dead_node != dlm->reco.dead_node);
-       BUG_ON(reco_master != dlm->reco.new_master);
 
        /* lock resources should have already been moved to the
         * dlm->reco.resources list.  now move items from that list
@@ -889,7 +945,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
                             dlm->name, reco_master, dead_node, ret);
                }
        }
-
+leave:
        free_page((unsigned long)data);
 }
 
@@ -1265,8 +1321,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
                mlog(0, "all done flag.  all lockres data received!\n");
 
        ret = -ENOMEM;
-       buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
-       item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+       buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+       item = kcalloc(1, sizeof(*item), GFP_NOFS);
        if (!buf || !item)
                goto leave;
 
@@ -1357,7 +1413,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
        spin_lock(&dlm->work_lock);
        list_add_tail(&item->list, &dlm->work_list);
        spin_unlock(&dlm->work_lock);
-       schedule_work(&dlm->dispatched_work);
+       queue_work(dlm->dlm_worker, &dlm->dispatched_work);
 
 leave:
        dlm_put(dlm);
@@ -1431,8 +1487,9 @@ leave:
 
 
 
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
-                              struct dlm_lock_resource *res, u8 *real_master)
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+                                     struct dlm_lock_resource *res,
+                                     u8 *real_master)
 {
        struct dlm_node_iter iter;
        int nodenum;
@@ -2228,7 +2285,8 @@ again:
        memset(&lksb, 0, sizeof(lksb));
 
        ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
-                     DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+                     DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
+                     dlm_reco_ast, dlm, dlm_reco_bast);
 
        mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
             dlm->name, ret, lksb.status);
@@ -2314,6 +2372,10 @@ again:
                mlog(0, "%s: reco master %u is ready to recover %u\n",
                     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
                status = -EEXIST;
+       } else if (ret == DLM_RECOVERING) {
+               mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+                    dlm->name, dlm->node_num);
+               goto again;
        } else {
                struct dlm_lock_resource *res;
 
@@ -2530,6 +2592,7 @@ stage2:
                                mlog(ML_ERROR, "node %u went down after this "
                                     "node finished recovery.\n", nodenum);
                                ret = 0;
+                               continue;
                        }
                        break;
                }