static void dlm_request_all_locks_worker(struct dlm_work_item *item,
void *data);
static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ u8 *real_master);
static u64 dlm_get_next_mig_cookie(void);
-static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(dlm_reco_state_lock);
+static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
static u64 dlm_mig_cookie = 1;
static u64 dlm_get_next_mig_cookie(void)
dlm->reco.new_master = master;
}
-static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
{
- spin_lock(&dlm->spinlock);
+ assert_spin_locked(&dlm->spinlock);
clear_bit(dlm->reco.dead_node, dlm->recovery_map);
dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
+
+static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
}
struct list_head *iter, *iter2;
struct dlm_work_item *item;
dlm_workfunc_t *workfunc;
+ int tot=0;
+
+ if (!dlm_joined(dlm))
+ return;
spin_lock(&dlm->work_lock);
list_splice_init(&dlm->work_list, &tmp_list);
spin_unlock(&dlm->work_lock);
list_for_each_safe(iter, iter2, &tmp_list) {
+ tot++;
+ }
+ mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
+ list_for_each_safe(iter, iter2, &tmp_list) {
item = list_entry(iter, struct dlm_work_item, list);
workfunc = item->func;
list_del_init(&item->list);
return dead;
}
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+ int recovered;
+ spin_lock(&dlm->spinlock);
+ recovered = !test_bit(node, dlm->recovery_map);
+ spin_unlock(&dlm->spinlock);
+ return recovered;
+}
+
+
int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
if (timeout) {
return 0;
}
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+ if (timeout) {
+ mlog(0, "%s: waiting %dms for notification of "
+ "recovery of node %u\n", dlm->name, timeout, node);
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ } else {
+ mlog(0, "%s: waiting indefinitely for notification "
+ "of recovery of node %u\n", dlm->name, node);
+ wait_event(dlm->dlm_reco_thread_wq,
+ dlm_is_node_recovered(dlm, node));
+ }
+ /* for now, return 0 */
+ return 0;
+}
+
/* callers of the top-level api calls (dlmlock/dlmunlock) should
* block on the dlm->reco.event when recovery is in progress.
* the dlm recovery thread will set this state when it begins
void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{
+ if (dlm_in_recovery(dlm)) {
+ mlog(0, "%s: reco thread %d in recovery: "
+ "state=%d, master=%u, dead=%u\n",
+ dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->reco.state, dlm->reco.new_master,
+ dlm->reco.dead_node);
+ }
wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
}
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
+ /* we should never hit this anymore */
mlog(ML_ERROR, "error %d remastering locks for node %u, "
"retrying.\n", status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
int destroy = 0;
int pass = 0;
- status = dlm_init_recovery_area(dlm, dead_node);
- if (status < 0)
- goto leave;
+ do {
+ /* we have become recovery master. there is no escaping
+ * this, so just keep trying until we get it. */
+ status = dlm_init_recovery_area(dlm, dead_node);
+ if (status < 0) {
+ mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+ "retrying\n", dlm->name);
+ msleep(1000);
+ }
+ } while (status != 0);
/* safe to access the node data list without a lock, since this
* process is the only one to change the list */
continue;
}
- status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
- if (status < 0) {
- mlog_errno(status);
- if (dlm_is_host_down(status))
- ndata->state = DLM_RECO_NODE_DATA_DEAD;
- else {
- destroy = 1;
- goto leave;
+ do {
+ status = dlm_request_all_locks(dlm, ndata->node_num,
+ dead_node);
+ if (status < 0) {
+ mlog_errno(status);
+ if (dlm_is_host_down(status)) {
+ /* node died, ignore it for recovery */
+ status = 0;
+ ndata->state = DLM_RECO_NODE_DATA_DEAD;
+ /* wait for the domain map to catch up
+ * with the network state. */
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm,
+ ndata->node_num),
+ msecs_to_jiffies(1000));
+ mlog(0, "waited 1 sec for %u, "
+ "dead? %s\n", ndata->node_num,
+ dlm_is_node_dead(dlm, ndata->node_num) ?
+ "yes" : "no");
+ } else {
+ /* -ENOMEM on the other node */
+ mlog(0, "%s: node %u returned "
+ "%d during recovery, retrying "
+ "after a short wait\n",
+ dlm->name, ndata->node_num,
+ status);
+ msleep(100);
+ }
}
- }
+ } while (status != 0);
switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT:
mlog(0, "node %u died after requesting "
"recovery info for node %u\n",
ndata->node_num, dead_node);
- // start all over
- destroy = 1;
- status = -EAGAIN;
- goto leave;
+ /* fine. don't need this node's info.
+ * continue without it. */
+ break;
case DLM_RECO_NODE_DATA_REQUESTING:
ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
mlog(0, "now receiving recovery data from "
BUG();
break;
case DLM_RECO_NODE_DATA_DEAD:
- mlog(ML_NOTICE, "node %u died after "
+ mlog(0, "node %u died after "
"requesting recovery info for "
"node %u\n", ndata->node_num,
dead_node);
- spin_unlock(&dlm_reco_state_lock);
- // start all over
- destroy = 1;
- status = -EAGAIN;
- /* instead of spinning like crazy here,
- * wait for the domain map to catch up
- * with the network state. otherwise this
- * can be hit hundreds of times before
- * the node is really seen as dead. */
- wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm,
- ndata->node_num),
- msecs_to_jiffies(1000));
- mlog(0, "waited 1 sec for %u, "
- "dead? %s\n", ndata->node_num,
- dlm_is_node_dead(dlm, ndata->node_num) ?
- "yes" : "no");
- goto leave;
+ break;
case DLM_RECO_NODE_DATA_RECEIVING:
case DLM_RECO_NODE_DATA_REQUESTED:
mlog(0, "%s: node %u still in state %s\n",
jiffies, dlm->reco.dead_node,
dlm->node_num, dlm->reco.new_master);
destroy = 1;
- status = ret;
+ status = 0;
/* rescan everything marked dirty along the way */
dlm_kick_thread(dlm, NULL);
break;
}
-leave:
if (destroy)
dlm_destroy_recovery_area(dlm, dead_node);
}
BUG_ON(num == dead_node);
- ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+ ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
if (!ndata) {
dlm_destroy_recovery_area(dlm, dead_node);
return -ENOMEM;
}
BUG_ON(lr->dead_node != dlm->reco.dead_node);
- item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+ item = kcalloc(1, sizeof(*item), GFP_NOFS);
if (!item) {
dlm_put(dlm);
return -ENOMEM;
}
/* this will get freed by dlm_request_all_locks_worker */
- buf = (char *) __get_free_page(GFP_KERNEL);
+ buf = (char *) __get_free_page(GFP_NOFS);
if (!buf) {
kfree(item);
dlm_put(dlm);
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
- schedule_work(&dlm->dispatched_work);
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
dlm_put(dlm);
return 0;
if (dead_node != dlm->reco.dead_node ||
reco_master != dlm->reco.new_master) {
- /* show extra debug info if the recovery state is messed */
- mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
- "request(dead=%u, master=%u)\n",
- dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
- dead_node, reco_master);
- mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
- "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
- dlm->name, mres->lockname_len, mres->lockname, mres->master,
- mres->num_locks, mres->total_locks, mres->flags,
- dlm_get_lock_cookie_node(mres->ml[0].cookie),
- dlm_get_lock_cookie_seq(mres->ml[0].cookie),
- mres->ml[0].list, mres->ml[0].flags,
- mres->ml[0].type, mres->ml[0].convert_type,
- mres->ml[0].highest_blocked, mres->ml[0].node);
- BUG();
+ /* worker could have been created before the recovery master
+ * died. if so, do not continue, but do not error. */
+ if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+ mlog(ML_NOTICE, "%s: will not send recovery state, "
+ "recovery master %u died, thread=(dead=%u,mas=%u)"
+ " current=(dead=%u,mas=%u)\n", dlm->name,
+ reco_master, dead_node, reco_master,
+ dlm->reco.dead_node, dlm->reco.new_master);
+ } else {
+ mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+ "master=%u), request(dead=%u, master=%u)\n",
+ dlm->name, dlm->reco.dead_node,
+ dlm->reco.new_master, dead_node, reco_master);
+ }
+ goto leave;
}
- BUG_ON(dead_node != dlm->reco.dead_node);
- BUG_ON(reco_master != dlm->reco.new_master);
/* lock resources should have already been moved to the
* dlm->reco.resources list. now move items from that list
dlm->name, reco_master, dead_node, ret);
}
}
-
+leave:
free_page((unsigned long)data);
}
mlog(0, "all done flag. all lockres data received!\n");
ret = -ENOMEM;
- buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
- item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+ buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+ item = kcalloc(1, sizeof(*item), GFP_NOFS);
if (!buf || !item)
goto leave;
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
- schedule_work(&dlm->dispatched_work);
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
leave:
dlm_put(dlm);
-int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
- struct dlm_lock_resource *res, u8 *real_master)
+static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res,
+ u8 *real_master)
{
struct dlm_node_iter iter;
int nodenum;
}
lksb->flags |= (ml->flags &
(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
-
+
+ if (ml->type == LKM_NLMODE)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
* lvb when node died. recreate the
* lksb with the updated lvb. */
memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+ /* the lock resource lvb update must happen
+ * NOW, before the spinlock is dropped.
+ * we no longer wait for the AST to update
+ * the lvb. */
+ memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
} else {
/* otherwise, the node is sending its
* most recent valid lvb info */
BUG_ON(ml->type != LKM_EXMODE &&
ml->type != LKM_PRMODE);
if (!dlm_lvb_is_empty(res->lvb) &&
- (ml->type == LKM_EXMODE ||
- memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
- mlog(ML_ERROR, "received bad lvb!\n");
- __dlm_print_one_lock_resource(res);
- BUG();
+ (ml->type == LKM_EXMODE ||
+ memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+ int i;
+ mlog(ML_ERROR, "%s:%.*s: received bad "
+ "lvb! type=%d\n", dlm->name,
+ res->lockname.len,
+ res->lockname.name, ml->type);
+ printk("lockres lvb=[");
+ for (i=0; i<DLM_LVB_LEN; i++)
+ printk("%02x", res->lvb[i]);
+ printk("]\nmigrated lvb=[");
+ for (i=0; i<DLM_LVB_LEN; i++)
+ printk("%02x", mres->lvb[i]);
+ printk("]\n");
+ dlm_print_one_lock_resource(res);
+ BUG();
}
memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
}
}
-
+skip_lvb:
/* NOTE:
* wrt lock queue ordering and recovery:
struct dlm_lock *lock;
res->state |= DLM_LOCK_RES_RECOVERING;
- if (!list_empty(&res->recovering))
+ if (!list_empty(&res->recovering)) {
+ mlog(0,
+ "Recovering res %s:%.*s, is already on recovery list!\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->recovering);
+ }
+ /* We need to hold a reference while on the recovery list */
+ dlm_lockres_get(res);
list_add_tail(&res->recovering, &dlm->reco.resources);
/* find any pending locks and put them back on proper list */
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, new_master);
res->state &= ~DLM_LOCK_RES_RECOVERING;
- __dlm_dirty_lockres(dlm, res);
+ if (!__dlm_lockres_unused(res))
+ __dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ dlm_lockres_put(res);
}
}
dlm->name, res->lockname.len,
res->lockname.name, res->owner);
list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, new_master);
res->state &= ~DLM_LOCK_RES_RECOVERING;
- __dlm_dirty_lockres(dlm, res);
+ if (!__dlm_lockres_unused(res))
+ __dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
}
{
assert_spin_locked(&dlm->spinlock);
+ if (dlm->reco.new_master == idx) {
+ mlog(0, "%s: recovery master %d just died\n",
+ dlm->name, idx);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ /* finalize1 was reached, so it is safe to clear
+ * the new_master and dead_node. that recovery
+ * is complete. */
+ mlog(0, "%s: dead master %d had reached "
+ "finalize1 state, clearing\n", dlm->name, idx);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
+ }
+ }
+
/* check to see if the node is already considered dead */
if (!test_bit(idx, dlm->live_nodes_map)) {
mlog(0, "for domain %s, node %d is already dead. "
memset(&lksb, 0, sizeof(lksb));
ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
- DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
+ DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
+ dlm_reco_ast, dlm, dlm_reco_bast);
mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
dlm->name, ret, lksb.status);
mlog(0, "%s: reco master %u is ready to recover %u\n",
dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
status = -EEXIST;
+ } else if (ret == DLM_RECOVERING) {
+ mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+ dlm->name, dlm->node_num);
+ goto again;
} else {
struct dlm_lock_resource *res;
* another ENOMEM */
msleep(100);
goto retry;
+ } else if (ret == EAGAIN) {
+ mlog(0, "%s: trying to start recovery of node "
+ "%u, but node %u is waiting for last recovery "
+ "to complete, backoff for a bit\n", dlm->name,
+ dead_node, nodenum);
+ /* TODO Look into replacing msleep with cond_resched() */
+ msleep(100);
+ goto retry;
}
}
if (!dlm_grab(dlm))
return 0;
+ spin_lock(&dlm->spinlock);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+ "but this node is in finalize state, waiting on finalize2\n",
+ dlm->name, br->node_idx, br->dead_node,
+ dlm->reco.dead_node, dlm->reco.new_master);
+ spin_unlock(&dlm->spinlock);
+ return EAGAIN;
+ }
+ spin_unlock(&dlm->spinlock);
+
mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
dlm->name, br->node_idx, br->dead_node,
dlm->reco.dead_node, dlm->reco.new_master);
return 0;
}
+#define DLM_FINALIZE_STAGE2 0x01
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
{
int ret = 0;
struct dlm_node_iter iter;
int nodenum;
int status;
+ int stage = 1;
- mlog(0, "finishing recovery for node %s:%u\n",
- dlm->name, dlm->reco.dead_node);
+ mlog(0, "finishing recovery for node %s:%u, "
+ "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
spin_lock(&dlm->spinlock);
dlm_node_iter_init(dlm->domain_map, &iter);
spin_unlock(&dlm->spinlock);
+stage2:
memset(&fr, 0, sizeof(fr));
fr.node_idx = dlm->node_num;
fr.dead_node = dlm->reco.dead_node;
+ if (stage == 2)
+ fr.flags |= DLM_FINALIZE_STAGE2;
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
if (nodenum == dlm->node_num)
continue;
ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
&fr, sizeof(fr), nodenum, &status);
- if (ret >= 0) {
+ if (ret >= 0)
ret = status;
+ if (ret < 0) {
+ mlog_errno(ret);
if (dlm_is_host_down(ret)) {
/* this has no effect on this recovery
* session, so set the status to zero to
mlog(ML_ERROR, "node %u went down after this "
"node finished recovery.\n", nodenum);
ret = 0;
+ continue;
}
- }
- if (ret < 0) {
- mlog_errno(ret);
break;
}
}
+ if (stage == 1) {
+ /* reset the node_iter back to the top and send finalize2 */
+ iter.curnode = -1;
+ stage = 2;
+ goto stage2;
+ }
return ret;
}
{
struct dlm_ctxt *dlm = data;
struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+ int stage = 1;
/* ok to return 0, domain has gone away */
if (!dlm_grab(dlm))
return 0;
- mlog(0, "%s: node %u finalizing recovery of node %u (%u:%u)\n",
- dlm->name, fr->node_idx, fr->dead_node,
- dlm->reco.dead_node, dlm->reco.new_master);
+ if (fr->flags & DLM_FINALIZE_STAGE2)
+ stage = 2;
+ mlog(0, "%s: node %u finalizing recovery stage%d of "
+ "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+ fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+
spin_lock(&dlm->spinlock);
if (dlm->reco.new_master != fr->node_idx) {
BUG();
}
- dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
- spin_unlock(&dlm->spinlock);
-
- dlm_reset_recovery(dlm);
+ switch (stage) {
+ case 1:
+ dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ mlog(ML_ERROR, "%s: received finalize1 from "
+ "new master %u for dead node %u, but "
+ "this node has already received it!\n",
+ dlm->name, fr->node_idx, fr->dead_node);
+ dlm_print_reco_node_status(dlm);
+ BUG();
+ }
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+ break;
+ case 2:
+ if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+ mlog(ML_ERROR, "%s: received finalize2 from "
+ "new master %u for dead node %u, but "
+ "this node did not have finalize1!\n",
+ dlm->name, fr->node_idx, fr->dead_node);
+ dlm_print_reco_node_status(dlm);
+ BUG();
+ }
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+ dlm_reset_recovery(dlm);
+ dlm_kick_recovery_thread(dlm);
+ break;
+ default:
+ BUG();
+ }
- dlm_kick_recovery_thread(dlm);
mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);