From d255e5ff5fc6cc6c60dd014d1261448a7bbc8134 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 May 2010 09:45:45 +0200 Subject: [PATCH] drbd: fix hang on local read errors while disconnected "Canceled" w_read_retry_remote work items never completed if they were canceled after the drbd_disconnect connection teardown cleanup had already run (or while we were not connected anyway). Fixed by not queueing a remote retry if we already know it won't work (pdsk is not D_UP_TO_DATE), and by cleaning up ourselves on "cancel", in case we hit a race with drbd_disconnect. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 22 +++++++++++++--------- drivers/block/drbd/drbd_req.h | 1 + drivers/block/drbd/drbd_worker.c | 6 ++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index e6c4d57..8915644 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -452,20 +452,21 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", (unsigned long long)req->sector, req->size); - /* _req_mod(req,to_be_send); oops, recursion... */ D_ASSERT(!(req->rq_state & RQ_NET_MASK)); - req->rq_state |= RQ_NET_PENDING; - inc_ap_pending(mdev); __drbd_chk_io_error(mdev, FALSE); put_ldev(mdev); - /* NOTE: if we have no connection, - * or know the peer has no good data either, - * then we don't actually need to "queue_for_net_read", - * but we do so anyways, since the drbd_io_error() - * and the potential state change to "Diskless" - * needs to be done from process context */ + /* no point in retrying if there is no good remote data, + * or we have no connection. */ + if (mdev->state.pdsk != D_UP_TO_DATE) { + _req_may_be_done(req, m); + break; + } + + /* _req_mod(req,to_be_send); oops, recursion... 
*/ + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); /* fall through: _req_mod(req,queue_for_net_read); */ case queue_for_net_read: @@ -575,6 +576,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); break; + case read_retry_remote_canceled: + req->rq_state &= ~RQ_NET_QUEUED; + /* fall through, in case we raced with drbd_disconnect */ case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? */ diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 16119d7..02d575d 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -91,6 +91,7 @@ enum drbd_req_event { send_failed, handed_over_to_network, connection_lost_while_pending, + read_retry_remote_canceled, recv_acked_by_peer, write_acked_by_peer, write_acked_by_peer_and_sis, /* and set_in_sync */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a12b447..67371fc 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -266,10 +266,8 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * to give the disk the chance to relocate that block */ spin_lock_irq(&mdev->req_lock); - if (cancel || - mdev->state.conn < C_CONNECTED || - mdev->state.pdsk <= D_INCONSISTENT) { - _req_mod(req, send_canceled); + if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { + _req_mod(req, read_retry_remote_canceled); spin_unlock_irq(&mdev->req_lock); dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); return 1; -- 1.8.2.3