IPoIB: Fix loss of connectivity after bonding failover on both sides
authorYossi Etigin <yosefe@Voltaire.COM>
Fri, 9 Jan 2009 22:05:11 +0000 (14:05 -0800)
committerRoland Dreier <rolandd@cisco.com>
Fri, 9 Jan 2009 22:05:11 +0000 (14:05 -0800)
Fix bonding failover in the case both peers failover and the
gratuitous ARP is lost.  In that case, the sender side will create an
ipoib_neigh and issue a path request with the old GID first.  When
skb->dst->neighbour->ha changes due to ARP refresh, this ipoib_neigh
will not be added to the path->list of the path of the new GID,
because the ipoib_neigh already exists.  It will not have an AH
either, because of sender-side failover.  Therefore, it will not get
an AH when the path is resolved.

The solution here is to compare GIDs in ipoib_start_xmit() even if
neigh->ah is invalid.  Comparing with an uninitialized value of
neigh->dgid should be fine, since a spurious match is harmless (and
astronomically unlikely too).

Signed-off-by: Moni Shoua <monis@voltaire.com>
Signed-off-by: Yossi Etigin <yosefe@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/ulp/ipoib/ipoib_main.c

index 19e06bc..dce0443 100644 (file)
@@ -711,26 +711,26 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
                neigh = *to_ipoib_neigh(skb->dst->neighbour);
 
-               if (neigh->ah)
-                       if (unlikely((memcmp(&neigh->dgid.raw,
-                                           skb->dst->neighbour->ha + 4,
-                                           sizeof(union ib_gid))) ||
-                                        (neigh->dev != dev))) {
-                               spin_lock_irqsave(&priv->lock, flags);
-                               /*
-                                * It's safe to call ipoib_put_ah() inside
-                                * priv->lock here, because we know that
-                                * path->ah will always hold one more reference,
-                                * so ipoib_put_ah() will never do more than
-                                * decrement the ref count.
-                                */
+               if (unlikely((memcmp(&neigh->dgid.raw,
+                                    skb->dst->neighbour->ha + 4,
+                                    sizeof(union ib_gid))) ||
+                            (neigh->dev != dev))) {
+                       spin_lock_irqsave(&priv->lock, flags);
+                       /*
+                        * It's safe to call ipoib_put_ah() inside
+                        * priv->lock here, because we know that
+                        * path->ah will always hold one more reference,
+                        * so ipoib_put_ah() will never do more than
+                        * decrement the ref count.
+                        */
+                       if (neigh->ah)
                                ipoib_put_ah(neigh->ah);
-                               list_del(&neigh->list);
-                               ipoib_neigh_free(dev, neigh);
-                               spin_unlock_irqrestore(&priv->lock, flags);
-                               ipoib_path_lookup(skb, dev);
-                               return NETDEV_TX_OK;
-                       }
+                       list_del(&neigh->list);
+                       ipoib_neigh_free(dev, neigh);
+                       spin_unlock_irqrestore(&priv->lock, flags);
+                       ipoib_path_lookup(skb, dev);
+                       return NETDEV_TX_OK;
+               }
 
                if (ipoib_cm_get(neigh)) {
                        if (ipoib_cm_up(neigh)) {