ipvs: Fix possible deadlock in sync code

[safe/jmp/linux-2.6] / net / ipv4 / ipvs / ip_vs_core.c
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c

index 09cac38..a7879ea 100644 (file)
--- a/net/ipv4/ipvs/ip_vs_core.c
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -5,8 +5,6 @@
   *              high-performance and highly available server based on a
   *              cluster of servers.
   *
- * Version:     $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
- *
   * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
   *              Peter Kese <peter.kese@ijs.si>
   *              Julian Anastasov <ja@ssi.bg>
@@ -423,7 +421,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
            and the destination is RTN_UNICAST (and not local), then create
            a cache_bypass connection entry */
         if (sysctl_ip_vs_cache_bypass && svc->fwmark
-           && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+           && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
                 int ret, cs;
                 struct ip_vs_conn *cp;
  
@@ -481,19 +479,19 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
  
  
  /*
- *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
   *      chain, and is used for VS/NAT.
   *      It detects packets for VS/NAT connections and sends the packets
   *      immediately. This can avoid that iptable_nat mangles the packets
   *      for VS/NAT.
   */
  static unsigned int ip_vs_post_routing(unsigned int hooknum,
-                                      struct sk_buff **pskb,
+                                      struct sk_buff *skb,
                                        const struct net_device *in,
                                        const struct net_device *out,
                                        int (*okfn)(struct sk_buff *))
  {
-       if (!((*pskb)->ipvs_property))
+       if (!skb->ipvs_property)
                 return NF_ACCEPT;
         /* The packet was sent from IPVS, exit this chain */
         return NF_STOP;
@@ -569,9 +567,8 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
   *     Currently handles error types - unreachable, quench, ttl exceeded.
   *     (Only used in VS/NAT)
   */
-static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
  {
-       struct sk_buff *skb = *pskb;
         struct iphdr *iph;
         struct icmphdr  _icmph, *ic;
         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
@@ -638,7 +635,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
         verdict = NF_DROP;
  
         if (IP_VS_FWD_METHOD(cp) != 0) {
-               IP_VS_ERR("shouldn't reach here, because the box is on the"
+               IP_VS_ERR("shouldn't reach here, because the box is on the "
                           "half connection in the tun/dr module.\n");
         }
  
@@ -680,16 +677,15 @@ static inline int is_tcp_reset(const struct sk_buff *skb)
  }
  
  /*
- *     It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ *     It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
   *     Check if outgoing packet belongs to the established ip_vs_conn,
   *      rewrite addresses of the packet and send it on its way...
   */
  static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
           const struct net_device *in, const struct net_device *out,
           int (*okfn)(struct sk_buff *))
  {
-       struct sk_buff  *skb = *pskb;
         struct iphdr    *iph;
         struct ip_vs_protocol *pp;
         struct ip_vs_conn *cp;
@@ -702,11 +698,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
  
         iph = ip_hdr(skb);
         if (unlikely(iph->protocol == IPPROTO_ICMP)) {
-               int related, verdict = ip_vs_out_icmp(pskb, &related);
+               int related, verdict = ip_vs_out_icmp(skb, &related);
  
                 if (related)
                         return verdict;
-               skb = *pskb;
                 iph = ip_hdr(skb);
         }
  
@@ -765,9 +760,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
                 goto drop;
  
         /* mangle the packet */
-       if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
+       if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
                 goto drop;
-       skb = *pskb;
         ip_hdr(skb)->saddr = cp->vaddr;
         ip_send_check(ip_hdr(skb));
  
@@ -777,9 +771,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
          * if it came from this machine itself.  So re-compute
          * the routing information.
          */
-       if (ip_route_me_harder(pskb, RTN_LOCAL) != 0)
+       if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
                 goto drop;
-       skb = *pskb;
  
         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
  
@@ -794,7 +787,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
  
    drop:
         ip_vs_conn_put(cp);
-       kfree_skb(*pskb);
+       kfree_skb(skb);
         return NF_STOLEN;
  }
  
@@ -806,9 +799,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
   *     Currently handles error types - unreachable, quench, ttl exceeded.
   */
  static int
-ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
  {
-       struct sk_buff *skb = *pskb;
         struct iphdr *iph;
         struct icmphdr  _icmph, *ic;
         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
@@ -820,7 +812,7 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
  
         /* reassemble IP fragments */
         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
-               if (ip_vs_gather_frags(skb, hooknum == NF_IP_LOCAL_IN ?
+               if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
                         return NF_STOLEN;
         }
@@ -901,11 +893,10 @@ ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
   *     and send it on its way...
   */
  static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
          const struct net_device *in, const struct net_device *out,
          int (*okfn)(struct sk_buff *))
  {
-       struct sk_buff  *skb = *pskb;
         struct iphdr    *iph;
         struct ip_vs_protocol *pp;
         struct ip_vs_conn *cp;
@@ -927,11 +918,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
  
         iph = ip_hdr(skb);
         if (unlikely(iph->protocol == IPPROTO_ICMP)) {
-               int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
+               int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
  
                 if (related)
                         return verdict;
-               skb = *pskb;
                 iph = ip_hdr(skb);
         }
  
@@ -987,15 +977,24 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
                 ret = NF_ACCEPT;
         }
  
-       /* increase its packet counter and check if it is needed
-          to be synchronized */
+       /* Increase its packet counter and check if it is needed
+        * to be synchronized
+        *
+        * Sync connection if it is about to close to
+        * encourage the standby servers to update the connections timeout
+        */
         atomic_inc(&cp->in_pkts);
         if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
-           (cp->protocol != IPPROTO_TCP ||
-            cp->state == IP_VS_TCP_S_ESTABLISHED) &&
-           (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
-            == sysctl_ip_vs_sync_threshold[0]))
+           (((cp->protocol != IPPROTO_TCP ||
+              cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+             (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+              == sysctl_ip_vs_sync_threshold[0])) ||
+            ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+             ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+              (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
+              (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
                 ip_vs_sync_conn(cp);
+       cp->old_state = cp->state;
  
         ip_vs_conn_put(cp);
         return ret;
@@ -1003,65 +1002,64 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
  
  
  /*
- *     It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ *     It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
   *      related packets destined for 0.0.0.0/0.
   *      When fwmark-based virtual service is used, such as transparent
   *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
   *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- *      sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
   *      and send them to ip_vs_in_icmp.
   */
  static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
                    const struct net_device *in, const struct net_device *out,
                    int (*okfn)(struct sk_buff *))
  {
         int r;
  
-       if (ip_hdr(*pskb)->protocol != IPPROTO_ICMP)
+       if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
                 return NF_ACCEPT;
  
-       return ip_vs_in_icmp(pskb, &r, hooknum);
+       return ip_vs_in_icmp(skb, &r, hooknum);
  }
  
  
-/* After packet filtering, forward packet through VS/DR, VS/TUN,
-   or VS/NAT(change destination), so that filtering rules can be
-   applied to IPVS. */
-static struct nf_hook_ops ip_vs_in_ops = {
-       .hook           = ip_vs_in,
-       .owner          = THIS_MODULE,
-       .pf             = PF_INET,
-       .hooknum        = NF_IP_LOCAL_IN,
-       .priority       = 100,
-};
-
-/* After packet filtering, change source only for VS/NAT */
-static struct nf_hook_ops ip_vs_out_ops = {
-       .hook           = ip_vs_out,
-       .owner          = THIS_MODULE,
-       .pf             = PF_INET,
-       .hooknum        = NF_IP_FORWARD,
-       .priority       = 100,
-};
-
-/* After packet filtering (but before ip_vs_out_icmp), catch icmp
-   destined for 0.0.0.0/0, which is for incoming IPVS connections */
-static struct nf_hook_ops ip_vs_forward_icmp_ops = {
-       .hook           = ip_vs_forward_icmp,
-       .owner          = THIS_MODULE,
-       .pf             = PF_INET,
-       .hooknum        = NF_IP_FORWARD,
-       .priority       = 99,
-};
-
-/* Before the netfilter connection tracking, exit from POST_ROUTING */
-static struct nf_hook_ops ip_vs_post_routing_ops = {
-       .hook           = ip_vs_post_routing,
-       .owner          = THIS_MODULE,
-       .pf             = PF_INET,
-       .hooknum        = NF_IP_POST_ROUTING,
-       .priority       = NF_IP_PRI_NAT_SRC-1,
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+       /* After packet filtering, forward packet through VS/DR, VS/TUN,
+        * or VS/NAT(change destination), so that filtering rules can be
+        * applied to IPVS. */
+       {
+               .hook           = ip_vs_in,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_LOCAL_IN,
+               .priority       = 100,
+       },
+       /* After packet filtering, change source only for VS/NAT */
+       {
+               .hook           = ip_vs_out,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 100,
+       },
+       /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+        * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+       {
+               .hook           = ip_vs_forward_icmp,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_FORWARD,
+               .priority       = 99,
+       },
+       /* Before the netfilter connection tracking, exit from POST_ROUTING */
+       {
+               .hook           = ip_vs_post_routing,
+               .owner          = THIS_MODULE,
+               .pf             = PF_INET,
+               .hooknum        = NF_INET_POST_ROUTING,
+               .priority       = NF_IP_PRI_NAT_SRC-1,
+       },
  };
  
  
@@ -1092,37 +1090,15 @@ static int __init ip_vs_init(void)
                 goto cleanup_app;
         }
  
-       ret = nf_register_hook(&ip_vs_in_ops);
+       ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
         if (ret < 0) {
-               IP_VS_ERR("can't register in hook.\n");
+               IP_VS_ERR("can't register hooks.\n");
                 goto cleanup_conn;
         }
  
-       ret = nf_register_hook(&ip_vs_out_ops);
-       if (ret < 0) {
-               IP_VS_ERR("can't register out hook.\n");
-               goto cleanup_inops;
-       }
-       ret = nf_register_hook(&ip_vs_post_routing_ops);
-       if (ret < 0) {
-               IP_VS_ERR("can't register post_routing hook.\n");
-               goto cleanup_outops;
-       }
-       ret = nf_register_hook(&ip_vs_forward_icmp_ops);
-       if (ret < 0) {
-               IP_VS_ERR("can't register forward_icmp hook.\n");
-               goto cleanup_postroutingops;
-       }
-
         IP_VS_INFO("ipvs loaded.\n");
         return ret;
  
-  cleanup_postroutingops:
-       nf_unregister_hook(&ip_vs_post_routing_ops);
-  cleanup_outops:
-       nf_unregister_hook(&ip_vs_out_ops);
-  cleanup_inops:
-       nf_unregister_hook(&ip_vs_in_ops);
    cleanup_conn:
         ip_vs_conn_cleanup();
    cleanup_app:
@@ -1136,10 +1112,7 @@ static int __init ip_vs_init(void)
  
  static void __exit ip_vs_cleanup(void)
  {
-       nf_unregister_hook(&ip_vs_forward_icmp_ops);
-       nf_unregister_hook(&ip_vs_post_routing_ops);
-       nf_unregister_hook(&ip_vs_out_ops);
-       nf_unregister_hook(&ip_vs_in_ops);
+       nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
         ip_vs_conn_cleanup();
         ip_vs_app_cleanup();
         ip_vs_protocol_cleanup();