ipvs: ip_vs_wrr.c: use lib/gcd.c
[safe/jmp/linux-2.6] / net / netfilter / nf_conntrack_core.c
index 5276a2d..0e98c32 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
 #include <linux/vmalloc.h>
@@ -47,7 +48,7 @@
 
 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
                                      enum nf_nat_manip_type manip,
-                                     struct nlattr *attr) __read_mostly;
+                                     const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
 DEFINE_SPINLOCK(nf_conntrack_lock);
@@ -335,7 +336,8 @@ begin:
        h = __nf_conntrack_find(net, tuple);
        if (h) {
                ct = nf_ct_tuplehash_to_ctrack(h);
-               if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+               if (unlikely(nf_ct_is_dying(ct) ||
+                            !atomic_inc_not_zero(&ct->ct_general.use)))
                        h = NULL;
                else {
                        if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) {
@@ -425,7 +427,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        /* Remove from unconfirmed list */
        hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
 
-       __nf_conntrack_hash_insert(ct, hash, repl_hash);
        /* Timer relative to confirmation time, not original
           setting time, otherwise we'd get timer wrap in
           weird delay cases. */
@@ -433,8 +434,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
        add_timer(&ct->timeout);
        atomic_inc(&ct->ct_general.use);
        set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
+       /* Since the lookup is lockless, hash insertion must be done after
+        * starting the timer and setting the CONFIRMED bit. The RCU barriers
+        * guarantee that no other CPU can find the conntrack before the above
+        * stores are visible.
+        */
+       __nf_conntrack_hash_insert(ct, hash, repl_hash);
        NF_CT_STAT_INC(net, insert);
        spin_unlock_bh(&nf_conntrack_lock);
+
        help = nfct_help(ct);
        if (help && help->helper)
                nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -503,10 +512,17 @@ static noinline int early_drop(struct net *net, unsigned int hash)
                        cnt++;
                }
 
-               if (ct && unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
-                       ct = NULL;
-               if (ct || cnt >= NF_CT_EVICTION_RANGE)
+               if (ct != NULL) {
+                       if (likely(!nf_ct_is_dying(ct) &&
+                                  atomic_inc_not_zero(&ct->ct_general.use)))
+                               break;
+                       else
+                               ct = NULL;
+               }
+
+               if (cnt >= NF_CT_EVICTION_RANGE)
                        break;
+
                hash = (hash + 1) % nf_conntrack_htable_size;
        }
        rcu_read_unlock();
@@ -552,23 +568,38 @@ struct nf_conn *nf_conntrack_alloc(struct net *net,
                }
        }
 
-       ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp);
+       /*
+        * Do not use kmem_cache_zalloc(), as this cache uses
+        * SLAB_DESTROY_BY_RCU.
+        */
+       ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
        if (ct == NULL) {
                pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
                atomic_dec(&net->ct.count);
                return ERR_PTR(-ENOMEM);
        }
-
+       /*
+        * Leave ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
+        * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
+        */
+       memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
+              sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
        spin_lock_init(&ct->lock);
-       atomic_set(&ct->ct_general.use, 1);
        ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+       ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
        ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+       ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
        /* Don't set timer yet: wait for confirmation */
        setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 #ifdef CONFIG_NET_NS
        ct->ct_net = net;
 #endif
 
+       /*
+        * changes to lookup keys must be done before setting refcnt to 1
+        */
+       smp_wmb();
+       atomic_set(&ct->ct_general.use, 1);
        return ct;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
@@ -1065,14 +1096,14 @@ void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
 
-static void nf_ct_release_dying_list(void)
+static void nf_ct_release_dying_list(struct net *net)
 {
        struct nf_conntrack_tuple_hash *h;
        struct nf_conn *ct;
        struct hlist_nulls_node *n;
 
        spin_lock_bh(&nf_conntrack_lock);
-       hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+       hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
                ct = nf_ct_tuplehash_to_ctrack(h);
                /* never fails to remove them, no listeners at this point */
                nf_ct_kill(ct);
@@ -1091,7 +1122,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
 {
  i_see_dead_people:
        nf_ct_iterate_cleanup(net, kill_all, NULL);
-       nf_ct_release_dying_list();
+       nf_ct_release_dying_list(net);
        if (atomic_read(&net->ct.count) != 0) {
                schedule();
                goto i_see_dead_people;
@@ -1221,9 +1252,9 @@ static int nf_conntrack_init_init_net(void)
         * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
        if (!nf_conntrack_htable_size) {
                nf_conntrack_htable_size
-                       = (((num_physpages << PAGE_SHIFT) / 16384)
+                       = (((totalram_pages << PAGE_SHIFT) / 16384)
                           / sizeof(struct hlist_head));
-               if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+               if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
                        nf_conntrack_htable_size = 16384;
                if (nf_conntrack_htable_size < 32)
                        nf_conntrack_htable_size = 32;
@@ -1326,6 +1357,11 @@ err_stat:
        return ret;
 }
 
+s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+                       enum ip_conntrack_dir dir,
+                       u32 seq);
+EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
+
 int nf_conntrack_init(struct net *net)
 {
        int ret;
@@ -1343,6 +1379,9 @@ int nf_conntrack_init(struct net *net)
                /* For use by REJECT target */
                rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
                rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+
+               /* How to get NAT offsets */
+               rcu_assign_pointer(nf_ct_nat_offset, NULL);
        }
        return 0;