X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fnetfilter%2Fnf_conntrack_core.c;h=471e2a79d26fad0712d2797e14126b3129b2640e;hb=b2a15a604d379af323645e330638e2cfcc696aff;hp=8020db6274b86471149533b40eecd89c496c24d5;hpb=01e6de64d9c8d0e75dca3bb4cf898db73abe00d4;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8020db6..471e2a7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include #include @@ -46,7 +48,7 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct, enum nf_nat_manip_type manip, - struct nlattr *attr) __read_mostly; + const struct nlattr *attr) __read_mostly; EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook); DEFINE_SPINLOCK(nf_conntrack_lock); @@ -182,10 +184,6 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); - if (!test_bit(IPS_DYING_BIT, &ct->status)) - nf_conntrack_event(IPCT_DESTROY, ct); - set_bit(IPS_DYING_BIT, &ct->status); - /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to nf_conntrack_lock!!! -HW */ @@ -219,27 +217,70 @@ destroy_conntrack(struct nf_conntrack *nfct) nf_conntrack_free(ct); } -static void death_by_timeout(unsigned long ul_conntrack) +void nf_ct_delete_from_lists(struct nf_conn *ct) { - struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); - struct nf_conn_help *help = nfct_help(ct); - struct nf_conntrack_helper *helper; - - if (help) { - rcu_read_lock(); - helper = rcu_dereference(help->helper); - if (helper && helper->destroy) - helper->destroy(ct); - rcu_read_unlock(); - } + nf_ct_helper_destroy(ct); spin_lock_bh(&nf_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ NF_CT_STAT_INC(net, delete_list); clean_from_lists(ct); spin_unlock_bh(&nf_conntrack_lock); +} +EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists); + +static void death_by_event(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); + + if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { + /* bad luck, let's retry again */ + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); + return; + } + /* we've got the event delivered, now it's dying */ + set_bit(IPS_DYING_BIT, &ct->status); + spin_lock(&nf_conntrack_lock); + hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + spin_unlock(&nf_conntrack_lock); + nf_ct_put(ct); +} + +void nf_ct_insert_dying_list(struct nf_conn *ct) +{ + struct net *net = nf_ct_net(ct); + + /* add this conntrack to the dying list */ + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, + &net->ct.dying); + spin_unlock_bh(&nf_conntrack_lock); + /* set a new timer to retry event delivery */ + setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); + ct->timeout.expires = jiffies + + (random32() % net->ct.sysctl_events_retry_timeout); + add_timer(&ct->timeout); +} +EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct nf_conn *ct = (void *)ul_conntrack; + + if (!test_bit(IPS_DYING_BIT, &ct->status) && + unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { + /* destroy event was not delivered */ + nf_ct_delete_from_lists(ct); + nf_ct_insert_dying_list(ct); + return; + } + set_bit(IPS_DYING_BIT, &ct->status); + nf_ct_delete_from_lists(ct); nf_ct_put(ct); } @@ -295,7 +336,8 @@ begin: h = __nf_conntrack_find(net, tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (unlikely(nf_ct_is_dying(ct) || + !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) { @@ -385,7 +427,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* Remove from unconfirmed list */ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); - __nf_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ @@ -393,16 +434,20 @@ __nf_conntrack_confirm(struct sk_buff *skb) add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); + + /* Since the lookup is lockless, hash insertion must be done after + * starting the timer and setting the CONFIRMED bit. The RCU barriers + * guarantee that no other CPU can find the conntrack before the above + * stores are visible. + */ + __nf_conntrack_hash_insert(ct, hash, repl_hash); NF_CT_STAT_INC(net, insert); spin_unlock_bh(&nf_conntrack_lock); + help = nfct_help(ct); if (help && help->helper) nf_conntrack_event_cache(IPCT_HELPER, ct); -#ifdef CONFIG_NF_NAT_NEEDED - if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || - test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_NATINFO, ct); -#endif + nf_conntrack_event_cache(master_ct(ct) ? IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; @@ -467,10 +512,17 @@ static noinline int early_drop(struct net *net, unsigned int hash) cnt++; } - if (ct && unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) - ct = NULL; - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct != NULL) { + if (likely(!nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use))) + break; + else + ct = NULL; + } + + if (cnt >= NF_CT_EVICTION_RANGE) break; + hash = (hash + 1) % nf_conntrack_htable_size; } rcu_read_unlock(); @@ -516,22 +568,38 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, } } - ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp); + /* + * Do not use kmem_cache_zalloc(), as this cache uses + * SLAB_DESTROY_BY_RCU. + */ + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); } - - atomic_set(&ct->ct_general.use, 1); + /* + * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next + * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. + */ + memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, + sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; + ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); #ifdef CONFIG_NET_NS ct->ct_net = net; #endif + /* + * changes to lookup keys must be done before setting refcnt to 1 + */ + smp_wmb(); + atomic_set(&ct->ct_general.use, 1); return ct; } EXPORT_SYMBOL_GPL(nf_conntrack_alloc); @@ -550,7 +618,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * -init_conntrack(struct net *net, +init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, @@ -560,6 +628,7 @@ init_conntrack(struct net *net, struct nf_conn *ct; struct nf_conn_help *help; struct nf_conntrack_tuple repl_tuple; + struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -581,6 +650,11 @@ init_conntrack(struct net *net, nf_ct_acct_ext_add(ct, GFP_ATOMIC); + ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; + nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, + ecache ? ecache->expmask : 0, + GFP_ATOMIC); + spin_lock_bh(&nf_conntrack_lock); exp = nf_ct_find_expectation(net, tuple); if (exp) { @@ -604,7 +678,7 @@ init_conntrack(struct net *net, nf_conntrack_get(&ct->master->ct_general); NF_CT_STAT_INC(net, expect_new); } else { - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); NF_CT_STAT_INC(net, new); } @@ -625,7 +699,7 @@ init_conntrack(struct net *net, /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct net *net, +resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, @@ -649,7 +723,8 @@ resolve_normal_ct(struct net *net, /* look for tuple match */ h = nf_conntrack_find_get(net, &tuple); if (!h) { - h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); + h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, + skb, dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -686,7 +761,7 @@ unsigned int nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { - struct nf_conn *ct; + struct nf_conn *ct, *tmpl = NULL; enum ip_conntrack_info ctinfo; struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -695,10 +770,14 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, int set_reply = 0; int ret; - /* Previously seen (loopback or untracked)? Ignore. */ if (skb->nfct) { - NF_CT_STAT_INC_ATOMIC(net, ignore); - return NF_ACCEPT; + /* Previously seen (loopback or untracked)? Ignore. */ + tmpl = (struct nf_conn *)skb->nfct; + if (!nf_ct_is_template(tmpl)) { + NF_CT_STAT_INC_ATOMIC(net, ignore); + return NF_ACCEPT; + } + skb->nfct = NULL; } /* rcu_read_lock()ed by nf_hook_slow */ @@ -709,7 +788,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, pr_debug("not prepared to track yet or error occured\n"); NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } l4proto = __nf_ct_l4proto_find(pf, protonum); @@ -722,22 +802,25 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, if (ret <= 0) { NF_CT_STAT_INC_ATOMIC(net, error); NF_CT_STAT_INC_ATOMIC(net, invalid); - return -ret; + ret = -ret; + goto out; } } - ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, + ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(net, invalid); - return NF_ACCEPT; + ret = NF_ACCEPT; + goto out; } if (IS_ERR(ct)) { /* Too stressed to deal. */ NF_CT_STAT_INC_ATOMIC(net, drop); - return NF_DROP; + ret = NF_DROP; + goto out; } NF_CT_ASSERT(skb->nfct); @@ -752,11 +835,15 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, NF_CT_STAT_INC_ATOMIC(net, invalid); if (ret == -NF_DROP) NF_CT_STAT_INC_ATOMIC(net, drop); - return -ret; + ret = -ret; + goto out; } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, ct); + nf_conntrack_event_cache(IPCT_REPLY, ct); +out: + if (tmpl) + nf_ct_put(tmpl); return ret; } @@ -795,7 +882,7 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, return; rcu_read_lock(); - __nf_ct_try_assign_helper(ct, GFP_ATOMIC); + __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -807,13 +894,9 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, unsigned long extra_jiffies, int do_acct) { - int event = 0; - NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); - spin_lock_bh(&nf_conntrack_lock); - /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; @@ -821,19 +904,14 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, /* If not in hash table, timer will not be active yet */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; - event = IPCT_REFRESH; } else { unsigned long newtime = jiffies + extra_jiffies; /* Only update the timeout if the new timeout is at least HZ jiffies from the old timeout. Need del_timer for race avoidance (may already be dying). */ - if (newtime - ct->timeout.expires >= HZ - && del_timer(&ct->timeout)) { - ct->timeout.expires = newtime; - add_timer(&ct->timeout); - event = IPCT_REFRESH; - } + if (newtime - ct->timeout.expires >= HZ) + mod_timer_pending(&ct->timeout, newtime); } acct: @@ -842,17 +920,13 @@ acct: acct = nf_conn_acct_find(ct); if (acct) { + spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); + spin_unlock_bh(&ct->lock); } } - - spin_unlock_bh(&nf_conntrack_lock); - - /* must be unlocked when calling event cache */ - if (event) - nf_conntrack_event_cache(event, ct); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -864,14 +938,14 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, if (do_acct) { struct nf_conn_counter *acct; - spin_lock_bh(&nf_conntrack_lock); acct = nf_conn_acct_find(ct); if (acct) { + spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); + spin_unlock_bh(&ct->lock); } - spin_unlock_bh(&nf_conntrack_lock); } if (del_timer(&ct->timeout)) { @@ -1001,15 +1075,22 @@ struct __nf_ct_flush_report { int report; }; -static int kill_all(struct nf_conn *i, void *data) +static int kill_report(struct nf_conn *i, void *data) { struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; - /* get_next_corpse sets the dying bit for us */ - nf_conntrack_event_report(IPCT_DESTROY, - i, - fr->pid, - fr->report); + /* If we fail to deliver the event, death_by_timeout() will retry */ + if (nf_conntrack_event_report(IPCT_DESTROY, i, + fr->pid, fr->report) < 0) + return 1; + + /* Avoid the delivery of the destroy event in death_by_timeout(). */ + set_bit(IPS_DYING_BIT, &i->status); + return 1; +} + +static int kill_all(struct nf_conn *i, void *data) +{ return 1; } @@ -1023,15 +1104,30 @@ void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 pid, int report) { struct __nf_ct_flush_report fr = { .pid = pid, .report = report, }; - nf_ct_iterate_cleanup(net, kill_all, &fr); + nf_ct_iterate_cleanup(net, kill_report, &fr); +} +EXPORT_SYMBOL_GPL(nf_conntrack_flush_report); + +static void nf_ct_release_dying_list(struct net *net) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct hlist_nulls_node *n; + + spin_lock_bh(&nf_conntrack_lock); + hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* never fails to remove them, no listeners at this point */ + nf_ct_kill(ct); + } + spin_unlock_bh(&nf_conntrack_lock); } -EXPORT_SYMBOL_GPL(nf_conntrack_flush); static void nf_conntrack_cleanup_init_net(void) { @@ -1042,10 +1138,9 @@ static void nf_conntrack_cleanup_init_net(void) static void nf_conntrack_cleanup_net(struct net *net) { - nf_ct_event_cache_flush(net); - nf_conntrack_ecache_fini(net); i_see_dead_people: - nf_conntrack_flush(net, 0, 0); + nf_ct_iterate_cleanup(net, kill_all, NULL); + nf_ct_release_dying_list(net); if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; @@ -1056,6 +1151,7 @@ static void nf_conntrack_cleanup_net(struct net *net) nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); + nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); free_percpu(net->ct.stat); @@ -1174,9 +1270,9 @@ static int nf_conntrack_init_init_net(void) * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ if (!nf_conntrack_htable_size) { nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((totalram_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; @@ -1220,20 +1316,24 @@ err_cache: return ret; } +/* + * We need to use special "null" values, not used in hash table + */ +#define UNCONFIRMED_NULLS_VAL ((1<<30)+0) +#define DYING_NULLS_VAL ((1<<30)+1) + static int nf_conntrack_init_net(struct net *net) { int ret; atomic_set(&net->ct.count, 0); - INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0); + INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL); + INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL); net->ct.stat = alloc_percpu(struct ip_conntrack_stat); if (!net->ct.stat) { ret = -ENOMEM; goto err_stat; } - ret = nf_conntrack_ecache_init(net); - if (ret < 0) - goto err_ecache; net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &net->ct.hash_vmalloc, 1); if (!net->ct.hash) { @@ -1247,6 +1347,9 @@ static int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_acct_init(net); if (ret < 0) goto err_acct; + ret = nf_conntrack_ecache_init(net); + if (ret < 0) + goto err_ecache; /* Set up fake conntrack: - to never be deleted, not in any hashes */ @@ -1259,19 +1362,24 @@ static int nf_conntrack_init_net(struct net *net) return 0; +err_ecache: + nf_conntrack_acct_fini(net); err_acct: nf_conntrack_expect_fini(net); err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_hash: - nf_conntrack_ecache_fini(net); -err_ecache: free_percpu(net->ct.stat); err_stat: return ret; } +s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq); +EXPORT_SYMBOL_GPL(nf_ct_nat_offset); + int nf_conntrack_init(struct net *net) { int ret; @@ -1289,6 +1397,9 @@ int nf_conntrack_init(struct net *net) /* For use by REJECT target */ rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + + /* Howto get NAT offsets */ + rcu_assign_pointer(nf_ct_nat_offset, NULL); } return 0;