diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 5c3f16e..a297da7 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -50,7 +50,7 @@
 #include
 #include

-#define IP_CONNTRACK_VERSION	"2.3"
+#define IP_CONNTRACK_VERSION	"2.4"

 #if 0
 #define DEBUGP printk
@@ -70,18 +70,18 @@ static LIST_HEAD(helpers);
 unsigned int ip_conntrack_htable_size = 0;
 int ip_conntrack_max;
 struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
 struct ip_conntrack ip_conntrack_untracked;
 unsigned int ip_ct_log_invalid;
 static LIST_HEAD(unconfirmed);
 static int ip_conntrack_vmalloc;
-static unsigned int ip_conntrack_next_id = 1;
-static unsigned int ip_conntrack_expect_next_id = 1;
+static unsigned int ip_conntrack_next_id;
+static unsigned int ip_conntrack_expect_next_id;
 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-struct notifier_block *ip_conntrack_chain;
-struct notifier_block *ip_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);

 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);

@@ -92,7 +92,7 @@ __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
 {
         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
-                notifier_call_chain(&ip_conntrack_chain, ecache->events,
+                atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
                                     ecache->ct);
         ecache->events = 0;
         ip_conntrack_put(ecache->ct);
@@ -133,7 +133,7 @@ static void ip_ct_event_cache_flush(void)
         struct ip_conntrack_ecache *ecache;
         int cpu;

-        for_each_cpu(cpu) {
+        for_each_possible_cpu(cpu) {
                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
                 if (ecache->ct)
                         ip_conntrack_put(ecache->ct);
@@ -148,16 +148,20 @@ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
 static int ip_conntrack_hash_rnd_initted;
 static unsigned int ip_conntrack_hash_rnd;

-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+                                  unsigned int size, unsigned int rnd)
 {
-#if 0
-        dump_tuple(tuple);
-#endif
         return (jhash_3words(tuple->src.ip,
                              (tuple->dst.ip ^ tuple->dst.protonum),
                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
-                             ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+                             rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+        return __hash_conntrack(tuple, ip_conntrack_htable_size,
+                                ip_conntrack_hash_rnd);
 }

 int
@@ -197,18 +201,13 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,

 /* ip_conntrack_expect helper functions */

-static void unlink_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
 {
         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
         IP_NF_ASSERT(!timer_pending(&exp->timeout));
         list_del(&exp->list);
         CONNTRACK_STAT_INC(expect_delete);
         exp->master->expecting--;
-}
-
-void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
-{
-        unlink_expect(exp);
         ip_conntrack_expect_put(exp);
 }
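The hunk above splits the hash computation in two: __hash_conntrack() takes the table size and the random seed as explicit parameters, while hash_conntrack() keeps the old single-argument interface over the globals. That is what makes the runtime resize added later in this patch possible: entries can be re-bucketed against any size with a fresh seed. A user-space sketch of that property, where mix3() is a hypothetical stand-in for the kernel's jhash_3words():

#include <stdint.h>
#include <stdio.h>

struct tuple { uint32_t src_ip, dst_ip; uint16_t sport, dport; uint8_t proto; };

/* Toy 3-word mixer standing in for jhash_3words(); not the kernel's hash. */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
        uint32_t h = seed;
        h = (h ^ a) * 0x9e3779b1u;
        h = (h ^ b) * 0x85ebca77u;
        h = (h ^ c) * 0xc2b2ae3du;
        return h ^ (h >> 16);
}

/* Mirrors __hash_conntrack(): the bucket depends only on tuple, size, seed. */
static uint32_t hash_tuple(const struct tuple *t, unsigned size, uint32_t rnd)
{
        return mix3(t->src_ip, t->dst_ip ^ t->proto,
                    t->sport | ((uint32_t)t->dport << 16), rnd) % size;
}

int main(void)
{
        struct tuple t = { 0x0a000001, 0x0a000002, 12345, 80, 6 };
        /* The same tuple re-buckets cleanly under a new size and seed. */
        printf("%u %u\n", (unsigned)hash_tuple(&t, 8192, 1),
                          (unsigned)hash_tuple(&t, 16384, 2));
        return 0;
}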
@@ -217,7 +216,7 @@ static void expectation_timed_out(unsigned long ul_expect)
         struct ip_conntrack_expect *exp = (void *)ul_expect;

         write_lock_bh(&ip_conntrack_lock);
-        unlink_expect(exp);
+        ip_ct_unlink_expect(exp);
         write_unlock_bh(&ip_conntrack_lock);
         ip_conntrack_expect_put(exp);
 }
@@ -238,7 +237,7 @@ __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)

 /* Just find an expectation corresponding to a tuple. */
 struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
 {
         struct ip_conntrack_expect *i;

@@ -263,10 +262,14 @@ find_expectation(const struct ip_conntrack_tuple *tuple)
            master ct never got confirmed, we'd hold a reference to it and
            weird things would happen to future packets). */
         if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
-            && is_confirmed(i->master)
-            && del_timer(&i->timeout)) {
-                unlink_expect(i);
-                return i;
+            && is_confirmed(i->master)) {
+                if (i->flags & IP_CT_EXPECT_PERMANENT) {
+                        atomic_inc(&i->use);
+                        return i;
+                } else if (del_timer(&i->timeout)) {
+                        ip_ct_unlink_expect(i);
+                        return i;
+                }
                 }
         }
         return NULL;
@@ -283,7 +286,7 @@ void ip_ct_remove_expectations(struct ip_conntrack *ct)

         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
                 if (i->master == ct && del_timer(&i->timeout)) {
-                        unlink_expect(i);
+                        ip_ct_unlink_expect(i);
                         ip_conntrack_expect_put(i);
                 }
         }
@@ -316,6 +319,7 @@ destroy_conntrack(struct nf_conntrack *nfct)
         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
         IP_NF_ASSERT(!timer_pending(&ct->timeout));

+        ip_conntrack_event(IPCT_DESTROY, ct);
         set_bit(IPS_DYING_BIT, &ct->status);

         /* To make sure we don't get any weird locking issues here:
@@ -355,7 +359,6 @@ static void death_by_timeout(unsigned long ul_conntrack)
 {
         struct ip_conntrack *ct = (void *)ul_conntrack;

-        ip_conntrack_event(IPCT_DESTROY, ct);
         write_lock_bh(&ip_conntrack_lock);
         /* Inside lock so preempt is disabled on module removal path.
          * Otherwise we can get spurious warnings. */
@@ -654,7 +657,7 @@ struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
         if (!conntrack) {
                 DEBUGP("Can't allocate conntrack.\n");
-                return NULL;
+                return ERR_PTR(-ENOMEM);
         }

         memset(conntrack, 0, sizeof(*conntrack));
@@ -695,8 +698,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple,
                 return NULL;
         }

-        if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
-                return NULL;
+        conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+        if (conntrack == NULL || IS_ERR(conntrack))
+                return (struct ip_conntrack_tuple_hash *)conntrack;

         if (!protocol->new(conntrack, skb)) {
                 ip_conntrack_free(conntrack);
@@ -923,7 +927,7 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
         /* choose the oldest expectation to evict */
         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
-                        unlink_expect(i);
+                        ip_ct_unlink_expect(i);
                         write_unlock_bh(&ip_conntrack_lock);
                         ip_conntrack_expect_put(i);
                         return;
@@ -932,6 +936,9 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
         write_unlock_bh(&ip_conntrack_lock);
 }

+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
 {
         struct ip_conntrack_expect *new;
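Note the error-reporting change above: ip_conntrack_alloc() now reports allocation failure as ERR_PTR(-ENOMEM) instead of NULL, and init_conntrack() propagates whatever it received after an IS_ERR() check, so callers can tell "no memory" apart from other failures. A minimal user-space sketch of the idiom, with constants mirroring include/linux/err.h (alloc_thing() is purely illustrative):

#include <stdio.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static int thing;       /* stand-in object for the success case */

/* Returns a valid pointer, or a negative errno encoded in the pointer. */
static void *alloc_thing(int fail)
{
        if (fail)
                return ERR_PTR(-12);    /* -ENOMEM */
        return &thing;
}

int main(void)
{
        void *p = alloc_thing(1);
        if (IS_ERR(p))
                printf("alloc failed, errno %ld\n", -PTR_ERR(p));
        return 0;
}

The trick works because the top page of the kernel address space is never a valid allocation, so pointer values in [-MAX_ERRNO, -1] are unambiguous error codes.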
@@ -942,17 +949,14 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
                 return NULL;
         }
         new->master = me;
-        atomic_inc(&new->master->ct_general.use);
         atomic_set(&new->use, 1);
         return new;
 }

 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
 {
-        if (atomic_dec_and_test(&exp->use)) {
-                ip_conntrack_put(exp->master);
+        if (atomic_dec_and_test(&exp->use))
                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
-        }
 }

 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
@@ -980,7 +984,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
                 if (i->master == master) {
                         if (del_timer(&i->timeout)) {
-                                unlink_expect(i);
+                                ip_ct_unlink_expect(i);
                                 ip_conntrack_expect_put(i);
                         }
                         break;
@@ -1097,7 +1101,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
         /* Get rid of expectations */
         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
-                        unlink_expect(exp);
+                        ip_ct_unlink_expect(exp);
                         ip_conntrack_expect_put(exp);
                 }
         }
@@ -1112,42 +1116,49 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
         synchronize_net();
 }

-static inline void ct_add_counters(struct ip_conntrack *ct,
-                                   enum ip_conntrack_info ctinfo,
-                                   const struct sk_buff *skb)
-{
-#ifdef CONFIG_IP_NF_CT_ACCT
-        if (skb) {
-                ct->counters[CTINFO2DIR(ctinfo)].packets++;
-                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
-                        ntohs(skb->nh.iph->tot_len);
-        }
-#endif
-}
-
-/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
-void ip_ct_refresh_acct(struct ip_conntrack *ct,
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct,
                         enum ip_conntrack_info ctinfo,
                         const struct sk_buff *skb,
-                        unsigned long extra_jiffies)
+                        unsigned long extra_jiffies,
+                        int do_acct)
 {
+        int event = 0;
+
         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+        IP_NF_ASSERT(skb);
+
+        write_lock_bh(&ip_conntrack_lock);

         /* If not in hash table, timer will not be active yet */
         if (!is_confirmed(ct)) {
                 ct->timeout.expires = extra_jiffies;
-                ct_add_counters(ct, ctinfo, skb);
+                event = IPCT_REFRESH;
         } else {
-                write_lock_bh(&ip_conntrack_lock);
                 /* Need del_timer for race avoidance (may already be dying). */
                 if (del_timer(&ct->timeout)) {
                         ct->timeout.expires = jiffies + extra_jiffies;
                         add_timer(&ct->timeout);
-                        ip_conntrack_event_cache(IPCT_REFRESH, skb);
+                        event = IPCT_REFRESH;
                 }
-                ct_add_counters(ct, ctinfo, skb);
-                write_unlock_bh(&ip_conntrack_lock);
         }
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+        if (do_acct) {
+                ct->counters[CTINFO2DIR(ctinfo)].packets++;
+                ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+                        ntohs(skb->nh.iph->tot_len);
+                if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+                    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+                        event |= IPCT_COUNTER_FILLING;
+        }
+#endif
+
+        write_unlock_bh(&ip_conntrack_lock);
+
+        /* must be unlocked when calling event cache */
+        if (event)
+                ip_conntrack_event_cache(event, skb);
 }

 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
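The two expectation hunks above drop the expectation's hold on its master conntrack: ip_conntrack_expect_alloc() no longer takes a reference and ip_conntrack_expect_put() no longer releases one, relying instead on expectations always dying before their master (per the comment added earlier). The put side still follows the usual last-reference-frees pattern; a user-space sketch with C11 atomics (expect_get()/expect_put() are illustrative names, not the kernel API):

#include <stdatomic.h>
#include <stdlib.h>

struct expect {
        atomic_int use;         /* reference count, starts at 1 */
};

static struct expect *expect_alloc(void)
{
        struct expect *e = malloc(sizeof(*e));
        if (e)
                atomic_init(&e->use, 1);
        return e;
}

static void expect_get(struct expect *e)
{
        atomic_fetch_add(&e->use, 1);
}

static void expect_put(struct expect *e)
{
        /* fetch_sub returns the old value; 1 means we were the last user */
        if (atomic_fetch_sub(&e->use, 1) == 1)
                free(e);
}

int main(void)
{
        struct expect *e = expect_alloc();
        expect_get(e);          /* a second user, like the PERMANENT path */
        expect_put(e);
        expect_put(e);          /* last put frees the object */
        return 0;
}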
@@ -1307,6 +1318,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
                         .tuple.dst.u.tcp.port;
                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
                         .tuple.dst.ip;
+                memset(sin.sin_zero, 0, sizeof(sin.sin_zero));

                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
@@ -1334,18 +1346,26 @@ static int kill_all(struct ip_conntrack *i, void *data)
         return 1;
 }

-static void free_conntrack_hash(void)
+void ip_conntrack_flush(void)
 {
-        if (ip_conntrack_vmalloc)
-                vfree(ip_conntrack_hash);
+        ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
+{
+        if (vmalloced)
+                vfree(hash);
         else
-                free_pages((unsigned long)ip_conntrack_hash,
-                           get_order(sizeof(struct list_head)
-                                     * ip_conntrack_htable_size));
+                free_pages((unsigned long)hash,
+                           get_order(sizeof(struct list_head) * size));
 }

-void ip_conntrack_flush()
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
 {
+        ip_ct_attach = NULL;
+
         /* This makes sure all current packets have passed through
            netfilter framework.  Roll on, two-stage module
            delete... */
@@ -1353,7 +1373,7 @@ void ip_conntrack_flush()
         ip_ct_event_cache_flush();
  i_see_dead_people:
-        ip_ct_iterate_cleanup(kill_all, NULL);
+        ip_conntrack_flush();
         if (atomic_read(&ip_conntrack_count) != 0) {
                 schedule();
                 goto i_see_dead_people;
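The memset added to getorigdst() matters because struct sockaddr_in carries an 8-byte padding field, sin_zero, that is copied back to userspace verbatim; left unwritten, it would leak stale kernel stack bytes to the caller. A user-space illustration of the same fill pattern (fill_origdst() and the addresses are illustrative):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

static void fill_origdst(struct sockaddr_in *sin)
{
        sin->sin_family = AF_INET;
        sin->sin_port = htons(80);
        sin->sin_addr.s_addr = htonl(0x0a000001);       /* 10.0.0.1 */
        /* Without this, sin_zero keeps whatever was on the stack. */
        memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
}

int main(void)
{
        struct sockaddr_in sin;         /* deliberately left uninitialized */
        fill_origdst(&sin);
        /* All padding bytes are now guaranteed to be zero. */
        for (unsigned i = 0; i < sizeof(sin.sin_zero); i++)
                printf("%02x", (unsigned char)sin.sin_zero[i]);
        printf("\n");
        return 0;
}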
@@ -1361,22 +1381,86 @@ void ip_conntrack_flush()
         /* wait until all references to ip_conntrack_untracked are dropped */
         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
                 schedule();
-}

-/* Mishearing the voices in his head, our hero wonders how he's
-   supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
-        ip_ct_attach = NULL;
-        ip_conntrack_flush();
         kmem_cache_destroy(ip_conntrack_cachep);
         kmem_cache_destroy(ip_conntrack_expect_cachep);
-        free_conntrack_hash();
+        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+                            ip_conntrack_htable_size);
         nf_unregister_sockopt(&so_getorigdst);
 }

-static int hashsize;
-module_param(hashsize, int, 0400);
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+        struct list_head *hash;
+        unsigned int i;
+
+        *vmalloced = 0;
+        hash = (void*)__get_free_pages(GFP_KERNEL,
+                                       get_order(sizeof(struct list_head)
+                                                 * size));
+        if (!hash) {
+                *vmalloced = 1;
+                printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+                hash = vmalloc(sizeof(struct list_head) * size);
+        }
+
+        if (hash)
+                for (i = 0; i < size; i++)
+                        INIT_LIST_HEAD(&hash[i]);
+
+        return hash;
+}
+
+static int set_hashsize(const char *val, struct kernel_param *kp)
+{
+        int i, bucket, hashsize, vmalloced;
+        int old_vmalloced, old_size;
+        int rnd;
+        struct list_head *hash, *old_hash;
+        struct ip_conntrack_tuple_hash *h;
+
+        /* On boot, we can set this without any fancy locking. */
+        if (!ip_conntrack_htable_size)
+                return param_set_int(val, kp);
+
+        hashsize = simple_strtol(val, NULL, 0);
+        if (!hashsize)
+                return -EINVAL;
+
+        hash = alloc_hashtable(hashsize, &vmalloced);
+        if (!hash)
+                return -ENOMEM;
+
+        /* We have to rehash for the new table anyway, so we also can
+         * use a new random seed */
+        get_random_bytes(&rnd, 4);
+
+        write_lock_bh(&ip_conntrack_lock);
+        for (i = 0; i < ip_conntrack_htable_size; i++) {
+                while (!list_empty(&ip_conntrack_hash[i])) {
+                        h = list_entry(ip_conntrack_hash[i].next,
+                                       struct ip_conntrack_tuple_hash, list);
+                        list_del(&h->list);
+                        bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+                        list_add_tail(&h->list, &hash[bucket]);
+                }
+        }
+        old_size = ip_conntrack_htable_size;
+        old_vmalloced = ip_conntrack_vmalloc;
+        old_hash = ip_conntrack_hash;
+
+        ip_conntrack_htable_size = hashsize;
+        ip_conntrack_vmalloc = vmalloced;
+        ip_conntrack_hash = hash;
+        ip_conntrack_hash_rnd = rnd;
+        write_unlock_bh(&ip_conntrack_lock);
+
+        free_conntrack_hash(old_hash, old_vmalloced, old_size);
+        return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+                  &ip_conntrack_htable_size, 0600);

 int __init ip_conntrack_init(void)
 {
@@ -1385,9 +1469,7 @@ int __init ip_conntrack_init(void)

         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
-        if (hashsize) {
-                ip_conntrack_htable_size = hashsize;
-        } else {
+        if (!ip_conntrack_htable_size) {
                 ip_conntrack_htable_size
                         = (((num_physpages << PAGE_SHIFT) / 16384)
                            / sizeof(struct list_head));
@@ -1409,20 +1491,8 @@ int __init ip_conntrack_init(void)
                 return ret;
         }

-        /* AK: the hash table is twice as big than needed because it
-           uses list_head.  it would be much nicer to caches to use a
-           single pointer list head here. */
-        ip_conntrack_vmalloc = 0;
-        ip_conntrack_hash
-                =(void*)__get_free_pages(GFP_KERNEL,
-                                         get_order(sizeof(struct list_head)
-                                                   *ip_conntrack_htable_size));
-        if (!ip_conntrack_hash) {
-                ip_conntrack_vmalloc = 1;
-                printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
-                ip_conntrack_hash = vmalloc(sizeof(struct list_head)
-                                            * ip_conntrack_htable_size);
-        }
+        ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+                                            &ip_conntrack_vmalloc);
         if (!ip_conntrack_hash) {
                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
                 goto err_unreg_sockopt;
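set_hashsize() is the heart of the new runtime-resizable table: under the write lock it drains every old bucket, re-buckets each entry with the new size and a fresh seed, swaps the global pointers, and only then frees the old table, so lookups never see a half-built table. With the 0600 permissions passed to module_param_call(), the parameter should be root-writable at runtime, presumably as /sys/module/ip_conntrack/parameters/hashsize. A user-space sketch of the rehash core (struct node and rehash() are illustrative, with singly-linked buckets standing in for list_head):

#include <stdint.h>
#include <stdlib.h>

struct node {
        uint32_t key;
        struct node *next;
};

static uint32_t hash(uint32_t key, unsigned size, uint32_t rnd)
{
        return (key * 0x9e3779b1u ^ rnd) % size;        /* stand-in for jhash */
}

/* Move every node from old[old_size] into a freshly allocated table,
 * re-bucketing with the new size and seed, exactly like the loop in
 * set_hashsize(). */
static struct node **rehash(struct node **old, unsigned old_size,
                            unsigned new_size, uint32_t rnd)
{
        struct node **new = calloc(new_size, sizeof(*new));
        if (!new)
                return NULL;
        for (unsigned i = 0; i < old_size; i++) {
                while (old[i]) {
                        struct node *n = old[i];
                        old[i] = n->next;               /* list_del() */
                        uint32_t b = hash(n->key, new_size, rnd);
                        n->next = new[b];               /* re-link; the kernel
                                                           uses list_add_tail() */
                        new[b] = n;
                }
        }
        return new;
}

int main(void)
{
        struct node a = { 42, NULL }, *old_tab[4] = { &a };
        struct node **new_tab = rehash(old_tab, 4, 8, 0x1234);
        free(new_tab);
        return 0;
}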
@@ -1454,9 +1524,6 @@ int __init ip_conntrack_init(void)
         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
         write_unlock_bh(&ip_conntrack_lock);

-        for (i = 0; i < ip_conntrack_htable_size; i++)
-                INIT_LIST_HEAD(&ip_conntrack_hash[i]);
-
         /* For use by ipt_REJECT */
         ip_ct_attach = ip_conntrack_attach;

@@ -1471,7 +1538,8 @@ int __init ip_conntrack_init(void)
 err_free_conntrack_slab:
         kmem_cache_destroy(ip_conntrack_cachep);
 err_free_hash:
-        free_conntrack_hash();
+        free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+                            ip_conntrack_htable_size);
 err_unreg_sockopt:
         nf_unregister_sockopt(&so_getorigdst);
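The error path keeps the kernel's usual goto-unwind layering, now passing the table geometry to free_conntrack_hash() explicitly. A user-space sketch of the pattern, with malloc() standing in for the real acquisitions and all names illustrative: each acquired resource gets a label, and a failure jumps to the label that releases everything acquired so far, in reverse order.

#include <stdlib.h>

int subsystem_init(void)
{
        int err = -1;

        void *sockopt = malloc(16);     /* stands in for nf_register_sockopt() */
        if (!sockopt)
                goto out;
        void *hash = malloc(4096);      /* stands in for alloc_hashtable() */
        if (!hash)
                goto err_unreg_sockopt;
        void *slab = malloc(256);       /* stands in for kmem_cache_create() */
        if (!slab)
                goto err_free_hash;
        return 0;                       /* success: resources stay live */

err_free_hash:
        free(hash);
err_unreg_sockopt:
        free(sockopt);
out:
        return err;
}

int main(void)
{
        return subsystem_init() ? 1 : 0;
}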