#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.3"
+#define IP_CONNTRACK_VERSION "2.4"
#if 0
#define DEBUGP printk
unsigned int ip_conntrack_htable_size = 0;
int ip_conntrack_max;
struct list_head *ip_conntrack_hash;
-static kmem_cache_t *ip_conntrack_cachep;
-static kmem_cache_t *ip_conntrack_expect_cachep;
+static kmem_cache_t *ip_conntrack_cachep __read_mostly;
+static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
struct ip_conntrack ip_conntrack_untracked;
unsigned int ip_ct_log_invalid;
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;
-static unsigned int ip_conntrack_next_id = 1;
-static unsigned int ip_conntrack_expect_next_id = 1;
+static unsigned int ip_conntrack_next_id;
+static unsigned int ip_conntrack_expect_next_id;
#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
-struct notifier_block *ip_conntrack_chain;
-struct notifier_block *ip_conntrack_expect_chain;
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
+ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
{
DEBUGP("ecache: delivering events for %p\n", ecache->ct);
if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
- notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
ecache->ct);
ecache->events = 0;
ip_conntrack_put(ecache->ct);
struct ip_conntrack_ecache *ecache;
int cpu;
- for_each_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
ecache = &per_cpu(ip_conntrack_ecache, cpu);
if (ecache->ct)
ip_conntrack_put(ecache->ct);
static int ip_conntrack_hash_rnd_initted;
static unsigned int ip_conntrack_hash_rnd;
-static u_int32_t
-hash_conntrack(const struct ip_conntrack_tuple *tuple)
+static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
+ unsigned int size, unsigned int rnd)
{
-#if 0
- dump_tuple(tuple);
-#endif
return (jhash_3words(tuple->src.ip,
(tuple->dst.ip ^ tuple->dst.protonum),
(tuple->src.u.all | (tuple->dst.u.all << 16)),
- ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+ rnd) % size);
+}
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+ return __hash_conntrack(tuple, ip_conntrack_htable_size,
+ ip_conntrack_hash_rnd);
}
int
/* ip_conntrack_expect helper functions */
-static void unlink_expect(struct ip_conntrack_expect *exp)
+void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
{
ASSERT_WRITE_LOCK(&ip_conntrack_lock);
IP_NF_ASSERT(!timer_pending(&exp->timeout));
list_del(&exp->list);
CONNTRACK_STAT_INC(expect_delete);
exp->master->expecting--;
-}
-
-void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp)
-{
- unlink_expect(exp);
ip_conntrack_expect_put(exp);
}
struct ip_conntrack_expect *exp = (void *)ul_expect;
write_lock_bh(&ip_conntrack_lock);
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(exp);
}
/* Just find a expectation corresponding to a tuple. */
struct ip_conntrack_expect *
-ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
+ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
{
struct ip_conntrack_expect *i;
master ct never got confirmed, we'd hold a reference to it
and weird things would happen to future packets). */
if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
- && is_confirmed(i->master)
- && del_timer(&i->timeout)) {
- unlink_expect(i);
- return i;
+ && is_confirmed(i->master)) {
+ if (i->flags & IP_CT_EXPECT_PERMANENT) {
+ atomic_inc(&i->use);
+ return i;
+ } else if (del_timer(&i->timeout)) {
+ ip_ct_unlink_expect(i);
+ return i;
+ }
}
}
return NULL;
list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
if (i->master == ct && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
}
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ ip_conntrack_event(IPCT_DESTROY, ct);
set_bit(IPS_DYING_BIT, &ct->status);
/* To make sure we don't get any weird locking issues here:
{
struct ip_conntrack *ct = (void *)ul_conntrack;
- ip_conntrack_event(IPCT_DESTROY, ct);
write_lock_bh(&ip_conntrack_lock);
/* Inside lock so preempt is disabled on module removal path.
* Otherwise we can get spurious warnings. */
conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
if (!conntrack) {
DEBUGP("Can't allocate conntrack.\n");
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
memset(conntrack, 0, sizeof(*conntrack));
return NULL;
}
- if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple)))
- return NULL;
+ conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
+ if (conntrack == NULL || IS_ERR(conntrack))
+ return (struct ip_conntrack_tuple_hash *)conntrack;
if (!protocol->new(conntrack, skb)) {
ip_conntrack_free(conntrack);
/* choose the the oldest expectation to evict */
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (expect_matches(i, exp) && del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
write_unlock_bh(&ip_conntrack_lock);
ip_conntrack_expect_put(i);
return;
write_unlock_bh(&ip_conntrack_lock);
}
+/* We don't increase the master conntrack refcount for non-fulfilled
+ * conntracks. During the conntrack destruction, the expectations are
+ * always killed before the conntrack itself */
struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
{
struct ip_conntrack_expect *new;
return NULL;
}
new->master = me;
- atomic_inc(&new->master->ct_general.use);
atomic_set(&new->use, 1);
return new;
}
void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
{
- if (atomic_dec_and_test(&exp->use)) {
- ip_conntrack_put(exp->master);
+ if (atomic_dec_and_test(&exp->use))
kmem_cache_free(ip_conntrack_expect_cachep, exp);
- }
}
static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
if (i->master == master) {
if (del_timer(&i->timeout)) {
- unlink_expect(i);
+ ip_ct_unlink_expect(i);
ip_conntrack_expect_put(i);
}
break;
/* Get rid of expectations */
list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
if (exp->master->helper == me && del_timer(&exp->timeout)) {
- unlink_expect(exp);
+ ip_ct_unlink_expect(exp);
ip_conntrack_expect_put(exp);
}
}
synchronize_net();
}
-static inline void ct_add_counters(struct ip_conntrack *ct,
- enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb)
-{
-#ifdef CONFIG_IP_NF_CT_ACCT
- if (skb) {
- ct->counters[CTINFO2DIR(ctinfo)].packets++;
- ct->counters[CTINFO2DIR(ctinfo)].bytes +=
- ntohs(skb->nh.iph->tot_len);
- }
-#endif
-}
-
-/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
-void ip_ct_refresh_acct(struct ip_conntrack *ct,
+/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
+void __ip_ct_refresh_acct(struct ip_conntrack *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
- unsigned long extra_jiffies)
+ unsigned long extra_jiffies,
+ int do_acct)
{
+ int event = 0;
+
IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+ IP_NF_ASSERT(skb);
+
+ write_lock_bh(&ip_conntrack_lock);
/* If not in hash table, timer will not be active yet */
if (!is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- ct_add_counters(ct, ctinfo, skb);
+ event = IPCT_REFRESH;
} else {
- write_lock_bh(&ip_conntrack_lock);
/* Need del_timer for race avoidance (may already be dying). */
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
- ip_conntrack_event_cache(IPCT_REFRESH, skb);
+ event = IPCT_REFRESH;
}
- ct_add_counters(ct, ctinfo, skb);
- write_unlock_bh(&ip_conntrack_lock);
}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+ if (do_acct) {
+ ct->counters[CTINFO2DIR(ctinfo)].packets++;
+ ct->counters[CTINFO2DIR(ctinfo)].bytes +=
+ ntohs(skb->nh.iph->tot_len);
+ if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+ || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+ event |= IPCT_COUNTER_FILLING;
+ }
+#endif
+
+ write_unlock_bh(&ip_conntrack_lock);
+
+ /* must be unlocked when calling event cache */
+ if (event)
+ ip_conntrack_event_cache(event, skb);
}
#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
.tuple.dst.u.tcp.port;
sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
.tuple.dst.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
return 1;
}
-static void free_conntrack_hash(void)
+void ip_conntrack_flush(void)
{
- if (ip_conntrack_vmalloc)
- vfree(ip_conntrack_hash);
+ ip_ct_iterate_cleanup(kill_all, NULL);
+}
+
+static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
+{
+ if (vmalloced)
+ vfree(hash);
else
- free_pages((unsigned long)ip_conntrack_hash,
- get_order(sizeof(struct list_head)
- * ip_conntrack_htable_size));
+ free_pages((unsigned long)hash,
+ get_order(sizeof(struct list_head) * size));
}
-void ip_conntrack_flush()
+/* Mishearing the voices in his head, our hero wonders how he's
+ supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
{
+ ip_ct_attach = NULL;
+
/* This makes sure all current packets have passed through
netfilter framework. Roll on, two-stage module
delete... */
ip_ct_event_cache_flush();
i_see_dead_people:
- ip_ct_iterate_cleanup(kill_all, NULL);
+ ip_conntrack_flush();
if (atomic_read(&ip_conntrack_count) != 0) {
schedule();
goto i_see_dead_people;
/* wait until all references to ip_conntrack_untracked are dropped */
while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
schedule();
-}
-/* Mishearing the voices in his head, our hero wonders how he's
- supposed to kill the mall. */
-void ip_conntrack_cleanup(void)
-{
- ip_ct_attach = NULL;
- ip_conntrack_flush();
kmem_cache_destroy(ip_conntrack_cachep);
kmem_cache_destroy(ip_conntrack_expect_cachep);
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
nf_unregister_sockopt(&so_getorigdst);
}
-static int hashsize;
-module_param(hashsize, int, 0400);
+static struct list_head *alloc_hashtable(int size, int *vmalloced)
+{
+ struct list_head *hash;
+ unsigned int i;
+
+ *vmalloced = 0;
+ hash = (void*)__get_free_pages(GFP_KERNEL,
+ get_order(sizeof(struct list_head)
+ * size));
+ if (!hash) {
+ *vmalloced = 1;
+ printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
+ hash = vmalloc(sizeof(struct list_head) * size);
+ }
+
+ if (hash)
+ for (i = 0; i < size; i++)
+ INIT_LIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+static int set_hashsize(const char *val, struct kernel_param *kp)
+{
+ int i, bucket, hashsize, vmalloced;
+ int old_vmalloced, old_size;
+ int rnd;
+ struct list_head *hash, *old_hash;
+ struct ip_conntrack_tuple_hash *h;
+
+ /* On boot, we can set this without any fancy locking. */
+ if (!ip_conntrack_htable_size)
+ return param_set_int(val, kp);
+
+ hashsize = simple_strtol(val, NULL, 0);
+ if (!hashsize)
+ return -EINVAL;
+
+ hash = alloc_hashtable(hashsize, &vmalloced);
+ if (!hash)
+ return -ENOMEM;
+
+ /* We have to rehash for the new table anyway, so we also can
+ * use a new random seed */
+ get_random_bytes(&rnd, 4);
+
+ write_lock_bh(&ip_conntrack_lock);
+ for (i = 0; i < ip_conntrack_htable_size; i++) {
+ while (!list_empty(&ip_conntrack_hash[i])) {
+ h = list_entry(ip_conntrack_hash[i].next,
+ struct ip_conntrack_tuple_hash, list);
+ list_del(&h->list);
+ bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
+ list_add_tail(&h->list, &hash[bucket]);
+ }
+ }
+ old_size = ip_conntrack_htable_size;
+ old_vmalloced = ip_conntrack_vmalloc;
+ old_hash = ip_conntrack_hash;
+
+ ip_conntrack_htable_size = hashsize;
+ ip_conntrack_vmalloc = vmalloced;
+ ip_conntrack_hash = hash;
+ ip_conntrack_hash_rnd = rnd;
+ write_unlock_bh(&ip_conntrack_lock);
+
+ free_conntrack_hash(old_hash, old_vmalloced, old_size);
+ return 0;
+}
+
+module_param_call(hashsize, set_hashsize, param_get_uint,
+ &ip_conntrack_htable_size, 0600);
int __init ip_conntrack_init(void)
{
/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
* machine has 256 buckets. >= 1GB machines have 8192 buckets. */
- if (hashsize) {
- ip_conntrack_htable_size = hashsize;
- } else {
+ if (!ip_conntrack_htable_size) {
ip_conntrack_htable_size
= (((num_physpages << PAGE_SHIFT) / 16384)
/ sizeof(struct list_head));
return ret;
}
- /* AK: the hash table is twice as big than needed because it
- uses list_head. it would be much nicer to caches to use a
- single pointer list head here. */
- ip_conntrack_vmalloc = 0;
- ip_conntrack_hash
- =(void*)__get_free_pages(GFP_KERNEL,
- get_order(sizeof(struct list_head)
- *ip_conntrack_htable_size));
- if (!ip_conntrack_hash) {
- ip_conntrack_vmalloc = 1;
- printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
- ip_conntrack_hash = vmalloc(sizeof(struct list_head)
- * ip_conntrack_htable_size);
- }
+ ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
+ &ip_conntrack_vmalloc);
if (!ip_conntrack_hash) {
printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
goto err_unreg_sockopt;
ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
write_unlock_bh(&ip_conntrack_lock);
- for (i = 0; i < ip_conntrack_htable_size; i++)
- INIT_LIST_HEAD(&ip_conntrack_hash[i]);
-
/* For use by ipt_REJECT */
ip_ct_attach = ip_conntrack_attach;
err_free_conntrack_slab:
kmem_cache_destroy(ip_conntrack_cachep);
err_free_hash:
- free_conntrack_hash();
+ free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
+ ip_conntrack_htable_size);
err_unreg_sockopt:
nf_unregister_sockopt(&so_getorigdst);