#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
+#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
enum nf_nat_manip_type manip,
- struct nlattr *attr) __read_mostly;
+ const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
DEFINE_SPINLOCK(nf_conntrack_lock);
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
- if (!test_bit(IPS_DYING_BIT, &ct->status))
- nf_conntrack_event(IPCT_DESTROY, ct);
- set_bit(IPS_DYING_BIT, &ct->status);
-
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to nf_conntrack_lock!!! -HW */
nf_conntrack_free(ct);
}
-static void death_by_timeout(unsigned long ul_conntrack)
+void nf_ct_delete_from_lists(struct nf_conn *ct)
{
- struct nf_conn *ct = (void *)ul_conntrack;
struct net *net = nf_ct_net(ct);
- struct nf_conn_help *help = nfct_help(ct);
- struct nf_conntrack_helper *helper;
-
- if (help) {
- rcu_read_lock();
- helper = rcu_dereference(help->helper);
- if (helper && helper->destroy)
- helper->destroy(ct);
- rcu_read_unlock();
- }
+ nf_ct_helper_destroy(ct);
spin_lock_bh(&nf_conntrack_lock);
/* Inside lock so preempt is disabled on module removal path.
* Otherwise we can get spurious warnings. */
NF_CT_STAT_INC(net, delete_list);
clean_from_lists(ct);
spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+ struct net *net = nf_ct_net(ct);
+
+ if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+ /* bad luck, let's retry again */
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+ return;
+ }
+ /* we've got the event delivered, now it's dying */
+ set_bit(IPS_DYING_BIT, &ct->status);
+ spin_lock(&nf_conntrack_lock);
+ hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&nf_conntrack_lock);
+ nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ /* add this conntrack to the dying list */
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &net->ct.dying);
+ spin_unlock_bh(&nf_conntrack_lock);
+ /* set a new timer to retry event delivery */
+ setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+
+ if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+ unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+ /* destroy event was not delivered */
+ nf_ct_delete_from_lists(ct);
+ nf_ct_insert_dying_list(ct);
+ return;
+ }
+ set_bit(IPS_DYING_BIT, &ct->status);
+ nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
}
h = __nf_conntrack_find(net, tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (unlikely(nf_ct_is_dying(ct) ||
+ !atomic_inc_not_zero(&ct->ct_general.use)))
h = NULL;
else {
if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) {
/* Remove from unconfirmed list */
hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- __nf_conntrack_hash_insert(ct, hash, repl_hash);
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
weird delay cases. */
add_timer(&ct->timeout);
atomic_inc(&ct->ct_general.use);
set_bit(IPS_CONFIRMED_BIT, &ct->status);
+
+ /* Since the lookup is lockless, hash insertion must be done after
+ * starting the timer and setting the CONFIRMED bit. The RCU barriers
+ * guarantee that no other CPU can find the conntrack before the above
+ * stores are visible.
+ */
+ __nf_conntrack_hash_insert(ct, hash, repl_hash);
NF_CT_STAT_INC(net, insert);
spin_unlock_bh(&nf_conntrack_lock);
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
-#ifdef CONFIG_NF_NAT_NEEDED
- if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
- test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
- nf_conntrack_event_cache(IPCT_NATINFO, ct);
-#endif
+
nf_conntrack_event_cache(master_ct(ct) ?
IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;
cnt++;
}
- if (ct && unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
- ct = NULL;
- if (ct || cnt >= NF_CT_EVICTION_RANGE)
+ if (ct != NULL) {
+ if (likely(!nf_ct_is_dying(ct) &&
+ atomic_inc_not_zero(&ct->ct_general.use)))
+ break;
+ else
+ ct = NULL;
+ }
+
+ if (cnt >= NF_CT_EVICTION_RANGE)
break;
+
hash = (hash + 1) % nf_conntrack_htable_size;
}
rcu_read_unlock();
}
}
- ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp);
+ /*
+ * Do not use kmem_cache_zalloc(), as this cache uses
+ * SLAB_DESTROY_BY_RCU.
+ */
+ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
if (ct == NULL) {
pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
atomic_dec(&net->ct.count);
return ERR_PTR(-ENOMEM);
}
-
- atomic_set(&ct->ct_general.use, 1);
+ /*
+ * Let ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
+ * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
+ */
+ memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
+ sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
+ spin_lock_init(&ct->lock);
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
+ ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
/* Don't set timer yet: wait for confirmation */
setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
#ifdef CONFIG_NET_NS
ct->ct_net = net;
#endif
+ /*
+ * changes to lookup keys must be done before setting refcnt to 1
+ */
+ smp_wmb();
+ atomic_set(&ct->ct_general.use, 1);
return ct;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
/* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress. Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
-init_conntrack(struct net *net,
+init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_l3proto *l3proto,
struct nf_conntrack_l4proto *l4proto,
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+ struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
+ nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC);
+
spin_lock_bh(&nf_conntrack_lock);
exp = nf_ct_find_expectation(net, tuple);
if (exp) {
nf_conntrack_get(&ct->master->ct_general);
NF_CT_STAT_INC(net, expect_new);
} else {
- __nf_ct_try_assign_helper(ct, GFP_ATOMIC);
+ __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
NF_CT_STAT_INC(net, new);
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
-resolve_normal_ct(struct net *net,
+resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
/* look for tuple match */
h = nf_conntrack_find_get(net, &tuple);
if (!h) {
- h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
+ h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+ skb, dataoff);
if (!h)
return NULL;
if (IS_ERR(h))
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
- struct nf_conn *ct;
+ struct nf_conn *ct, *tmpl = NULL;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_l3proto *l3proto;
struct nf_conntrack_l4proto *l4proto;
int set_reply = 0;
int ret;
- /* Previously seen (loopback or untracked)? Ignore. */
if (skb->nfct) {
- NF_CT_STAT_INC_ATOMIC(net, ignore);
- return NF_ACCEPT;
+ /* Previously seen (loopback or untracked)? Ignore. */
+ tmpl = (struct nf_conn *)skb->nfct;
+ if (!nf_ct_is_template(tmpl)) {
+ NF_CT_STAT_INC_ATOMIC(net, ignore);
+ return NF_ACCEPT;
+ }
+ skb->nfct = NULL;
}
/* rcu_read_lock()ed by nf_hook_slow */
pr_debug("not prepared to track yet or error occured\n");
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
- return -ret;
+ ret = -ret;
+ goto out;
}
l4proto = __nf_ct_l4proto_find(pf, protonum);
if (ret <= 0) {
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
- return -ret;
+ ret = -ret;
+ goto out;
}
}
- ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
+ ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
l3proto, l4proto, &set_reply, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
NF_CT_STAT_INC_ATOMIC(net, invalid);
- return NF_ACCEPT;
+ ret = NF_ACCEPT;
+ goto out;
}
if (IS_ERR(ct)) {
/* Too stressed to deal. */
NF_CT_STAT_INC_ATOMIC(net, drop);
- return NF_DROP;
+ ret = NF_DROP;
+ goto out;
}
NF_CT_ASSERT(skb->nfct);
NF_CT_STAT_INC_ATOMIC(net, invalid);
if (ret == -NF_DROP)
NF_CT_STAT_INC_ATOMIC(net, drop);
- return -ret;
+ ret = -ret;
+ goto out;
}
if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
- nf_conntrack_event_cache(IPCT_STATUS, ct);
+ nf_conntrack_event_cache(IPCT_REPLY, ct);
+out:
+ if (tmpl)
+ nf_ct_put(tmpl);
return ret;
}
return;
rcu_read_lock();
- __nf_ct_try_assign_helper(ct, GFP_ATOMIC);
+ __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
unsigned long extra_jiffies,
int do_acct)
{
- int event = 0;
-
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);
- spin_lock_bh(&nf_conntrack_lock);
-
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;
/* If not in hash table, timer will not be active yet */
if (!nf_ct_is_confirmed(ct)) {
ct->timeout.expires = extra_jiffies;
- event = IPCT_REFRESH;
} else {
unsigned long newtime = jiffies + extra_jiffies;
/* Only update the timeout if the new timeout is at least
HZ jiffies from the old timeout. Need del_timer for race
avoidance (may already be dying). */
- if (newtime - ct->timeout.expires >= HZ
- && del_timer(&ct->timeout)) {
- ct->timeout.expires = newtime;
- add_timer(&ct->timeout);
- event = IPCT_REFRESH;
- }
+ if (newtime - ct->timeout.expires >= HZ)
+ mod_timer_pending(&ct->timeout, newtime);
}
acct:
acct = nf_conn_acct_find(ct);
if (acct) {
+ spin_lock_bh(&ct->lock);
acct[CTINFO2DIR(ctinfo)].packets++;
acct[CTINFO2DIR(ctinfo)].bytes +=
skb->len - skb_network_offset(skb);
+ spin_unlock_bh(&ct->lock);
}
}
-
- spin_unlock_bh(&nf_conntrack_lock);
-
- /* must be unlocked when calling event cache */
- if (event)
- nf_conntrack_event_cache(event, ct);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
if (do_acct) {
struct nf_conn_counter *acct;
- spin_lock_bh(&nf_conntrack_lock);
acct = nf_conn_acct_find(ct);
if (acct) {
+ spin_lock_bh(&ct->lock);
acct[CTINFO2DIR(ctinfo)].packets++;
acct[CTINFO2DIR(ctinfo)].bytes +=
skb->len - skb_network_offset(skb);
+ spin_unlock_bh(&ct->lock);
}
- spin_unlock_bh(&nf_conntrack_lock);
}
if (del_timer(&ct->timeout)) {
int report;
};
-static int kill_all(struct nf_conn *i, void *data)
+static int kill_report(struct nf_conn *i, void *data)
{
struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
- /* get_next_corpse sets the dying bit for us */
- nf_conntrack_event_report(IPCT_DESTROY,
- i,
- fr->pid,
- fr->report);
+ /* If we fail to deliver the event, death_by_timeout() will retry */
+ if (nf_conntrack_event_report(IPCT_DESTROY, i,
+ fr->pid, fr->report) < 0)
+ return 1;
+
+ /* Avoid the delivery of the destroy event in death_by_timeout(). */
+ set_bit(IPS_DYING_BIT, &i->status);
+ return 1;
+}
+
+static int kill_all(struct nf_conn *i, void *data)
+{
return 1;
}
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-void nf_conntrack_flush(struct net *net, u32 pid, int report)
+void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
struct __nf_ct_flush_report fr = {
.pid = pid,
.report = report,
};
- nf_ct_iterate_cleanup(net, kill_all, &fr);
+ nf_ct_iterate_cleanup(net, kill_report, &fr);
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
+
+static void nf_ct_release_dying_list(struct net *net)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* never fails to remove them, no listeners at this point */
+ nf_ct_kill(ct);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
}
-EXPORT_SYMBOL_GPL(nf_conntrack_flush);
static void nf_conntrack_cleanup_init_net(void)
{
static void nf_conntrack_cleanup_net(struct net *net)
{
- nf_ct_event_cache_flush(net);
- nf_conntrack_ecache_fini(net);
i_see_dead_people:
- nf_conntrack_flush(net, 0, 0);
+ nf_ct_iterate_cleanup(net, kill_all, NULL);
+ nf_ct_release_dying_list(net);
if (atomic_read(&net->ct.count) != 0) {
schedule();
goto i_see_dead_people;
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
+ nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
free_percpu(net->ct.stat);
* machine has 512 buckets. >= 1GB machines have 16384 buckets. */
if (!nf_conntrack_htable_size) {
nf_conntrack_htable_size
- = (((num_physpages << PAGE_SHIFT) / 16384)
+ = (((totalram_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
- if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+ if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
nf_conntrack_htable_size = 16384;
if (nf_conntrack_htable_size < 32)
nf_conntrack_htable_size = 32;
return ret;
}
+/*
+ * We need to use special "null" values, not used in hash table
+ */
+#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
+#define DYING_NULLS_VAL ((1<<30)+1)
+
static int nf_conntrack_init_net(struct net *net)
{
int ret;
atomic_set(&net->ct.count, 0);
- INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+ INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
+ INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat) {
ret = -ENOMEM;
goto err_stat;
}
- ret = nf_conntrack_ecache_init(net);
- if (ret < 0)
- goto err_ecache;
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
ret = nf_conntrack_acct_init(net);
if (ret < 0)
goto err_acct;
+ ret = nf_conntrack_ecache_init(net);
+ if (ret < 0)
+ goto err_ecache;
/* Set up fake conntrack:
- to never be deleted, not in any hashes */
return 0;
+err_ecache:
+ nf_conntrack_acct_fini(net);
err_acct:
nf_conntrack_expect_fini(net);
err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
err_hash:
- nf_conntrack_ecache_fini(net);
-err_ecache:
free_percpu(net->ct.stat);
err_stat:
return ret;
}
+s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ u32 seq);
+EXPORT_SYMBOL_GPL(nf_ct_nat_offset);
+
int nf_conntrack_init(struct net *net)
{
int ret;
/* For use by REJECT target */
rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);
+
+ /* Howto get NAT offsets */
+ rcu_assign_pointer(nf_ct_nat_offset, NULL);
}
return 0;