[NETFILTER]: Add nf_conntrack subsystem.
net/netfilter/nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2005 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  *
24  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
25  */
26
27 #include <linux/config.h>
28 #include <linux/types.h>
29 #include <linux/netfilter.h>
30 #include <linux/module.h>
31 #include <linux/skbuff.h>
32 #include <linux/proc_fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/stddef.h>
35 #include <linux/slab.h>
36 #include <linux/random.h>
37 #include <linux/jhash.h>
38 #include <linux/err.h>
39 #include <linux/percpu.h>
40 #include <linux/moduleparam.h>
41 #include <linux/notifier.h>
42 #include <linux/kernel.h>
43 #include <linux/netdevice.h>
44 #include <linux/socket.h>
45
46 /* This rwlock protects the main hash table, protocol/helper/expected
47    registrations, conntrack timers */
48 #define ASSERT_READ_LOCK(x)
49 #define ASSERT_WRITE_LOCK(x)
50
51 #include <net/netfilter/nf_conntrack.h>
52 #include <net/netfilter/nf_conntrack_l3proto.h>
53 #include <net/netfilter/nf_conntrack_protocol.h>
54 #include <net/netfilter/nf_conntrack_helper.h>
55 #include <net/netfilter/nf_conntrack_core.h>
56 #include <linux/netfilter_ipv4/listhelp.h>
57
58 #define NF_CONNTRACK_VERSION    "0.4.1"
59
60 #if 0
61 #define DEBUGP printk
62 #else
63 #define DEBUGP(format, args...)
64 #endif
65
66 DEFINE_RWLOCK(nf_conntrack_lock);
67
68 /* nf_conntrack_standalone needs this */
69 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
70
71 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
72 LIST_HEAD(nf_conntrack_expect_list);
73 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
74 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
75 static LIST_HEAD(helpers);
76 unsigned int nf_conntrack_htable_size = 0;
77 int nf_conntrack_max;
78 struct list_head *nf_conntrack_hash;
79 static kmem_cache_t *nf_conntrack_expect_cachep;
80 struct nf_conn nf_conntrack_untracked;
81 unsigned int nf_ct_log_invalid;
82 static LIST_HEAD(unconfirmed);
83 static int nf_conntrack_vmalloc;
84
85 #ifdef CONFIG_NF_CONNTRACK_EVENTS
86 struct notifier_block *nf_conntrack_chain;
87 struct notifier_block *nf_conntrack_expect_chain;
88
89 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
90
91 /* deliver cached events and clear cache entry - must be called with locally
92  * disabled softirqs */
93 static inline void
94 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
95 {
96         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
97         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
98             && ecache->events)
99                 notifier_call_chain(&nf_conntrack_chain, ecache->events,
100                                     ecache->ct);
101
102         ecache->events = 0;
103         nf_ct_put(ecache->ct);
104         ecache->ct = NULL;
105 }
106
107 /* Deliver all cached events for a particular conntrack. This is called
108  * by code prior to async packet handling for freeing the skb */
109 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
110 {
111         struct nf_conntrack_ecache *ecache;
112
113         local_bh_disable();
114         ecache = &__get_cpu_var(nf_conntrack_ecache);
115         if (ecache->ct == ct)
116                 __nf_ct_deliver_cached_events(ecache);
117         local_bh_enable();
118 }
119
120 /* Deliver cached events for old pending events, if current conntrack != old */
121 void __nf_ct_event_cache_init(struct nf_conn *ct)
122 {
123         struct nf_conntrack_ecache *ecache;
124         
125         /* take care of delivering potentially old events */
126         ecache = &__get_cpu_var(nf_conntrack_ecache);
127         BUG_ON(ecache->ct == ct);
128         if (ecache->ct)
129                 __nf_ct_deliver_cached_events(ecache);
130         /* initialize for this conntrack/packet */
131         ecache->ct = ct;
132         nf_conntrack_get(&ct->ct_general);
133 }
134
135 /* flush the event cache - touches other CPU's data and must not be called
136  * while packets are still passing through the code */
137 static void nf_ct_event_cache_flush(void)
138 {
139         struct nf_conntrack_ecache *ecache;
140         int cpu;
141
142         for_each_cpu(cpu) {
143                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
144                 if (ecache->ct)
145                         nf_ct_put(ecache->ct);
146         }
147 }
148 #else
149 static inline void nf_ct_event_cache_flush(void) {}
150 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
151
152 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
153 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
154
155 /*
156  * This scheme offers various sizes of "struct nf_conn" depending on
157  * the features in use (helper, nat, ...)
158  */
159
160 #define NF_CT_FEATURES_NAMELEN  256
161 static struct {
162         /* name of slab cache. printed in /proc/slabinfo */
163         char *name;
164
165         /* size of slab cache */
166         size_t size;
167
168         /* slab cache pointer */
169         kmem_cache_t *cachep;
170
171         /* allocated slab cache + modules which use this slab cache */
172         int use;
173
174         /* Initialization */
175         int (*init_conntrack)(struct nf_conn *, u_int32_t);
176
177 } nf_ct_cache[NF_CT_F_NUM];
178
179 /* protect members of nf_ct_cache except of "use" */
180 DEFINE_RWLOCK(nf_ct_cache_lock);
181
182 /* This avoids calling kmem_cache_create() with the same name simultaneously */
183 DECLARE_MUTEX(nf_ct_cache_mutex);
184
185 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
186 struct nf_conntrack_protocol *
187 nf_ct_find_proto(u_int16_t l3proto, u_int8_t protocol)
188 {
189         if (unlikely(nf_ct_protos[l3proto] == NULL))
190                 return &nf_conntrack_generic_protocol;
191
192         return nf_ct_protos[l3proto][protocol];
193 }
194
195 static int nf_conntrack_hash_rnd_initted;
196 static unsigned int nf_conntrack_hash_rnd;
197
198 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
199                                   unsigned int size, unsigned int rnd)
200 {
201         unsigned int a, b;
202         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
203                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
204         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
205                         (tuple->src.u.all << 16) | tuple->dst.u.all);
206
207         return jhash_2words(a, b, rnd) % size;
208 }
209
210 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
211 {
212         return __hash_conntrack(tuple, nf_conntrack_htable_size,
213                                 nf_conntrack_hash_rnd);
214 }
215
216 /* Initialize "struct nf_conn" which has spaces for helper */
217 static int
218 init_conntrack_for_helper(struct nf_conn *conntrack, u_int32_t features)
219 {
220
221         conntrack->help = (union nf_conntrack_help *)
222                 (((unsigned long)conntrack->data
223                   + (__alignof__(union nf_conntrack_help) - 1))
224                  & (~((unsigned long)(__alignof__(union nf_conntrack_help) -1))));
225         return 0;
226 }
227
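/* Register (or reuse) the slab cache backing conntracks with the given
 * feature set; called by nf_conntrack_init() and nf_conntrack_helper_register().
 * Bumps the cache's use count and returns 0 or a negative error. */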
228 int nf_conntrack_register_cache(u_int32_t features, const char *name,
229                                 size_t size,
230                                 int (*init)(struct nf_conn *, u_int32_t))
231 {
232         int ret = 0;
233         char *cache_name;
234         kmem_cache_t *cachep;
235
236         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
237                features, name, size);
238
239         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
240                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
241                         features);
242                 return -EINVAL;
243         }
244
245         down(&nf_ct_cache_mutex);
246
247         write_lock_bh(&nf_ct_cache_lock);
248         /* e.g: multiple helpers are loaded */
249         if (nf_ct_cache[features].use > 0) {
250                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
251                 if ((!strncmp(nf_ct_cache[features].name, name,
252                               NF_CT_FEATURES_NAMELEN))
253                     && nf_ct_cache[features].size == size
254                     && nf_ct_cache[features].init_conntrack == init) {
255                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
256                         nf_ct_cache[features].use++;
257                         ret = 0;
258                 } else
259                         ret = -EBUSY;
260
261                 write_unlock_bh(&nf_ct_cache_lock);
262                 up(&nf_ct_cache_mutex);
263                 return ret;
264         }
265         write_unlock_bh(&nf_ct_cache_lock);
266
267         /*
268          * The memory space for the name of the slab cache must stay alive
269          * until the cache is destroyed.
270          */
271         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
272         if (cache_name == NULL) {
273                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
274                 ret = -ENOMEM;
275                 goto out_up_mutex;
276         }
277
278         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
279                                                 >= NF_CT_FEATURES_NAMELEN) {
280                 printk("nf_conntrack_register_cache: name too long\n");
281                 ret = -EINVAL;
282                 goto out_free_name;
283         }
284
285         cachep = kmem_cache_create(cache_name, size, 0, 0,
286                                    NULL, NULL);
287         if (!cachep) {
288                 printk("nf_conntrack_register_cache: Can't create slab cache "
289                        "for the features = 0x%x\n", features);
290                 ret = -ENOMEM;
291                 goto out_free_name;
292         }
293
294         write_lock_bh(&nf_ct_cache_lock);
295         nf_ct_cache[features].use = 1;
296         nf_ct_cache[features].size = size;
297         nf_ct_cache[features].init_conntrack = init;
298         nf_ct_cache[features].cachep = cachep;
299         nf_ct_cache[features].name = cache_name;
300         write_unlock_bh(&nf_ct_cache_lock);
301
302         goto out_up_mutex;
303
304 out_free_name:
305         kfree(cache_name);
306 out_up_mutex:
307         up(&nf_ct_cache_mutex);
308         return ret;
309 }
310
311 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
312 void nf_conntrack_unregister_cache(u_int32_t features)
313 {
314         kmem_cache_t *cachep;
315         char *name;
316
317         /*
318          * This ensures that kmem_cache_create() isn't called before the slab
319          * cache is destroyed.
320          */
321         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
322         down(&nf_ct_cache_mutex);
323
324         write_lock_bh(&nf_ct_cache_lock);
325         if (--nf_ct_cache[features].use > 0) {
326                 write_unlock_bh(&nf_ct_cache_lock);
327                 up(&nf_ct_cache_mutex);
328                 return;
329         }
330         cachep = nf_ct_cache[features].cachep;
331         name = nf_ct_cache[features].name;
332         nf_ct_cache[features].cachep = NULL;
333         nf_ct_cache[features].name = NULL;
334         nf_ct_cache[features].init_conntrack = NULL;
335         nf_ct_cache[features].size = 0;
336         write_unlock_bh(&nf_ct_cache_lock);
337
338         synchronize_net();
339
340         kmem_cache_destroy(cachep);
341         kfree(name);
342
343         up(&nf_ct_cache_mutex);
344 }
345
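/* Fill in the original-direction tuple for a packet: the L3 part is taken
 * from l3proto->pkt_to_tuple(), the L4 part from protocol->pkt_to_tuple().
 * Returns 0 if either callback fails. */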
346 int
347 nf_ct_get_tuple(const struct sk_buff *skb,
348                 unsigned int nhoff,
349                 unsigned int dataoff,
350                 u_int16_t l3num,
351                 u_int8_t protonum,
352                 struct nf_conntrack_tuple *tuple,
353                 const struct nf_conntrack_l3proto *l3proto,
354                 const struct nf_conntrack_protocol *protocol)
355 {
356         NF_CT_TUPLE_U_BLANK(tuple);
357
358         tuple->src.l3num = l3num;
359         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
360                 return 0;
361
362         tuple->dst.protonum = protonum;
363         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
364
365         return protocol->pkt_to_tuple(skb, dataoff, tuple);
366 }
367
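/* Build the reply-direction tuple for "orig", delegating the address and
 * port/ID inversion to the L3 and L4 protocol modules.  Returns 0 on failure. */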
368 int
369 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
370                    const struct nf_conntrack_tuple *orig,
371                    const struct nf_conntrack_l3proto *l3proto,
372                    const struct nf_conntrack_protocol *protocol)
373 {
374         NF_CT_TUPLE_U_BLANK(inverse);
375
376         inverse->src.l3num = orig->src.l3num;
377         if (l3proto->invert_tuple(inverse, orig) == 0)
378                 return 0;
379
380         inverse->dst.dir = !orig->dst.dir;
381
382         inverse->dst.protonum = orig->dst.protonum;
383         return protocol->invert_tuple(inverse, orig);
384 }
385
386 /* nf_conntrack_expect helper functions */
387 static void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
388 {
389         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
390         NF_CT_ASSERT(!timer_pending(&exp->timeout));
391         list_del(&exp->list);
392         NF_CT_STAT_INC(expect_delete);
393         exp->master->expecting--;
394         nf_conntrack_expect_put(exp);
395 }
396
397 static void expectation_timed_out(unsigned long ul_expect)
398 {
399         struct nf_conntrack_expect *exp = (void *)ul_expect;
400
401         write_lock_bh(&nf_conntrack_lock);
402         nf_ct_unlink_expect(exp);
403         write_unlock_bh(&nf_conntrack_lock);
404         nf_conntrack_expect_put(exp);
405 }
406
407 /* If an expectation for this connection is found, it gets deleted from
408  * the global list and then returned. */
409 static struct nf_conntrack_expect *
410 find_expectation(const struct nf_conntrack_tuple *tuple)
411 {
412         struct nf_conntrack_expect *i;
413
414         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
415         /* If master is not in hash table yet (ie. packet hasn't left
416            this machine yet), how can other end know about expected?
417            Hence these are not the droids you are looking for (if
418            master ct never got confirmed, we'd hold a reference to it
419            and weird things would happen to future packets). */
420                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
421                     && nf_ct_is_confirmed(i->master)) {
422                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
423                                 atomic_inc(&i->use);
424                                 return i;
425                         } else if (del_timer(&i->timeout)) {
426                                 nf_ct_unlink_expect(i);
427                                 return i;
428                         }
429                 }
430         }
431         return NULL;
432 }
433
434 /* delete all expectations for this conntrack */
435 static void remove_expectations(struct nf_conn *ct)
436 {
437         struct nf_conntrack_expect *i, *tmp;
438
439         /* Optimization: most connections never expect any others. */
440         if (ct->expecting == 0)
441                 return;
442
443         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
444                 if (i->master == ct && del_timer(&i->timeout)) {
445                         nf_ct_unlink_expect(i);
446                         nf_conntrack_expect_put(i);
447                 }
448         }
449 }
450
451 static void
452 clean_from_lists(struct nf_conn *ct)
453 {
454         unsigned int ho, hr;
455         
456         DEBUGP("clean_from_lists(%p)\n", ct);
457         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
458
459         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
460         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
461         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
462         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
463
464         /* Destroy all pending expectations */
465         remove_expectations(ct);
466 }
467
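/* Destructor installed in ct_general.destroy: runs when the last reference
 * is dropped.  Lets the L3/L4 protocols clean up, removes any remaining
 * expectations and hands the conntrack back to its slab cache. */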
468 static void
469 destroy_conntrack(struct nf_conntrack *nfct)
470 {
471         struct nf_conn *ct = (struct nf_conn *)nfct;
472         struct nf_conntrack_l3proto *l3proto;
473         struct nf_conntrack_protocol *proto;
474
475         DEBUGP("destroy_conntrack(%p)\n", ct);
476         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
477         NF_CT_ASSERT(!timer_pending(&ct->timeout));
478
479         nf_conntrack_event(IPCT_DESTROY, ct);
480         set_bit(IPS_DYING_BIT, &ct->status);
481
482         /* To make sure we don't get any weird locking issues here:
483          * destroy_conntrack() MUST NOT be called with a write lock
484          * to nf_conntrack_lock!!! -HW */
485         l3proto = nf_ct_find_l3proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
486         if (l3proto && l3proto->destroy)
487                 l3proto->destroy(ct);
488
489         proto = nf_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num,
490                                  ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
491         if (proto && proto->destroy)
492                 proto->destroy(ct);
493
494         if (nf_conntrack_destroyed)
495                 nf_conntrack_destroyed(ct);
496
497         write_lock_bh(&nf_conntrack_lock);
498         /* Expectations will have been removed in clean_from_lists,
499          * except TFTP can create an expectation on the first packet,
500          * before the connection is in the list, so we need to clean here,
501          * too. */
502         remove_expectations(ct);
503
504         /* We overload first tuple to link into unconfirmed list. */
505         if (!nf_ct_is_confirmed(ct)) {
506                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
507                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
508         }
509
510         NF_CT_STAT_INC(delete);
511         write_unlock_bh(&nf_conntrack_lock);
512
513         if (ct->master)
514                 nf_ct_put(ct->master);
515
516         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
517         nf_conntrack_free(ct);
518 }
519
520 static void death_by_timeout(unsigned long ul_conntrack)
521 {
522         struct nf_conn *ct = (void *)ul_conntrack;
523
524         write_lock_bh(&nf_conntrack_lock);
525         /* Inside lock so preempt is disabled on module removal path.
526          * Otherwise we can get spurious warnings. */
527         NF_CT_STAT_INC(delete_list);
528         clean_from_lists(ct);
529         write_unlock_bh(&nf_conntrack_lock);
530         nf_ct_put(ct);
531 }
532
533 static inline int
534 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
535                     const struct nf_conntrack_tuple *tuple,
536                     const struct nf_conn *ignored_conntrack)
537 {
538         ASSERT_READ_LOCK(&nf_conntrack_lock);
539         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
540                 && nf_ct_tuple_equal(tuple, &i->tuple);
541 }
542
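/* Hash table lookup; the caller must hold nf_conntrack_lock at least for
 * reading.  A non-NULL "ignored_conntrack" is skipped during the search. */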
543 static struct nf_conntrack_tuple_hash *
544 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
545                     const struct nf_conn *ignored_conntrack)
546 {
547         struct nf_conntrack_tuple_hash *h;
548         unsigned int hash = hash_conntrack(tuple);
549
550         ASSERT_READ_LOCK(&nf_conntrack_lock);
551         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
552                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
553                         NF_CT_STAT_INC(found);
554                         return h;
555                 }
556                 NF_CT_STAT_INC(searched);
557         }
558
559         return NULL;
560 }
561
562 /* Find a connection corresponding to a tuple. */
563 struct nf_conntrack_tuple_hash *
564 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
565                       const struct nf_conn *ignored_conntrack)
566 {
567         struct nf_conntrack_tuple_hash *h;
568
569         read_lock_bh(&nf_conntrack_lock);
570         h = __nf_conntrack_find(tuple, ignored_conntrack);
571         if (h)
572                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
573         read_unlock_bh(&nf_conntrack_lock);
574
575         return h;
576 }
577
578 /* Confirm a connection given skb; places it in hash table */
579 int
580 __nf_conntrack_confirm(struct sk_buff **pskb)
581 {
582         unsigned int hash, repl_hash;
583         struct nf_conn *ct;
584         enum ip_conntrack_info ctinfo;
585
586         ct = nf_ct_get(*pskb, &ctinfo);
587
588         /* ipt_REJECT uses nf_conntrack_attach to attach related
589            ICMP/TCP RST packets in other direction.  Actual packet
590            which created connection will be IP_CT_NEW or for an
591            expected connection, IP_CT_RELATED. */
592         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
593                 return NF_ACCEPT;
594
595         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
596         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
597
598         /* We're not in hash table, and we refuse to set up related
599            connections for unconfirmed conns.  But packet copies and
600            REJECT will give spurious warnings here. */
601         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
602
603         /* No external references means no one else could have
604            confirmed us. */
605         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
606         DEBUGP("Confirming conntrack %p\n", ct);
607
608         write_lock_bh(&nf_conntrack_lock);
609
610         /* See if there's one in the list already, including reverse:
611            NAT could have grabbed it without realizing, since we're
612            not in the hash.  If there is, we lost race. */
613         if (!LIST_FIND(&nf_conntrack_hash[hash],
614                        conntrack_tuple_cmp,
615                        struct nf_conntrack_tuple_hash *,
616                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
617             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
618                           conntrack_tuple_cmp,
619                           struct nf_conntrack_tuple_hash *,
620                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
621                 /* Remove from unconfirmed list */
622                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
623
624                 list_prepend(&nf_conntrack_hash[hash],
625                              &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
626                 list_prepend(&nf_conntrack_hash[repl_hash],
627                              &ct->tuplehash[IP_CT_DIR_REPLY]);
628                 /* Timer relative to confirmation time, not original
629                    setting time, otherwise we'd get timer wrap in
630                    weird delay cases. */
631                 ct->timeout.expires += jiffies;
632                 add_timer(&ct->timeout);
633                 atomic_inc(&ct->ct_general.use);
634                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
635                 NF_CT_STAT_INC(insert);
636                 write_unlock_bh(&nf_conntrack_lock);
637                 if (ct->helper)
638                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
639 #ifdef CONFIG_NF_NAT_NEEDED
640                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
641                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
642                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
643 #endif
644                 nf_conntrack_event_cache(master_ct(ct) ?
645                                          IPCT_RELATED : IPCT_NEW, *pskb);
646                 return NF_ACCEPT;
647         }
648
649         NF_CT_STAT_INC(insert_failed);
650         write_unlock_bh(&nf_conntrack_lock);
651         return NF_DROP;
652 }
653
654 /* Returns true if a connection corresponds to the tuple (required
655    for NAT). */
656 int
657 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
658                          const struct nf_conn *ignored_conntrack)
659 {
660         struct nf_conntrack_tuple_hash *h;
661
662         read_lock_bh(&nf_conntrack_lock);
663         h = __nf_conntrack_find(tuple, ignored_conntrack);
664         read_unlock_bh(&nf_conntrack_lock);
665
666         return h != NULL;
667 }
668
669 /* There's a small race here where we may free a just-assured
670    connection.  Too bad: we're in trouble anyway. */
671 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
672 {
673         return !(test_bit(IPS_ASSURED_BIT,
674                           &nf_ct_tuplehash_to_ctrack(i)->status));
675 }
676
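/* Table full: try to make room by evicting a not-yet-assured entry from
 * this hash chain (roughly oldest-first).  Returns 1 if an entry was dropped. */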
677 static int early_drop(struct list_head *chain)
678 {
679         /* Traverse backwards: gives us oldest, which is roughly LRU */
680         struct nf_conntrack_tuple_hash *h;
681         struct nf_conn *ct = NULL;
682         int dropped = 0;
683
684         read_lock_bh(&nf_conntrack_lock);
685         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
686         if (h) {
687                 ct = nf_ct_tuplehash_to_ctrack(h);
688                 atomic_inc(&ct->ct_general.use);
689         }
690         read_unlock_bh(&nf_conntrack_lock);
691
692         if (!ct)
693                 return dropped;
694
695         if (del_timer(&ct->timeout)) {
696                 death_by_timeout((unsigned long)ct);
697                 dropped = 1;
698                 NF_CT_STAT_INC(early_drop);
699         }
700         nf_ct_put(ct);
701         return dropped;
702 }
703
704 static inline int helper_cmp(const struct nf_conntrack_helper *i,
705                              const struct nf_conntrack_tuple *rtuple)
706 {
707         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
708 }
709
710 static struct nf_conntrack_helper *
711 nf_ct_find_helper(const struct nf_conntrack_tuple *tuple)
712 {
713         return LIST_FIND(&helpers, helper_cmp,
714                          struct nf_conntrack_helper *,
715                          tuple);
716 }
717
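/* Allocate a conntrack from the slab cache that matches the features this
 * connection needs (e.g. helper storage), enforcing nf_conntrack_max via
 * early_drop().  Returns NULL or ERR_PTR(-ENOMEM) on failure. */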
718 static struct nf_conn *
719 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
720                      const struct nf_conntrack_tuple *repl,
721                      const struct nf_conntrack_l3proto *l3proto)
722 {
723         struct nf_conn *conntrack = NULL;
724         u_int32_t features = 0;
725
726         if (!nf_conntrack_hash_rnd_initted) {
727                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
728                 nf_conntrack_hash_rnd_initted = 1;
729         }
730
731         if (nf_conntrack_max
732             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
733                 unsigned int hash = hash_conntrack(orig);
734                 /* Try dropping from this hash chain. */
735                 if (!early_drop(&nf_conntrack_hash[hash])) {
736                         if (net_ratelimit())
737                                 printk(KERN_WARNING
738                                        "nf_conntrack: table full, dropping"
739                                        " packet.\n");
740                         return ERR_PTR(-ENOMEM);
741                 }
742         }
743
744         /*  find features needed by this conntrack. */
745         features = l3proto->get_features(orig);
746         read_lock_bh(&nf_conntrack_lock);
747         if (nf_ct_find_helper(repl) != NULL)
748                 features |= NF_CT_F_HELP;
749         read_unlock_bh(&nf_conntrack_lock);
750
751         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
752
753         read_lock_bh(&nf_ct_cache_lock);
754
755         if (!nf_ct_cache[features].use) {
756                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
757                         features);
758                 goto out;
759         }
760
761         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
762         if (conntrack == NULL) {
763                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
764                 goto out;
765         }
766
767         memset(conntrack, 0, nf_ct_cache[features].size);
768         conntrack->features = features;
769         if (nf_ct_cache[features].init_conntrack &&
770             nf_ct_cache[features].init_conntrack(conntrack, features) < 0) {
771                 DEBUGP("nf_conntrack_alloc: failed to init\n");
772                 kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
773                 conntrack = NULL;
774                 goto out;
775         }
776
777         atomic_set(&conntrack->ct_general.use, 1);
778         conntrack->ct_general.destroy = destroy_conntrack;
779         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
780         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
781         /* Don't set timer yet: wait for confirmation */
782         init_timer(&conntrack->timeout);
783         conntrack->timeout.data = (unsigned long)conntrack;
784         conntrack->timeout.function = death_by_timeout;
785
786         atomic_inc(&nf_conntrack_count);
787 out:
788         read_unlock_bh(&nf_ct_cache_lock);
789         return conntrack;
790 }
791
792 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
793                                    const struct nf_conntrack_tuple *repl)
794 {
795         struct nf_conntrack_l3proto *l3proto;
796
797         l3proto = nf_ct_find_l3proto(orig->src.l3num);
798         return __nf_conntrack_alloc(orig, repl, l3proto);
799 }
800
801 void nf_conntrack_free(struct nf_conn *conntrack)
802 {
803         u_int32_t features = conntrack->features;
804         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
805         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
806                conntrack);
807         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
808         atomic_dec(&nf_conntrack_count);
809 }
810
811 /* Allocate a new conntrack: we return -ENOMEM if classification
812    failed due to stress.  Otherwise it really is unclassifiable. */
813 static struct nf_conntrack_tuple_hash *
814 init_conntrack(const struct nf_conntrack_tuple *tuple,
815                struct nf_conntrack_l3proto *l3proto,
816                struct nf_conntrack_protocol *protocol,
817                struct sk_buff *skb,
818                unsigned int dataoff)
819 {
820         struct nf_conn *conntrack;
821         struct nf_conntrack_tuple repl_tuple;
822         struct nf_conntrack_expect *exp;
823
824         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
825                 DEBUGP("Can't invert tuple.\n");
826                 return NULL;
827         }
828
829         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
830         if (conntrack == NULL || IS_ERR(conntrack)) {
831                 DEBUGP("Can't allocate conntrack.\n");
832                 return (struct nf_conntrack_tuple_hash *)conntrack;
833         }
834
835         if (!protocol->new(conntrack, skb, dataoff)) {
836                 nf_conntrack_free(conntrack);
837                 DEBUGP("init conntrack: can't track with proto module\n");
838                 return NULL;
839         }
840
841         write_lock_bh(&nf_conntrack_lock);
842         exp = find_expectation(tuple);
843
844         if (exp) {
845                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
846                         conntrack, exp);
847                 /* Welcome, Mr. Bond.  We've been expecting you... */
848                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
849                 conntrack->master = exp->master;
850 #ifdef CONFIG_NF_CONNTRACK_MARK
851                 conntrack->mark = exp->master->mark;
852 #endif
853                 nf_conntrack_get(&conntrack->master->ct_general);
854                 NF_CT_STAT_INC(expect_new);
855         } else {
856                 conntrack->helper = nf_ct_find_helper(&repl_tuple);
857
858                 NF_CT_STAT_INC(new);
859         }
860
861         /* Overload tuple linked list to put us in unconfirmed list. */
862         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
863
864         write_unlock_bh(&nf_conntrack_lock);
865
866         if (exp) {
867                 if (exp->expectfn)
868                         exp->expectfn(conntrack, exp);
869                 nf_conntrack_expect_put(exp);
870         }
871
872         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
873 }
874
875 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
876 static inline struct nf_conn *
877 resolve_normal_ct(struct sk_buff *skb,
878                   unsigned int dataoff,
879                   u_int16_t l3num,
880                   u_int8_t protonum,
881                   struct nf_conntrack_l3proto *l3proto,
882                   struct nf_conntrack_protocol *proto,
883                   int *set_reply,
884                   enum ip_conntrack_info *ctinfo)
885 {
886         struct nf_conntrack_tuple tuple;
887         struct nf_conntrack_tuple_hash *h;
888         struct nf_conn *ct;
889
890         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
891                              dataoff, l3num, protonum, &tuple, l3proto,
892                              proto)) {
893                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
894                 return NULL;
895         }
896
897         /* look for tuple match */
898         h = nf_conntrack_find_get(&tuple, NULL);
899         if (!h) {
900                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
901                 if (!h)
902                         return NULL;
903                 if (IS_ERR(h))
904                         return (void *)h;
905         }
906         ct = nf_ct_tuplehash_to_ctrack(h);
907
908         /* It exists; we have (non-exclusive) reference. */
909         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
910                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
911                 /* Please set reply bit if this packet OK */
912                 *set_reply = 1;
913         } else {
914                 /* Once we've had two way comms, always ESTABLISHED. */
915                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
916                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
917                         *ctinfo = IP_CT_ESTABLISHED;
918                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
919                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
920                         *ctinfo = IP_CT_RELATED;
921                 } else {
922                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
923                         *ctinfo = IP_CT_NEW;
924                 }
925                 *set_reply = 0;
926         }
927         skb->nfct = &ct->ct_general;
928         skb->nfctinfo = *ctinfo;
929         return ct;
930 }
931
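/* Netfilter hook entry point: resolve (or create) the conntrack for this
 * packet, run the L4 protocol handler on it and return a netfilter verdict. */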
932 unsigned int
933 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
934 {
935         struct nf_conn *ct;
936         enum ip_conntrack_info ctinfo;
937         struct nf_conntrack_l3proto *l3proto;
938         struct nf_conntrack_protocol *proto;
939         unsigned int dataoff;
940         u_int8_t protonum;
941         int set_reply = 0;
942         int ret;
943
944         /* Previously seen (loopback or untracked)?  Ignore. */
945         if ((*pskb)->nfct) {
946                 NF_CT_STAT_INC(ignore);
947                 return NF_ACCEPT;
948         }
949
950         l3proto = nf_ct_find_l3proto((u_int16_t)pf);
951         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
952                 DEBUGP("not prepared to track yet or error occurred\n");
953                 return -ret;
954         }
955
956         proto = nf_ct_find_proto((u_int16_t)pf, protonum);
957
958         /* It may be a special packet: error, unclean, ...
959          * the inverse of the return code tells the netfilter
960          * core what to do with the packet. */
961         if (proto->error != NULL &&
962             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
963                 NF_CT_STAT_INC(error);
964                 NF_CT_STAT_INC(invalid);
965                 return -ret;
966         }
967
968         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
969                                &set_reply, &ctinfo);
970         if (!ct) {
971                 /* Not valid part of a connection */
972                 NF_CT_STAT_INC(invalid);
973                 return NF_ACCEPT;
974         }
975
976         if (IS_ERR(ct)) {
977                 /* Too stressed to deal. */
978                 NF_CT_STAT_INC(drop);
979                 return NF_DROP;
980         }
981
982         NF_CT_ASSERT((*pskb)->nfct);
983
984         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
985         if (ret < 0) {
986                 /* Invalid: inverse of the return code tells
987                  * the netfilter core what to do */
988                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
989                 nf_conntrack_put((*pskb)->nfct);
990                 (*pskb)->nfct = NULL;
991                 NF_CT_STAT_INC(invalid);
992                 return -ret;
993         }
994
995         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
996                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
997
998         return ret;
999 }
1000
1001 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1002                          const struct nf_conntrack_tuple *orig)
1003 {
1004         return nf_ct_invert_tuple(inverse, orig,
1005                                   nf_ct_find_l3proto(orig->src.l3num),
1006                                   nf_ct_find_proto(orig->src.l3num,
1007                                                    orig->dst.protonum));
1008 }
1009
1010 /* Would two expected things clash? */
1011 static inline int expect_clash(const struct nf_conntrack_expect *a,
1012                                const struct nf_conntrack_expect *b)
1013 {
1014         /* Part covered by intersection of masks must be unequal,
1015            otherwise they clash */
1016         struct nf_conntrack_tuple intersect_mask;
1017         int count;
1018
1019         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1020         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1021         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1022         intersect_mask.dst.protonum = a->mask.dst.protonum
1023                                         & b->mask.dst.protonum;
1024
1025         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1026                 intersect_mask.src.u3.all[count] =
1027                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1028         }
1029
1030         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1031                 intersect_mask.dst.u3.all[count] =
1032                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1033         }
1034
1035         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1036 }
1037
1038 static inline int expect_matches(const struct nf_conntrack_expect *a,
1039                                  const struct nf_conntrack_expect *b)
1040 {
1041         return a->master == b->master
1042                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1043                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1044 }
1045
1046 /* Generally a bad idea to call this: could have matched already. */
1047 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1048 {
1049         struct nf_conntrack_expect *i;
1050
1051         write_lock_bh(&nf_conntrack_lock);
1052         /* choose the oldest expectation to evict */
1053         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1054                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1055                         nf_ct_unlink_expect(i);
1056                         write_unlock_bh(&nf_conntrack_lock);
1057                         nf_conntrack_expect_put(i);
1058                         return;
1059                 }
1060         }
1061         write_unlock_bh(&nf_conntrack_lock);
1062 }
1063
1064 /* We don't increase the master conntrack refcount for non-fulfilled
1065  * conntracks. During the conntrack destruction, the expectations are
1066  * always killed before the conntrack itself */
1067 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1068 {
1069         struct nf_conntrack_expect *new;
1070
1071         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1072         if (!new) {
1073                 DEBUGP("expect_related: OOM allocating expect\n");
1074                 return NULL;
1075         }
1076         new->master = me;
1077         atomic_set(&new->use, 1);
1078         return new;
1079 }
1080
1081 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1082 {
1083         if (atomic_dec_and_test(&exp->use))
1084                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1085 }
1086
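/* Link a new expectation into the global list and arm its timeout based on
 * the master's helper timeout. */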
1087 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1088 {
1089         atomic_inc(&exp->use);
1090         exp->master->expecting++;
1091         list_add(&exp->list, &nf_conntrack_expect_list);
1092
1093         init_timer(&exp->timeout);
1094         exp->timeout.data = (unsigned long)exp;
1095         exp->timeout.function = expectation_timed_out;
1096         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
1097         add_timer(&exp->timeout);
1098
1099         atomic_inc(&exp->use);
1100         NF_CT_STAT_INC(expect_create);
1101 }
1102
1103 /* Race with expectations being used means we could have none to find; OK. */
1104 static void evict_oldest_expect(struct nf_conn *master)
1105 {
1106         struct nf_conntrack_expect *i;
1107
1108         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1109                 if (i->master == master) {
1110                         if (del_timer(&i->timeout)) {
1111                                 nf_ct_unlink_expect(i);
1112                                 nf_conntrack_expect_put(i);
1113                         }
1114                         break;
1115                 }
1116         }
1117 }
1118
1119 static inline int refresh_timer(struct nf_conntrack_expect *i)
1120 {
1121         if (!del_timer(&i->timeout))
1122                 return 0;
1123
1124         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1125         add_timer(&i->timeout);
1126         return 1;
1127 }
1128
1129 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1130 {
1131         struct nf_conntrack_expect *i;
1132         int ret;
1133
1134         DEBUGP("nf_conntrack_expect_related %p\n", expect->master);
1135         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1136         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1137
1138         write_lock_bh(&nf_conntrack_lock);
1139         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1140                 if (expect_matches(i, expect)) {
1141                         /* Refresh timer: if it's dying, ignore.. */
1142                         if (refresh_timer(i)) {
1143                                 ret = 0;
1144                                 goto out;
1145                         }
1146                 } else if (expect_clash(i, expect)) {
1147                         ret = -EBUSY;
1148                         goto out;
1149                 }
1150         }
1151         /* Will be over limit? */
1152         if (expect->master->helper->max_expected && 
1153             expect->master->expecting >= expect->master->helper->max_expected)
1154                 evict_oldest_expect(expect->master);
1155
1156         nf_conntrack_expect_insert(expect);
1157         nf_conntrack_expect_event(IPEXP_NEW, expect);
1158         ret = 0;
1159 out:
1160         write_unlock_bh(&nf_conntrack_lock);
1161         return ret;
1162 }
1163
1164 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1165    implicitly racy: see __nf_conntrack_confirm */
1166 void nf_conntrack_alter_reply(struct nf_conn *conntrack,
1167                               const struct nf_conntrack_tuple *newreply)
1168 {
1169         write_lock_bh(&nf_conntrack_lock);
1170         /* Should be unconfirmed, so not in hash table yet */
1171         NF_CT_ASSERT(!nf_ct_is_confirmed(conntrack));
1172
1173         DEBUGP("Altering reply tuple of %p to ", conntrack);
1174         NF_CT_DUMP_TUPLE(newreply);
1175
1176         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1177         if (!conntrack->master && conntrack->expecting == 0)
1178                 conntrack->helper = nf_ct_find_helper(newreply);
1179         write_unlock_bh(&nf_conntrack_lock);
1180 }
1181
1182 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1183 {
1184         int ret;
1185         BUG_ON(me->timeout == 0);
1186
1187         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1188                                           sizeof(struct nf_conn)
1189                                           + sizeof(union nf_conntrack_help)
1190                                           + __alignof__(union nf_conntrack_help),
1191                                           init_conntrack_for_helper);
1192         if (ret < 0) {
1193                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1194                 return ret;
1195         }
1196         write_lock_bh(&nf_conntrack_lock);
1197         list_prepend(&helpers, me);
1198         write_unlock_bh(&nf_conntrack_lock);
1199
1200         return 0;
1201 }
1202
1203 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1204                          const struct nf_conntrack_helper *me)
1205 {
1206         if (nf_ct_tuplehash_to_ctrack(i)->helper == me) {
1207                 nf_conntrack_event(IPCT_HELPER, nf_ct_tuplehash_to_ctrack(i));
1208                 nf_ct_tuplehash_to_ctrack(i)->helper = NULL;
1209         }
1210         return 0;
1211 }
1212
1213 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1214 {
1215         unsigned int i;
1216         struct nf_conntrack_expect *exp, *tmp;
1217
1218         /* Need write lock here, to delete helper. */
1219         write_lock_bh(&nf_conntrack_lock);
1220         LIST_DELETE(&helpers, me);
1221
1222         /* Get rid of expectations */
1223         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1224                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1225                         nf_ct_unlink_expect(exp);
1226                         nf_conntrack_expect_put(exp);
1227                 }
1228         }
1229
1230         /* Get rid of expecteds, set helpers to NULL. */
1231         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1232         for (i = 0; i < nf_conntrack_htable_size; i++)
1233                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1234                             struct nf_conntrack_tuple_hash *, me);
1235         write_unlock_bh(&nf_conntrack_lock);
1236
1237         /* Someone could be still looking at the helper in a bh. */
1238         synchronize_net();
1239 }
1240
1241 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1242 void __nf_ct_refresh_acct(struct nf_conn *ct,
1243                           enum ip_conntrack_info ctinfo,
1244                           const struct sk_buff *skb,
1245                           unsigned long extra_jiffies,
1246                           int do_acct)
1247 {
1248         int event = 0;
1249
1250         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1251         NF_CT_ASSERT(skb);
1252
1253         write_lock_bh(&nf_conntrack_lock);
1254
1255         /* If not in hash table, timer will not be active yet */
1256         if (!nf_ct_is_confirmed(ct)) {
1257                 ct->timeout.expires = extra_jiffies;
1258                 event = IPCT_REFRESH;
1259         } else {
1260                 /* Need del_timer for race avoidance (may already be dying). */
1261                 if (del_timer(&ct->timeout)) {
1262                         ct->timeout.expires = jiffies + extra_jiffies;
1263                         add_timer(&ct->timeout);
1264                         event = IPCT_REFRESH;
1265                 }
1266         }
1267
1268 #ifdef CONFIG_NF_CT_ACCT
1269         if (do_acct) {
1270                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1271                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1272                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1273                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1274                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1275                         event |= IPCT_COUNTER_FILLING;
1276         }
1277 #endif
1278
1279         write_unlock_bh(&nf_conntrack_lock);
1280
1281         /* must be unlocked when calling event cache */
1282         if (event)
1283                 nf_conntrack_event_cache(event, skb);
1284 }
1285
1286 /* Used by ipt_REJECT and ip6t_REJECT. */
1287 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1288 {
1289         struct nf_conn *ct;
1290         enum ip_conntrack_info ctinfo;
1291
1292         /* This ICMP is in reverse direction to the packet which caused it */
1293         ct = nf_ct_get(skb, &ctinfo);
1294         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1295                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1296         else
1297                 ctinfo = IP_CT_RELATED;
1298
1299         /* Attach to new skbuff, and increment count */
1300         nskb->nfct = &ct->ct_general;
1301         nskb->nfctinfo = ctinfo;
1302         nf_conntrack_get(nskb->nfct);
1303 }
1304
1305 static inline int
1306 do_iter(const struct nf_conntrack_tuple_hash *i,
1307         int (*iter)(struct nf_conn *i, void *data),
1308         void *data)
1309 {
1310         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1311 }
1312
1313 /* Bring out ya dead! */
1314 static struct nf_conntrack_tuple_hash *
1315 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1316                 void *data, unsigned int *bucket)
1317 {
1318         struct nf_conntrack_tuple_hash *h = NULL;
1319
1320         write_lock_bh(&nf_conntrack_lock);
1321         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1322                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1323                                 struct nf_conntrack_tuple_hash *, iter, data);
1324                 if (h)
1325                         break;
1326         }
1327         if (!h)
1328                 h = LIST_FIND_W(&unconfirmed, do_iter,
1329                                 struct nf_conntrack_tuple_hash *, iter, data);
1330         if (h)
1331                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1332         write_unlock_bh(&nf_conntrack_lock);
1333
1334         return h;
1335 }
1336
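/* Walk every conntrack (hashed and unconfirmed) and kill each one for which
 * iter() returns true; nf_conntrack_cleanup() uses this via kill_all(). */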
1337 void
1338 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1339 {
1340         struct nf_conntrack_tuple_hash *h;
1341         unsigned int bucket = 0;
1342
1343         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1344                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1345                 /* Time to push up daisies... */
1346                 if (del_timer(&ct->timeout))
1347                         death_by_timeout((unsigned long)ct);
1348                 /* ... else the timer will get him soon. */
1349
1350                 nf_ct_put(ct);
1351         }
1352 }
1353
1354 static int kill_all(struct nf_conn *i, void *data)
1355 {
1356         return 1;
1357 }
1358
1359 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1360 {
1361         if (vmalloced)
1362                 vfree(hash);
1363         else
1364                 free_pages((unsigned long)hash, 
1365                            get_order(sizeof(struct list_head) * size));
1366 }
1367
1368 /* Mishearing the voices in his head, our hero wonders how he's
1369    supposed to kill the mall. */
1370 void nf_conntrack_cleanup(void)
1371 {
1372         int i;
1373
1374         /* This makes sure all current packets have passed through
1375            netfilter framework.  Roll on, two-stage module
1376            delete... */
1377         synchronize_net();
1378
1379         nf_ct_event_cache_flush();
1380  i_see_dead_people:
1381         nf_ct_iterate_cleanup(kill_all, NULL);
1382         if (atomic_read(&nf_conntrack_count) != 0) {
1383                 schedule();
1384                 goto i_see_dead_people;
1385         }
1386
1387         for (i = 0; i < NF_CT_F_NUM; i++) {
1388                 if (nf_ct_cache[i].use == 0)
1389                         continue;
1390
1391                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1392                 nf_ct_cache[i].use = 1;
1393                 nf_conntrack_unregister_cache(i);
1394         }
1395         kmem_cache_destroy(nf_conntrack_expect_cachep);
1396         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1397                             nf_conntrack_htable_size);
1398 }
1399
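/* Allocate the bucket array with __get_free_pages(), falling back to
 * vmalloc() for large tables; *vmalloced records which allocator was used
 * so free_conntrack_hash() can release it the same way. */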
1400 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1401 {
1402         struct list_head *hash;
1403         unsigned int i;
1404
1405         *vmalloced = 0; 
1406         hash = (void*)__get_free_pages(GFP_KERNEL, 
1407                                        get_order(sizeof(struct list_head)
1408                                                  * size));
1409         if (!hash) { 
1410                 *vmalloced = 1;
1411                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1412                 hash = vmalloc(sizeof(struct list_head) * size);
1413         }
1414
1415         if (hash)
1416                 for (i = 0; i < size; i++) 
1417                         INIT_LIST_HEAD(&hash[i]);
1418
1419         return hash;
1420 }
1421
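/* "hashsize" module parameter handler: allocate a new table, rehash all
 * existing entries with a fresh random seed and free the old table. */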
1422 int set_hashsize(const char *val, struct kernel_param *kp)
1423 {
1424         int i, bucket, hashsize, vmalloced;
1425         int old_vmalloced, old_size;
1426         int rnd;
1427         struct list_head *hash, *old_hash;
1428         struct nf_conntrack_tuple_hash *h;
1429
1430         /* On boot, we can set this without any fancy locking. */
1431         if (!nf_conntrack_htable_size)
1432                 return param_set_uint(val, kp);
1433
1434         hashsize = simple_strtol(val, NULL, 0);
1435         if (!hashsize)
1436                 return -EINVAL;
1437
1438         hash = alloc_hashtable(hashsize, &vmalloced);
1439         if (!hash)
1440                 return -ENOMEM;
1441
1442         /* We have to rehash for the new table anyway, so we can also
1443          * use a new random seed */
1444         get_random_bytes(&rnd, 4);
1445
1446         write_lock_bh(&nf_conntrack_lock);
1447         for (i = 0; i < nf_conntrack_htable_size; i++) {
1448                 while (!list_empty(&nf_conntrack_hash[i])) {
1449                         h = list_entry(nf_conntrack_hash[i].next,
1450                                        struct nf_conntrack_tuple_hash, list);
1451                         list_del(&h->list);
1452                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1453                         list_add_tail(&h->list, &hash[bucket]);
1454                 }
1455         }
1456         old_size = nf_conntrack_htable_size;
1457         old_vmalloced = nf_conntrack_vmalloc;
1458         old_hash = nf_conntrack_hash;
1459
1460         nf_conntrack_htable_size = hashsize;
1461         nf_conntrack_vmalloc = vmalloced;
1462         nf_conntrack_hash = hash;
1463         nf_conntrack_hash_rnd = rnd;
1464         write_unlock_bh(&nf_conntrack_lock);
1465
1466         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1467         return 0;
1468 }
1469
1470 module_param_call(hashsize, set_hashsize, param_get_uint,
1471                   &nf_conntrack_htable_size, 0600);
1472
1473 int __init nf_conntrack_init(void)
1474 {
1475         unsigned int i;
1476         int ret;
1477
1478         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1479          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1480         if (!nf_conntrack_htable_size) {
1481                 nf_conntrack_htable_size
1482                         = (((num_physpages << PAGE_SHIFT) / 16384)
1483                            / sizeof(struct list_head));
1484                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1485                         nf_conntrack_htable_size = 8192;
1486                 if (nf_conntrack_htable_size < 16)
1487                         nf_conntrack_htable_size = 16;
1488         }
1489         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1490
1491         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1492                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1493                nf_conntrack_max);
1494
1495         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1496                                             &nf_conntrack_vmalloc);
1497         if (!nf_conntrack_hash) {
1498                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1499                 goto err_out;
1500         }
1501
1502         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1503                                           sizeof(struct nf_conn), NULL);
1504         if (ret < 0) {
1505                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1506                 goto err_free_hash;
1507         }
1508
1509         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1510                                         sizeof(struct nf_conntrack_expect),
1511                                         0, 0, NULL, NULL);
1512         if (!nf_conntrack_expect_cachep) {
1513                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1514                 goto err_free_conntrack_slab;
1515         }
1516
1517         /* Don't NEED lock here, but good form anyway. */
1518         write_lock_bh(&nf_conntrack_lock);
1519         for (i = 0; i < PF_MAX; i++)
1520                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1521         write_unlock_bh(&nf_conntrack_lock);
1522
1523         /* Set up fake conntrack:
1524             - to never be deleted, not in any hashes */
1525         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1526         /*  - and make it look like a confirmed connection */
1527         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1528
1529         return ret;
1530
1531 err_free_conntrack_slab:
1532         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1533 err_free_hash:
1534         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1535                             nf_conntrack_htable_size);
1536 err_out:
1537         return -ENOMEM;
1538 }