e3022226a4084bdea58eea7b91c15719bbe6aa9f
net/netfilter/nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' the way they were originally intended
26  * 26 Feb 2006: Pablo Neira Ayuso <pablo@eurodev.net>
27  *      - add support for L3 protocol module load on demand.
28  *
29  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
30  */
31
32 #include <linux/config.h>
33 #include <linux/types.h>
34 #include <linux/netfilter.h>
35 #include <linux/module.h>
36 #include <linux/skbuff.h>
37 #include <linux/proc_fs.h>
38 #include <linux/vmalloc.h>
39 #include <linux/stddef.h>
40 #include <linux/slab.h>
41 #include <linux/random.h>
42 #include <linux/jhash.h>
43 #include <linux/err.h>
44 #include <linux/percpu.h>
45 #include <linux/moduleparam.h>
46 #include <linux/notifier.h>
47 #include <linux/kernel.h>
48 #include <linux/netdevice.h>
49 #include <linux/socket.h>
50
51 /* This rwlock protects the main hash table, protocol/helper/expected
52    registrations, conntrack timers. */
53 #define ASSERT_READ_LOCK(x)
54 #define ASSERT_WRITE_LOCK(x)
55
56 #include <net/netfilter/nf_conntrack.h>
57 #include <net/netfilter/nf_conntrack_l3proto.h>
58 #include <net/netfilter/nf_conntrack_protocol.h>
59 #include <net/netfilter/nf_conntrack_helper.h>
60 #include <net/netfilter/nf_conntrack_core.h>
61 #include <linux/netfilter_ipv4/listhelp.h>
62
63 #define NF_CONNTRACK_VERSION    "0.5.0"
64
65 #if 0
66 #define DEBUGP printk
67 #else
68 #define DEBUGP(format, args...)
69 #endif
70
71 DEFINE_RWLOCK(nf_conntrack_lock);
72
73 /* nf_conntrack_standalone needs this */
74 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
75
76 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
77 LIST_HEAD(nf_conntrack_expect_list);
78 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
79 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
80 static LIST_HEAD(helpers);
81 unsigned int nf_conntrack_htable_size = 0;
82 int nf_conntrack_max;
83 struct list_head *nf_conntrack_hash;
84 static kmem_cache_t *nf_conntrack_expect_cachep;
85 struct nf_conn nf_conntrack_untracked;
86 unsigned int nf_ct_log_invalid;
87 static LIST_HEAD(unconfirmed);
88 static int nf_conntrack_vmalloc;
89
90 static unsigned int nf_conntrack_next_id;
91 static unsigned int nf_conntrack_expect_next_id;
92 #ifdef CONFIG_NF_CONNTRACK_EVENTS
93 ATOMIC_NOTIFIER_HEAD(nf_conntrack_chain);
94 ATOMIC_NOTIFIER_HEAD(nf_conntrack_expect_chain);
95
96 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
97
98 /* deliver cached events and clear cache entry - must be called with locally
99  * disabled softirqs */
100 static inline void
101 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
102 {
103         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
104         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
105             && ecache->events)
106                 atomic_notifier_call_chain(&nf_conntrack_chain, ecache->events,
107                                     ecache->ct);
108
109         ecache->events = 0;
110         nf_ct_put(ecache->ct);
111         ecache->ct = NULL;
112 }
113
114 /* Deliver all cached events for a particular conntrack. This is called
115  * by code prior to async packet handling for freeing the skb */
116 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
117 {
118         struct nf_conntrack_ecache *ecache;
119
120         local_bh_disable();
121         ecache = &__get_cpu_var(nf_conntrack_ecache);
122         if (ecache->ct == ct)
123                 __nf_ct_deliver_cached_events(ecache);
124         local_bh_enable();
125 }
126
127 /* Deliver events cached for a previous conntrack, then start caching for this one */
128 void __nf_ct_event_cache_init(struct nf_conn *ct)
129 {
130         struct nf_conntrack_ecache *ecache;
131         
132         /* take care of delivering potentially old events */
133         ecache = &__get_cpu_var(nf_conntrack_ecache);
134         BUG_ON(ecache->ct == ct);
135         if (ecache->ct)
136                 __nf_ct_deliver_cached_events(ecache);
137         /* initialize for this conntrack/packet */
138         ecache->ct = ct;
139         nf_conntrack_get(&ct->ct_general);
140 }
141
142 /* flush the event cache - touches other CPUs' data and must not be called
143  * while packets are still passing through the code */
144 static void nf_ct_event_cache_flush(void)
145 {
146         struct nf_conntrack_ecache *ecache;
147         int cpu;
148
149         for_each_possible_cpu(cpu) {
150                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
151                 if (ecache->ct)
152                         nf_ct_put(ecache->ct);
153         }
154 }
155 #else
156 static inline void nf_ct_event_cache_flush(void) {}
157 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
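/*
 * For illustration, a sketch of how an event consumer (ctnetlink, for
 * instance) would hook into the notifier chains above.  The callback and
 * notifier_block names are made up; the notifier calls themselves are the
 * standard ones from <linux/notifier.h>:
 *
 *	static int my_ct_event(struct notifier_block *this,
 *			       unsigned long events, void *ptr)
 *	{
 *		struct nf_conn *ct = ptr;
 *
 *		if (events & IPCT_DESTROY)
 *			DEBUGP("conntrack %p destroyed\n", ct);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_ct_nb = {
 *		.notifier_call = my_ct_event,
 *	};
 *
 *	atomic_notifier_chain_register(&nf_conntrack_chain, &my_ct_nb);
 */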
158
159 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
160 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
161
162 /*
163  * This scheme offers various sizes of "struct nf_conn" depending on
164  * the features required (helper, nat, ...)
165  */
166
167 #define NF_CT_FEATURES_NAMELEN  256
168 static struct {
169         /* name of the slab cache, printed in /proc/slabinfo */
170         char *name;
171
172         /* size of slab cache */
173         size_t size;
174
175         /* slab cache pointer */
176         kmem_cache_t *cachep;
177
178         /* allocated slab cache + modules which use this slab cache */
179         int use;
180
181         /* Initialization */
182         int (*init_conntrack)(struct nf_conn *, u_int32_t);
183
184 } nf_ct_cache[NF_CT_F_NUM];
185
186 /* protect members of nf_ct_cache except "use" */
187 DEFINE_RWLOCK(nf_ct_cache_lock);
188
189 /* This avoids calling kmem_cache_create() with the same name simultaneously */
190 static DEFINE_MUTEX(nf_ct_cache_mutex);
191
192 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
193 struct nf_conntrack_protocol *
194 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
195 {
196         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
197                 return &nf_conntrack_generic_protocol;
198
199         return nf_ct_protos[l3proto][protocol];
200 }
201
202 /* this is guaranteed to always return a valid protocol helper, since
203  * it falls back to generic_protocol */
204 struct nf_conntrack_protocol *
205 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
206 {
207         struct nf_conntrack_protocol *p;
208
209         preempt_disable();
210         p = __nf_ct_proto_find(l3proto, protocol);
211         if (!try_module_get(p->me))
212                 p = &nf_conntrack_generic_protocol;
213         preempt_enable();
214         
215         return p;
216 }
217
218 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
219 {
220         module_put(p->me);
221 }
222
223 struct nf_conntrack_l3proto *
224 nf_ct_l3proto_find_get(u_int16_t l3proto)
225 {
226         struct nf_conntrack_l3proto *p;
227
228         preempt_disable();
229         p = __nf_ct_l3proto_find(l3proto);
230         if (!try_module_get(p->me))
231                 p = &nf_conntrack_generic_l3proto;
232         preempt_enable();
233
234         return p;
235 }
236
237 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
238 {
239         module_put(p->me);
240 }
241
242 int
243 nf_ct_l3proto_try_module_get(unsigned short l3proto)
244 {
245         int ret;
246         struct nf_conntrack_l3proto *p;
247
248 retry:  p = nf_ct_l3proto_find_get(l3proto);
249         if (p == &nf_conntrack_generic_l3proto) {
250                 ret = request_module("nf_conntrack-%d", l3proto);
251                 if (!ret)
252                         goto retry;
253
254                 return -EPROTOTYPE;
255         }
256
257         return 0;
258 }
259
260 void nf_ct_l3proto_module_put(unsigned short l3proto)
261 {
262         struct nf_conntrack_l3proto *p;
263
264         preempt_disable();
265         p = __nf_ct_l3proto_find(l3proto);
266         preempt_enable();
267
268         module_put(p->me);
269 }
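/*
 * For illustration, a user that needs the layer-3 tracker for a family to
 * stay loaded (ctnetlink does this) would typically bracket its work with
 * the two calls above, roughly:
 *
 *	if (nf_ct_l3proto_try_module_get(AF_INET) < 0)
 *		return -EPROTOTYPE;
 *	...
 *	nf_ct_l3proto_module_put(AF_INET);
 */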
270
271 static int nf_conntrack_hash_rnd_initted;
272 static unsigned int nf_conntrack_hash_rnd;
273
274 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
275                                   unsigned int size, unsigned int rnd)
276 {
277         unsigned int a, b;
278         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
279                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
280         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
281                         (tuple->src.u.all << 16) | tuple->dst.u.all);
282
283         return jhash_2words(a, b, rnd) % size;
284 }
285
286 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
287 {
288         return __hash_conntrack(tuple, nf_conntrack_htable_size,
289                                 nf_conntrack_hash_rnd);
290 }
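/*
 * For illustration: the value returned by hash_conntrack() is just an index
 * into nf_conntrack_hash[], so a lookup walks a single bucket, roughly:
 *
 *	unsigned int bucket = hash_conntrack(tuple);
 *	struct nf_conntrack_tuple_hash *h;
 *
 *	list_for_each_entry(h, &nf_conntrack_hash[bucket], list)
 *		if (nf_ct_tuple_equal(tuple, &h->tuple))
 *			return h;
 *
 * __nf_conntrack_find() below is the real version of this walk.
 */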
291
292 int nf_conntrack_register_cache(u_int32_t features, const char *name,
293                                 size_t size)
294 {
295         int ret = 0;
296         char *cache_name;
297         kmem_cache_t *cachep;
298
299         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
300                features, name, size);
301
302         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
303                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
304                         features);
305                 return -EINVAL;
306         }
307
308         mutex_lock(&nf_ct_cache_mutex);
309
310         write_lock_bh(&nf_ct_cache_lock);
311         /* e.g: multiple helpers are loaded */
312         if (nf_ct_cache[features].use > 0) {
313                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
314                 if ((!strncmp(nf_ct_cache[features].name, name,
315                               NF_CT_FEATURES_NAMELEN))
316                     && nf_ct_cache[features].size == size) {
317                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
318                         nf_ct_cache[features].use++;
319                         ret = 0;
320                 } else
321                         ret = -EBUSY;
322
323                 write_unlock_bh(&nf_ct_cache_lock);
324                 mutex_unlock(&nf_ct_cache_mutex);
325                 return ret;
326         }
327         write_unlock_bh(&nf_ct_cache_lock);
328
329         /*
330          * The memory holding the slab cache name must stay alive until
331          * the cache is destroyed.
332          */
333         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
334         if (cache_name == NULL) {
335                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
336                 ret = -ENOMEM;
337                 goto out_up_mutex;
338         }
339
340         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
341                                                 >= NF_CT_FEATURES_NAMELEN) {
342                 printk("nf_conntrack_register_cache: name too long\n");
343                 ret = -EINVAL;
344                 goto out_free_name;
345         }
346
347         cachep = kmem_cache_create(cache_name, size, 0, 0,
348                                    NULL, NULL);
349         if (!cachep) {
350                 printk("nf_conntrack_register_cache: Can't create slab cache "
351                        "for the features = 0x%x\n", features);
352                 ret = -ENOMEM;
353                 goto out_free_name;
354         }
355
356         write_lock_bh(&nf_ct_cache_lock);
357         nf_ct_cache[features].use = 1;
358         nf_ct_cache[features].size = size;
359         nf_ct_cache[features].cachep = cachep;
360         nf_ct_cache[features].name = cache_name;
361         write_unlock_bh(&nf_ct_cache_lock);
362
363         goto out_up_mutex;
364
365 out_free_name:
366         kfree(cache_name);
367 out_up_mutex:
368         mutex_unlock(&nf_ct_cache_mutex);
369         return ret;
370 }
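/*
 * For illustration, a user of the "features" scheme registers the slab it
 * needs before allocating conntracks with those features; the helper code
 * later in this file does exactly this:
 *
 *	ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
 *					  sizeof(struct nf_conn)
 *					  + sizeof(struct nf_conn_help)
 *					  + __alignof__(struct nf_conn_help));
 *
 * nf_conntrack_unregister_cache() below is the matching release.
 */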
371
372 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
373 void nf_conntrack_unregister_cache(u_int32_t features)
374 {
375         kmem_cache_t *cachep;
376         char *name;
377
378         /*
379          * This ensures that kmem_cache_create() isn't called before the
380          * slab cache is destroyed.
381          */
382         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
383         mutex_lock(&nf_ct_cache_mutex);
384
385         write_lock_bh(&nf_ct_cache_lock);
386         if (--nf_ct_cache[features].use > 0) {
387                 write_unlock_bh(&nf_ct_cache_lock);
388                 mutex_unlock(&nf_ct_cache_mutex);
389                 return;
390         }
391         cachep = nf_ct_cache[features].cachep;
392         name = nf_ct_cache[features].name;
393         nf_ct_cache[features].cachep = NULL;
394         nf_ct_cache[features].name = NULL;
395         nf_ct_cache[features].size = 0;
396         write_unlock_bh(&nf_ct_cache_lock);
397
398         synchronize_net();
399
400         kmem_cache_destroy(cachep);
401         kfree(name);
402
403         mutex_unlock(&nf_ct_cache_mutex);
404 }
405
406 int
407 nf_ct_get_tuple(const struct sk_buff *skb,
408                 unsigned int nhoff,
409                 unsigned int dataoff,
410                 u_int16_t l3num,
411                 u_int8_t protonum,
412                 struct nf_conntrack_tuple *tuple,
413                 const struct nf_conntrack_l3proto *l3proto,
414                 const struct nf_conntrack_protocol *protocol)
415 {
416         NF_CT_TUPLE_U_BLANK(tuple);
417
418         tuple->src.l3num = l3num;
419         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
420                 return 0;
421
422         tuple->dst.protonum = protonum;
423         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
424
425         return protocol->pkt_to_tuple(skb, dataoff, tuple);
426 }
427
428 int
429 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
430                    const struct nf_conntrack_tuple *orig,
431                    const struct nf_conntrack_l3proto *l3proto,
432                    const struct nf_conntrack_protocol *protocol)
433 {
434         NF_CT_TUPLE_U_BLANK(inverse);
435
436         inverse->src.l3num = orig->src.l3num;
437         if (l3proto->invert_tuple(inverse, orig) == 0)
438                 return 0;
439
440         inverse->dst.dir = !orig->dst.dir;
441
442         inverse->dst.protonum = orig->dst.protonum;
443         return protocol->invert_tuple(inverse, orig);
444 }
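/*
 * For illustration, the two helpers above are used together when a new
 * connection is first seen; init_conntrack()/resolve_normal_ct() below do
 * roughly:
 *
 *	struct nf_conntrack_tuple tuple, repl;
 *
 *	if (!nf_ct_get_tuple(skb, nhoff, dataoff, l3num, protonum,
 *			     &tuple, l3proto, protocol))
 *		return NULL;
 *	if (!nf_ct_invert_tuple(&repl, &tuple, l3proto, protocol))
 *		return NULL;
 *
 * "tuple" then keys the ORIGINAL direction and "repl" the REPLY direction.
 */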
445
446 /* nf_conntrack_expect helper functions */
447 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
448 {
449         struct nf_conn_help *master_help = nfct_help(exp->master);
450
451         NF_CT_ASSERT(master_help);
452         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
453         NF_CT_ASSERT(!timer_pending(&exp->timeout));
454
455         list_del(&exp->list);
456         NF_CT_STAT_INC(expect_delete);
457         master_help->expecting--;
458         nf_conntrack_expect_put(exp);
459 }
460
461 static void expectation_timed_out(unsigned long ul_expect)
462 {
463         struct nf_conntrack_expect *exp = (void *)ul_expect;
464
465         write_lock_bh(&nf_conntrack_lock);
466         nf_ct_unlink_expect(exp);
467         write_unlock_bh(&nf_conntrack_lock);
468         nf_conntrack_expect_put(exp);
469 }
470
471 struct nf_conntrack_expect *
472 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
473 {
474         struct nf_conntrack_expect *i;
475         
476         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
477                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
478                         atomic_inc(&i->use);
479                         return i;
480                 }
481         }
482         return NULL;
483 }
484
485 /* Just find an expectation corresponding to a tuple. */
486 struct nf_conntrack_expect *
487 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
488 {
489         struct nf_conntrack_expect *i;
490         
491         read_lock_bh(&nf_conntrack_lock);
492         i = __nf_conntrack_expect_find(tuple);
493         read_unlock_bh(&nf_conntrack_lock);
494
495         return i;
496 }
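/*
 * For illustration: both lookups above return the expectation with its use
 * count raised, so a successful call must be balanced with
 * nf_conntrack_expect_put(), roughly:
 *
 *	exp = nf_conntrack_expect_find(&tuple);
 *	if (exp) {
 *		...
 *		nf_conntrack_expect_put(exp);
 *	}
 */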
497
498 /* If an expectation for this connection is found, it is deleted from
499  * the global list and returned. */
500 static struct nf_conntrack_expect *
501 find_expectation(const struct nf_conntrack_tuple *tuple)
502 {
503         struct nf_conntrack_expect *i;
504
505         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
506         /* If the master is not in the hash table yet (i.e. the packet
507            hasn't left this machine yet), how can the other end know about
508            the expected connection?  Hence these are not the droids you are
509            looking for (if the master ct never got confirmed, we'd hold a
510            reference to it and weird things would happen to future packets). */
511                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
512                     && nf_ct_is_confirmed(i->master)) {
513                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
514                                 atomic_inc(&i->use);
515                                 return i;
516                         } else if (del_timer(&i->timeout)) {
517                                 nf_ct_unlink_expect(i);
518                                 return i;
519                         }
520                 }
521         }
522         return NULL;
523 }
524
525 /* delete all expectations for this conntrack */
526 void nf_ct_remove_expectations(struct nf_conn *ct)
527 {
528         struct nf_conntrack_expect *i, *tmp;
529         struct nf_conn_help *help = nfct_help(ct);
530
531         /* Optimization: most connections never expect any others. */
532         if (!help || help->expecting == 0)
533                 return;
534
535         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
536                 if (i->master == ct && del_timer(&i->timeout)) {
537                         nf_ct_unlink_expect(i);
538                         nf_conntrack_expect_put(i);
539                 }
540         }
541 }
542
543 static void
544 clean_from_lists(struct nf_conn *ct)
545 {
546         unsigned int ho, hr;
547         
548         DEBUGP("clean_from_lists(%p)\n", ct);
549         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
550
551         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
552         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
553         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
554         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
555
556         /* Destroy all pending expectations */
557         nf_ct_remove_expectations(ct);
558 }
559
560 static void
561 destroy_conntrack(struct nf_conntrack *nfct)
562 {
563         struct nf_conn *ct = (struct nf_conn *)nfct;
564         struct nf_conntrack_l3proto *l3proto;
565         struct nf_conntrack_protocol *proto;
566
567         DEBUGP("destroy_conntrack(%p)\n", ct);
568         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
569         NF_CT_ASSERT(!timer_pending(&ct->timeout));
570
571         nf_conntrack_event(IPCT_DESTROY, ct);
572         set_bit(IPS_DYING_BIT, &ct->status);
573
574         /* To make sure we don't get any weird locking issues here:
575          * destroy_conntrack() MUST NOT be called with a write lock
576          * to nf_conntrack_lock!!! -HW */
577         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
578         if (l3proto && l3proto->destroy)
579                 l3proto->destroy(ct);
580
581         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
582         if (proto && proto->destroy)
583                 proto->destroy(ct);
584
585         if (nf_conntrack_destroyed)
586                 nf_conntrack_destroyed(ct);
587
588         write_lock_bh(&nf_conntrack_lock);
589         /* Expectations will have been removed in clean_from_lists,
590          * except that TFTP can create an expectation on the first packet,
591          * before the connection is in the list, so we need to clean here,
592          * too. */
593         nf_ct_remove_expectations(ct);
594
595         /* We overload first tuple to link into unconfirmed list. */
596         if (!nf_ct_is_confirmed(ct)) {
597                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
598                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
599         }
600
601         NF_CT_STAT_INC(delete);
602         write_unlock_bh(&nf_conntrack_lock);
603
604         if (ct->master)
605                 nf_ct_put(ct->master);
606
607         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
608         nf_conntrack_free(ct);
609 }
610
611 static void death_by_timeout(unsigned long ul_conntrack)
612 {
613         struct nf_conn *ct = (void *)ul_conntrack;
614
615         write_lock_bh(&nf_conntrack_lock);
616         /* Inside lock so preempt is disabled on module removal path.
617          * Otherwise we can get spurious warnings. */
618         NF_CT_STAT_INC(delete_list);
619         clean_from_lists(ct);
620         write_unlock_bh(&nf_conntrack_lock);
621         nf_ct_put(ct);
622 }
623
624 static inline int
625 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
626                     const struct nf_conntrack_tuple *tuple,
627                     const struct nf_conn *ignored_conntrack)
628 {
629         ASSERT_READ_LOCK(&nf_conntrack_lock);
630         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
631                 && nf_ct_tuple_equal(tuple, &i->tuple);
632 }
633
634 struct nf_conntrack_tuple_hash *
635 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
636                     const struct nf_conn *ignored_conntrack)
637 {
638         struct nf_conntrack_tuple_hash *h;
639         unsigned int hash = hash_conntrack(tuple);
640
641         ASSERT_READ_LOCK(&nf_conntrack_lock);
642         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
643                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
644                         NF_CT_STAT_INC(found);
645                         return h;
646                 }
647                 NF_CT_STAT_INC(searched);
648         }
649
650         return NULL;
651 }
652
653 /* Find a connection corresponding to a tuple. */
654 struct nf_conntrack_tuple_hash *
655 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
656                       const struct nf_conn *ignored_conntrack)
657 {
658         struct nf_conntrack_tuple_hash *h;
659
660         read_lock_bh(&nf_conntrack_lock);
661         h = __nf_conntrack_find(tuple, ignored_conntrack);
662         if (h)
663                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
664         read_unlock_bh(&nf_conntrack_lock);
665
666         return h;
667 }
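/*
 * For illustration, nf_conntrack_find_get() returns the tuple hash with the
 * conntrack's reference count raised; callers convert and release it like
 * this:
 *
 *	h = nf_conntrack_find_get(&tuple, NULL);
 *	if (h) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *		...
 *		nf_ct_put(ct);
 *	}
 */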
668
669 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
670                                        unsigned int hash,
671                                        unsigned int repl_hash) 
672 {
673         ct->id = ++nf_conntrack_next_id;
674         list_prepend(&nf_conntrack_hash[hash],
675                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
676         list_prepend(&nf_conntrack_hash[repl_hash],
677                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
678 }
679
680 void nf_conntrack_hash_insert(struct nf_conn *ct)
681 {
682         unsigned int hash, repl_hash;
683
684         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
685         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
686
687         write_lock_bh(&nf_conntrack_lock);
688         __nf_conntrack_hash_insert(ct, hash, repl_hash);
689         write_unlock_bh(&nf_conntrack_lock);
690 }
691
692 /* Confirm a connection given skb; places it in hash table */
693 int
694 __nf_conntrack_confirm(struct sk_buff **pskb)
695 {
696         unsigned int hash, repl_hash;
697         struct nf_conn *ct;
698         enum ip_conntrack_info ctinfo;
699
700         ct = nf_ct_get(*pskb, &ctinfo);
701
702         /* ipt_REJECT uses nf_conntrack_attach to attach related
703            ICMP/TCP RST packets in the other direction.  The actual packet
704            which created the connection will be IP_CT_NEW, or IP_CT_RELATED
705            for an expected connection. */
706         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
707                 return NF_ACCEPT;
708
709         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
710         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
711
712         /* We're not in the hash table, and we refuse to set up related
713            connections for unconfirmed conns.  But packet copies and
714            REJECT will give spurious warnings here. */
715         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
716
717         /* No external references means no one else could have
718            confirmed us. */
719         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
720         DEBUGP("Confirming conntrack %p\n", ct);
721
722         write_lock_bh(&nf_conntrack_lock);
723
724         /* See if there's one in the list already, including reverse:
725            NAT could have grabbed it without realizing, since we're
726            not in the hash.  If there is, we lost the race. */
727         if (!LIST_FIND(&nf_conntrack_hash[hash],
728                        conntrack_tuple_cmp,
729                        struct nf_conntrack_tuple_hash *,
730                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
731             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
732                           conntrack_tuple_cmp,
733                           struct nf_conntrack_tuple_hash *,
734                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
735                 struct nf_conn_help *help;
736                 /* Remove from unconfirmed list */
737                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
738
739                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
740                 /* Timer relative to confirmation time, not original
741                    setting time, otherwise we'd get timer wrap in
742                    weird delay cases. */
743                 ct->timeout.expires += jiffies;
744                 add_timer(&ct->timeout);
745                 atomic_inc(&ct->ct_general.use);
746                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
747                 NF_CT_STAT_INC(insert);
748                 write_unlock_bh(&nf_conntrack_lock);
749                 help = nfct_help(ct);
750                 if (help && help->helper)
751                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
752 #ifdef CONFIG_NF_NAT_NEEDED
753                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
754                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
755                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
756 #endif
757                 nf_conntrack_event_cache(master_ct(ct) ?
758                                          IPCT_RELATED : IPCT_NEW, *pskb);
759                 return NF_ACCEPT;
760         }
761
762         NF_CT_STAT_INC(insert_failed);
763         write_unlock_bh(&nf_conntrack_lock);
764         return NF_DROP;
765 }
766
767 /* Returns true if a connection corresponds to the tuple (required
768    for NAT). */
769 int
770 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
771                          const struct nf_conn *ignored_conntrack)
772 {
773         struct nf_conntrack_tuple_hash *h;
774
775         read_lock_bh(&nf_conntrack_lock);
776         h = __nf_conntrack_find(tuple, ignored_conntrack);
777         read_unlock_bh(&nf_conntrack_lock);
778
779         return h != NULL;
780 }
781
782 /* There's a small race here where we may free a just-assured
783    connection.  Too bad: we're in trouble anyway. */
784 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
785 {
786         return !(test_bit(IPS_ASSURED_BIT,
787                           &nf_ct_tuplehash_to_ctrack(i)->status));
788 }
789
790 static int early_drop(struct list_head *chain)
791 {
792         /* Traverse backwards: gives us oldest, which is roughly LRU */
793         struct nf_conntrack_tuple_hash *h;
794         struct nf_conn *ct = NULL;
795         int dropped = 0;
796
797         read_lock_bh(&nf_conntrack_lock);
798         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
799         if (h) {
800                 ct = nf_ct_tuplehash_to_ctrack(h);
801                 atomic_inc(&ct->ct_general.use);
802         }
803         read_unlock_bh(&nf_conntrack_lock);
804
805         if (!ct)
806                 return dropped;
807
808         if (del_timer(&ct->timeout)) {
809                 death_by_timeout((unsigned long)ct);
810                 dropped = 1;
811                 NF_CT_STAT_INC(early_drop);
812         }
813         nf_ct_put(ct);
814         return dropped;
815 }
816
817 static inline int helper_cmp(const struct nf_conntrack_helper *i,
818                              const struct nf_conntrack_tuple *rtuple)
819 {
820         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
821 }
822
823 static struct nf_conntrack_helper *
824 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
825 {
826         return LIST_FIND(&helpers, helper_cmp,
827                          struct nf_conntrack_helper *,
828                          tuple);
829 }
830
831 struct nf_conntrack_helper *
832 nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
833 {
834         struct nf_conntrack_helper *helper;
835
836         /* need nf_conntrack_lock to ensure that the helper exists until
837          * try_module_get() is called */
838         read_lock_bh(&nf_conntrack_lock);
839
840         helper = __nf_ct_helper_find(tuple);
841         if (helper) {
842                 /* need to increase module usage count to assure helper will
843                  * not go away while the caller is e.g. busy putting a
844                  * conntrack in the hash that uses the helper */
845                 if (!try_module_get(helper->me))
846                         helper = NULL;
847         }
848
849         read_unlock_bh(&nf_conntrack_lock);
850
851         return helper;
852 }
853
854 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
855 {
856         module_put(helper->me);
857 }
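/*
 * For illustration: as with the protocol lookups, a successful
 * nf_ct_helper_find_get() pins the helper's module, so it must be balanced
 * with nf_ct_helper_put():
 *
 *	helper = nf_ct_helper_find_get(&reply_tuple);
 *	if (helper) {
 *		...
 *		nf_ct_helper_put(helper);
 *	}
 */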
858
859 static struct nf_conn *
860 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
861                      const struct nf_conntrack_tuple *repl,
862                      const struct nf_conntrack_l3proto *l3proto)
863 {
864         struct nf_conn *conntrack = NULL;
865         u_int32_t features = 0;
866         struct nf_conntrack_helper *helper;
867
868         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
869                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
870                 nf_conntrack_hash_rnd_initted = 1;
871         }
872
873         if (nf_conntrack_max
874             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
875                 unsigned int hash = hash_conntrack(orig);
876                 /* Try dropping from this hash chain. */
877                 if (!early_drop(&nf_conntrack_hash[hash])) {
878                         if (net_ratelimit())
879                                 printk(KERN_WARNING
880                                        "nf_conntrack: table full, dropping"
881                                        " packet.\n");
882                         return ERR_PTR(-ENOMEM);
883                 }
884         }
885
886         /*  find features needed by this conntrack. */
887         features = l3proto->get_features(orig);
888
889         /* FIXME: protect helper list per RCU */
890         read_lock_bh(&nf_conntrack_lock);
891         helper = __nf_ct_helper_find(repl);
892         if (helper)
893                 features |= NF_CT_F_HELP;
894         read_unlock_bh(&nf_conntrack_lock);
895
896         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
897
898         read_lock_bh(&nf_ct_cache_lock);
899
900         if (unlikely(!nf_ct_cache[features].use)) {
901                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
902                         features);
903                 goto out;
904         }
905
906         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
907         if (conntrack == NULL) {
908                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
909                 goto out;
910         }
911
912         memset(conntrack, 0, nf_ct_cache[features].size);
913         conntrack->features = features;
914         if (helper) {
915                 struct nf_conn_help *help = nfct_help(conntrack);
916                 NF_CT_ASSERT(help);
917                 help->helper = helper;
918         }
919
920         atomic_set(&conntrack->ct_general.use, 1);
921         conntrack->ct_general.destroy = destroy_conntrack;
922         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
923         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
924         /* Don't set timer yet: wait for confirmation */
925         init_timer(&conntrack->timeout);
926         conntrack->timeout.data = (unsigned long)conntrack;
927         conntrack->timeout.function = death_by_timeout;
928
929         atomic_inc(&nf_conntrack_count);
930 out:
931         read_unlock_bh(&nf_ct_cache_lock);
932         return conntrack;
933 }
934
935 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
936                                    const struct nf_conntrack_tuple *repl)
937 {
938         struct nf_conntrack_l3proto *l3proto;
939
940         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
941         return __nf_conntrack_alloc(orig, repl, l3proto);
942 }
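/*
 * For illustration, __nf_conntrack_alloc()/nf_conntrack_alloc() can return a
 * conntrack, NULL (unsupported feature set or slab failure) or an ERR_PTR()
 * when the table is full, so callers check both cases, as init_conntrack()
 * below does:
 *
 *	conntrack = nf_conntrack_alloc(&tuple, &repl);
 *	if (conntrack == NULL || IS_ERR(conntrack))
 *		return (struct nf_conntrack_tuple_hash *)conntrack;
 *
 * On success the caller owns the single initial reference.
 */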
943
944 void nf_conntrack_free(struct nf_conn *conntrack)
945 {
946         u_int32_t features = conntrack->features;
947         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
948         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
949                conntrack);
950         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
951         atomic_dec(&nf_conntrack_count);
952 }
953
954 /* Allocate a new conntrack: we return -ENOMEM if classification
955    failed due to stress.  Otherwise it really is unclassifiable. */
956 static struct nf_conntrack_tuple_hash *
957 init_conntrack(const struct nf_conntrack_tuple *tuple,
958                struct nf_conntrack_l3proto *l3proto,
959                struct nf_conntrack_protocol *protocol,
960                struct sk_buff *skb,
961                unsigned int dataoff)
962 {
963         struct nf_conn *conntrack;
964         struct nf_conntrack_tuple repl_tuple;
965         struct nf_conntrack_expect *exp;
966
967         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
968                 DEBUGP("Can't invert tuple.\n");
969                 return NULL;
970         }
971
972         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
973         if (conntrack == NULL || IS_ERR(conntrack)) {
974                 DEBUGP("Can't allocate conntrack.\n");
975                 return (struct nf_conntrack_tuple_hash *)conntrack;
976         }
977
978         if (!protocol->new(conntrack, skb, dataoff)) {
979                 nf_conntrack_free(conntrack);
980                 DEBUGP("init conntrack: can't track with proto module\n");
981                 return NULL;
982         }
983
984         write_lock_bh(&nf_conntrack_lock);
985         exp = find_expectation(tuple);
986
987         if (exp) {
988                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
989                         conntrack, exp);
990                 /* Welcome, Mr. Bond.  We've been expecting you... */
991                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
992                 conntrack->master = exp->master;
993 #ifdef CONFIG_NF_CONNTRACK_MARK
994                 conntrack->mark = exp->master->mark;
995 #endif
996                 nf_conntrack_get(&conntrack->master->ct_general);
997                 NF_CT_STAT_INC(expect_new);
998         } else
999                 NF_CT_STAT_INC(new);
1000
1001         /* Overload tuple linked list to put us in unconfirmed list. */
1002         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
1003
1004         write_unlock_bh(&nf_conntrack_lock);
1005
1006         if (exp) {
1007                 if (exp->expectfn)
1008                         exp->expectfn(conntrack, exp);
1009                 nf_conntrack_expect_put(exp);
1010         }
1011
1012         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
1013 }
1014
1015 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1016 static inline struct nf_conn *
1017 resolve_normal_ct(struct sk_buff *skb,
1018                   unsigned int dataoff,
1019                   u_int16_t l3num,
1020                   u_int8_t protonum,
1021                   struct nf_conntrack_l3proto *l3proto,
1022                   struct nf_conntrack_protocol *proto,
1023                   int *set_reply,
1024                   enum ip_conntrack_info *ctinfo)
1025 {
1026         struct nf_conntrack_tuple tuple;
1027         struct nf_conntrack_tuple_hash *h;
1028         struct nf_conn *ct;
1029
1030         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1031                              dataoff, l3num, protonum, &tuple, l3proto,
1032                              proto)) {
1033                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1034                 return NULL;
1035         }
1036
1037         /* look for tuple match */
1038         h = nf_conntrack_find_get(&tuple, NULL);
1039         if (!h) {
1040                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1041                 if (!h)
1042                         return NULL;
1043                 if (IS_ERR(h))
1044                         return (void *)h;
1045         }
1046         ct = nf_ct_tuplehash_to_ctrack(h);
1047
1048         /* It exists; we have (non-exclusive) reference. */
1049         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1050                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1051                 /* Please set reply bit if this packet is OK */
1052                 *set_reply = 1;
1053         } else {
1054                 /* Once we've had two way comms, always ESTABLISHED. */
1055                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1056                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1057                         *ctinfo = IP_CT_ESTABLISHED;
1058                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1059                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1060                         *ctinfo = IP_CT_RELATED;
1061                 } else {
1062                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1063                         *ctinfo = IP_CT_NEW;
1064                 }
1065                 *set_reply = 0;
1066         }
1067         skb->nfct = &ct->ct_general;
1068         skb->nfctinfo = *ctinfo;
1069         return ct;
1070 }
1071
1072 unsigned int
1073 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1074 {
1075         struct nf_conn *ct;
1076         enum ip_conntrack_info ctinfo;
1077         struct nf_conntrack_l3proto *l3proto;
1078         struct nf_conntrack_protocol *proto;
1079         unsigned int dataoff;
1080         u_int8_t protonum;
1081         int set_reply = 0;
1082         int ret;
1083
1084         /* Previously seen (loopback or untracked)?  Ignore. */
1085         if ((*pskb)->nfct) {
1086                 NF_CT_STAT_INC(ignore);
1087                 return NF_ACCEPT;
1088         }
1089
1090         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1091         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1092                 DEBUGP("not prepared to track yet or error occurred\n");
1093                 return -ret;
1094         }
1095
1096         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1097
1098         /* It may be a special packet, error, unclean...
1099          * the inverse of the return code tells the netfilter
1100          * core what to do with the packet. */
1101         if (proto->error != NULL &&
1102             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1103                 NF_CT_STAT_INC(error);
1104                 NF_CT_STAT_INC(invalid);
1105                 return -ret;
1106         }
1107
1108         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1109                                &set_reply, &ctinfo);
1110         if (!ct) {
1111                 /* Not valid part of a connection */
1112                 NF_CT_STAT_INC(invalid);
1113                 return NF_ACCEPT;
1114         }
1115
1116         if (IS_ERR(ct)) {
1117                 /* Too stressed to deal. */
1118                 NF_CT_STAT_INC(drop);
1119                 return NF_DROP;
1120         }
1121
1122         NF_CT_ASSERT((*pskb)->nfct);
1123
1124         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1125         if (ret < 0) {
1126                 /* Invalid: inverse of the return code tells
1127                  * the netfilter core what to do */
1128                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1129                 nf_conntrack_put((*pskb)->nfct);
1130                 (*pskb)->nfct = NULL;
1131                 NF_CT_STAT_INC(invalid);
1132                 return -ret;
1133         }
1134
1135         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1136                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1137
1138         return ret;
1139 }
1140
1141 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1142                          const struct nf_conntrack_tuple *orig)
1143 {
1144         return nf_ct_invert_tuple(inverse, orig,
1145                                   __nf_ct_l3proto_find(orig->src.l3num),
1146                                   __nf_ct_proto_find(orig->src.l3num,
1147                                                      orig->dst.protonum));
1148 }
1149
1150 /* Would two expected things clash? */
1151 static inline int expect_clash(const struct nf_conntrack_expect *a,
1152                                const struct nf_conntrack_expect *b)
1153 {
1154         /* Part covered by intersection of masks must be unequal,
1155            otherwise they clash */
1156         struct nf_conntrack_tuple intersect_mask;
1157         int count;
1158
1159         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1160         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1161         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1162         intersect_mask.dst.protonum = a->mask.dst.protonum
1163                                         & b->mask.dst.protonum;
1164
1165         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1166                 intersect_mask.src.u3.all[count] =
1167                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1168         }
1169
1170         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1171                 intersect_mask.dst.u3.all[count] =
1172                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1173         }
1174
1175         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1176 }
1177
1178 static inline int expect_matches(const struct nf_conntrack_expect *a,
1179                                  const struct nf_conntrack_expect *b)
1180 {
1181         return a->master == b->master
1182                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1183                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1184 }
1185
1186 /* Generally a bad idea to call this: could have matched already. */
1187 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1188 {
1189         struct nf_conntrack_expect *i;
1190
1191         write_lock_bh(&nf_conntrack_lock);
1192         /* choose the oldest expectation to evict */
1193         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1194                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1195                         nf_ct_unlink_expect(i);
1196                         write_unlock_bh(&nf_conntrack_lock);
1197                         nf_conntrack_expect_put(i);
1198                         return;
1199                 }
1200         }
1201         write_unlock_bh(&nf_conntrack_lock);
1202 }
1203
1204 /* We don't increase the master conntrack refcount for non-fulfilled
1205  * expectations. During the conntrack destruction, the expectations are
1206  * always killed before the conntrack itself */
1207 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1208 {
1209         struct nf_conntrack_expect *new;
1210
1211         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1212         if (!new) {
1213                 DEBUGP("expect_related: OOM allocating expect\n");
1214                 return NULL;
1215         }
1216         new->master = me;
1217         atomic_set(&new->use, 1);
1218         return new;
1219 }
1220
1221 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1222 {
1223         if (atomic_dec_and_test(&exp->use))
1224                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1225 }
1226
1227 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1228 {
1229         struct nf_conn_help *master_help = nfct_help(exp->master);
1230
1231         atomic_inc(&exp->use);
1232         master_help->expecting++;
1233         list_add(&exp->list, &nf_conntrack_expect_list);
1234
1235         init_timer(&exp->timeout);
1236         exp->timeout.data = (unsigned long)exp;
1237         exp->timeout.function = expectation_timed_out;
1238         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1239         add_timer(&exp->timeout);
1240
1241         exp->id = ++nf_conntrack_expect_next_id;
1242         atomic_inc(&exp->use);
1243         NF_CT_STAT_INC(expect_create);
1244 }
1245
1246 /* Race with expectations being used means we could have none to find; OK. */
1247 static void evict_oldest_expect(struct nf_conn *master)
1248 {
1249         struct nf_conntrack_expect *i;
1250
1251         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1252                 if (i->master == master) {
1253                         if (del_timer(&i->timeout)) {
1254                                 nf_ct_unlink_expect(i);
1255                                 nf_conntrack_expect_put(i);
1256                         }
1257                         break;
1258                 }
1259         }
1260 }
1261
1262 static inline int refresh_timer(struct nf_conntrack_expect *i)
1263 {
1264         struct nf_conn_help *master_help = nfct_help(i->master);
1265
1266         if (!del_timer(&i->timeout))
1267                 return 0;
1268
1269         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1270         add_timer(&i->timeout);
1271         return 1;
1272 }
1273
1274 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1275 {
1276         struct nf_conntrack_expect *i;
1277         struct nf_conn *master = expect->master;
1278         struct nf_conn_help *master_help = nfct_help(master);
1279         int ret;
1280
1281         NF_CT_ASSERT(master_help);
1282
1283         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1284         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1285         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1286
1287         write_lock_bh(&nf_conntrack_lock);
1288         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1289                 if (expect_matches(i, expect)) {
1290                         /* Refresh timer: if it's dying, ignore.. */
1291                         if (refresh_timer(i)) {
1292                                 ret = 0;
1293                                 goto out;
1294                         }
1295                 } else if (expect_clash(i, expect)) {
1296                         ret = -EBUSY;
1297                         goto out;
1298                 }
1299         }
1300         /* Will be over limit? */
1301         if (master_help->helper->max_expected &&
1302             master_help->expecting >= master_help->helper->max_expected)
1303                 evict_oldest_expect(master);
1304
1305         nf_conntrack_expect_insert(expect);
1306         nf_conntrack_expect_event(IPEXP_NEW, expect);
1307         ret = 0;
1308 out:
1309         write_unlock_bh(&nf_conntrack_lock);
1310         return ret;
1311 }
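/*
 * For illustration, a sketch of how a conntrack helper (FTP, IRC, ...) sets
 * up an expectation from its help() callback.  The tuple/mask values are
 * placeholders:
 *
 *	struct nf_conntrack_expect *exp;
 *
 *	exp = nf_conntrack_expect_alloc(ct);
 *	if (exp == NULL)
 *		return NF_DROP;
 *	exp->tuple = ...;		(tuple of the expected connection)
 *	exp->mask = ...;		(which parts of the tuple must match)
 *	exp->expectfn = NULL;
 *	exp->flags = 0;
 *	if (nf_conntrack_expect_related(exp) != 0)
 *		... -EBUSY: it clashes with an existing expectation ...
 *	nf_conntrack_expect_put(exp);
 */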
1312
1313 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1314 {
1315         int ret;
1316         BUG_ON(me->timeout == 0);
1317
1318         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1319                                           sizeof(struct nf_conn)
1320                                           + sizeof(struct nf_conn_help)
1321                                           + __alignof__(struct nf_conn_help));
1322         if (ret < 0) {
1323                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1324                 return ret;
1325         }
1326         write_lock_bh(&nf_conntrack_lock);
1327         list_prepend(&helpers, me);
1328         write_unlock_bh(&nf_conntrack_lock);
1329
1330         return 0;
1331 }
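/*
 * For illustration, a minimal outline of a helper module using the calls
 * above.  The field names follow struct nf_conntrack_helper in
 * <net/netfilter/nf_conntrack_helper.h>; my_helper/my_help are made-up
 * names:
 *
 *	static struct nf_conntrack_helper my_helper = {
 *		.name		= "my-proto",
 *		.me		= THIS_MODULE,
 *		.max_expected	= 1,
 *		.timeout	= 5 * 60,
 *		.tuple		= { ... match the control connection ... },
 *		.mask		= { ... },
 *		.help		= my_help,
 *	};
 *
 *	nf_conntrack_helper_register(&my_helper);	 (module init)
 *	nf_conntrack_helper_unregister(&my_helper);	 (module exit)
 */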
1332
1333 struct nf_conntrack_helper *
1334 __nf_conntrack_helper_find_byname(const char *name)
1335 {
1336         struct nf_conntrack_helper *h;
1337
1338         list_for_each_entry(h, &helpers, list) {
1339                 if (!strcmp(h->name, name))
1340                         return h;
1341         }
1342
1343         return NULL;
1344 }
1345
1346 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1347                          const struct nf_conntrack_helper *me)
1348 {
1349         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1350         struct nf_conn_help *help = nfct_help(ct);
1351
1352         if (help && help->helper == me) {
1353                 nf_conntrack_event(IPCT_HELPER, ct);
1354                 help->helper = NULL;
1355         }
1356         return 0;
1357 }
1358
1359 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1360 {
1361         unsigned int i;
1362         struct nf_conntrack_expect *exp, *tmp;
1363
1364         /* Need write lock here, to delete helper. */
1365         write_lock_bh(&nf_conntrack_lock);
1366         LIST_DELETE(&helpers, me);
1367
1368         /* Get rid of expectations */
1369         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1370                 struct nf_conn_help *help = nfct_help(exp->master);
1371                 if (help->helper == me && del_timer(&exp->timeout)) {
1372                         nf_ct_unlink_expect(exp);
1373                         nf_conntrack_expect_put(exp);
1374                 }
1375         }
1376
1377         /* Get rid of expecteds, set helpers to NULL. */
1378         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1379         for (i = 0; i < nf_conntrack_htable_size; i++)
1380                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1381                             struct nf_conntrack_tuple_hash *, me);
1382         write_unlock_bh(&nf_conntrack_lock);
1383
1384         /* Someone could still be looking at the helper in a bh. */
1385         synchronize_net();
1386 }
1387
1388 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1389 void __nf_ct_refresh_acct(struct nf_conn *ct,
1390                           enum ip_conntrack_info ctinfo,
1391                           const struct sk_buff *skb,
1392                           unsigned long extra_jiffies,
1393                           int do_acct)
1394 {
1395         int event = 0;
1396
1397         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1398         NF_CT_ASSERT(skb);
1399
1400         write_lock_bh(&nf_conntrack_lock);
1401
1402         /* If not in hash table, timer will not be active yet */
1403         if (!nf_ct_is_confirmed(ct)) {
1404                 ct->timeout.expires = extra_jiffies;
1405                 event = IPCT_REFRESH;
1406         } else {
1407                 /* Need del_timer for race avoidance (may already be dying). */
1408                 if (del_timer(&ct->timeout)) {
1409                         ct->timeout.expires = jiffies + extra_jiffies;
1410                         add_timer(&ct->timeout);
1411                         event = IPCT_REFRESH;
1412                 }
1413         }
1414
1415 #ifdef CONFIG_NF_CT_ACCT
1416         if (do_acct) {
1417                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1418                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1419                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1420                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1421                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1422                         event |= IPCT_COUNTER_FILLING;
1423         }
1424 #endif
1425
1426         write_unlock_bh(&nf_conntrack_lock);
1427
1428         /* must be unlocked when calling event cache */
1429         if (event)
1430                 nf_conntrack_event_cache(event, skb);
1431 }
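/*
 * For illustration, protocol trackers normally reach this through the
 * nf_ct_refresh()/nf_ct_refresh_acct() wrappers in
 * <net/netfilter/nf_conntrack.h> (which supply do_acct), roughly:
 *
 *	nf_ct_refresh_acct(ct, ctinfo, skb, my_proto_timeout);
 *
 * which bumps the timeout and, with CONFIG_NF_CT_ACCT, the per-direction
 * packet and byte counters.  my_proto_timeout is a placeholder.
 */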
1432
1433 #if defined(CONFIG_NF_CT_NETLINK) || \
1434     defined(CONFIG_NF_CT_NETLINK_MODULE)
1435
1436 #include <linux/netfilter/nfnetlink.h>
1437 #include <linux/netfilter/nfnetlink_conntrack.h>
1438 #include <linux/mutex.h>
1439
1440
1441 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1442  * in nf_conntrack_core, since we don't want the protocols to autoload
1443  * or depend on ctnetlink */
1444 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1445                                const struct nf_conntrack_tuple *tuple)
1446 {
1447         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1448                 &tuple->src.u.tcp.port);
1449         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1450                 &tuple->dst.u.tcp.port);
1451         return 0;
1452
1453 nfattr_failure:
1454         return -1;
1455 }
1456
1457 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1458         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1459         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1460 };
1461
1462 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1463                                struct nf_conntrack_tuple *t)
1464 {
1465         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1466                 return -EINVAL;
1467
1468         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1469                 return -EINVAL;
1470
1471         t->src.u.tcp.port =
1472                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1473         t->dst.u.tcp.port =
1474                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1475
1476         return 0;
1477 }
1478 #endif
1479
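/* Illustrative sketch (assumption, not from this file): a port-based
 * protocol tracker such as TCP or UDP is expected to plug these helpers
 * straight into its struct nf_conntrack_protocol, roughly:
 *
 *      struct nf_conntrack_protocol nf_conntrack_protocol_udp4 = {
 *              ...
 *      #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
 *              .tuple_to_nfattr = nf_ct_port_tuple_to_nfattr,
 *              .nfattr_to_tuple = nf_ct_port_nfattr_to_tuple,
 *      #endif
 *      };
 *
 * The union member used above (u.tcp.port) overlays u.udp.port etc., which
 * is why one pair of helpers can serve every port-based protocol.
 */
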
1480 /* Used by ipt_REJECT and ip6t_REJECT. */
1481 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1482 {
1483         struct nf_conn *ct;
1484         enum ip_conntrack_info ctinfo;
1485
1486         /* This ICMP is in reverse direction to the packet which caused it */
1487         ct = nf_ct_get(skb, &ctinfo);
1488         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1489                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1490         else
1491                 ctinfo = IP_CT_RELATED;
1492
1493         /* Attach to new skbuff, and increment count */
1494         nskb->nfct = &ct->ct_general;
1495         nskb->nfctinfo = ctinfo;
1496         nf_conntrack_get(nskb->nfct);
1497 }
1498
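/* Illustrative sketch (assumption, not from this file): the REJECT targets
 * do not call this symbol directly.  They go through the nf_ct_attach()
 * helper in <linux/netfilter.h>, which dereferences the ip_ct_attach
 * pointer assigned in nf_conntrack_init() below, roughly:
 *
 *      nskb = ...;                     // locally generated RST/ICMP error
 *      nf_ct_attach(nskb, oldskb);     // copies the conntrack reference
 *
 * Keeping the indirection behind a function pointer lets ipt_REJECT work
 * whether or not connection tracking is loaded.
 */
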
1499 static inline int
1500 do_iter(const struct nf_conntrack_tuple_hash *i,
1501         int (*iter)(struct nf_conn *i, void *data),
1502         void *data)
1503 {
1504         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1505 }
1506
1507 /* Bring out ya dead! */
1508 static struct nf_conntrack_tuple_hash *
1509 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1510                 void *data, unsigned int *bucket)
1511 {
1512         struct nf_conntrack_tuple_hash *h = NULL;
1513
1514         write_lock_bh(&nf_conntrack_lock);
1515         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1516                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1517                                 struct nf_conntrack_tuple_hash *, iter, data);
1518                 if (h)
1519                         break;
1520         }
1521         if (!h)
1522                 h = LIST_FIND_W(&unconfirmed, do_iter,
1523                                 struct nf_conntrack_tuple_hash *, iter, data);
1524         if (h)
1525                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1526         write_unlock_bh(&nf_conntrack_lock);
1527
1528         return h;
1529 }
1530
1531 void
1532 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1533 {
1534         struct nf_conntrack_tuple_hash *h;
1535         unsigned int bucket = 0;
1536
1537         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1538                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1539                 /* Time to push up daisies... */
1540                 if (del_timer(&ct->timeout))
1541                         death_by_timeout((unsigned long)ct);
1542                 /* ... else the timer will get him soon. */
1543
1544                 nf_ct_put(ct);
1545         }
1546 }
1547
1548 static int kill_all(struct nf_conn *i, void *data)
1549 {
1550         return 1;
1551 }
1552
1553 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1554 {
1555         if (vmalloced)
1556                 vfree(hash);
1557         else
1558                 free_pages((unsigned long)hash, 
1559                            get_order(sizeof(struct list_head) * size));
1560 }
1561
1562 void nf_conntrack_flush(void)
1563 {
1564         nf_ct_iterate_cleanup(kill_all, NULL);
1565 }
1566
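/* Illustrative sketch (hypothetical caller, not part of this file): kill_all
 * above simply matches everything, but any module can evict a subset of
 * entries by supplying its own predicate, e.g. selecting conntracks by
 * connection mark (requires CONFIG_NF_CONNTRACK_MARK):
 *
 *      static int kill_by_mark(struct nf_conn *ct, void *data)
 *      {
 *              return ct->mark == *(u_int32_t *)data;
 *      }
 *
 *      static void flush_by_mark(u_int32_t mark)
 *      {
 *              nf_ct_iterate_cleanup(kill_by_mark, &mark);
 *      }
 *
 * Matching entries have their timer deleted and are released via
 * death_by_timeout(), exactly as for a normal expiry.
 */
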
1567 /* Mishearing the voices in his head, our hero wonders how he's
1568    supposed to kill the mall. */
1569 void nf_conntrack_cleanup(void)
1570 {
1571         int i;
1572
1573         ip_ct_attach = NULL;
1574
1575         /* This makes sure all current packets have passed through
1576            netfilter framework.  Roll on, two-stage module
1577            delete... */
1578         synchronize_net();
1579
1580         nf_ct_event_cache_flush();
1581  i_see_dead_people:
1582         nf_conntrack_flush();
1583         if (atomic_read(&nf_conntrack_count) != 0) {
1584                 schedule();
1585                 goto i_see_dead_people;
1586         }
1587         /* wait until all references to nf_conntrack_untracked are dropped */
1588         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1589                 schedule();
1590
1591         for (i = 0; i < NF_CT_F_NUM; i++) {
1592                 if (nf_ct_cache[i].use == 0)
1593                         continue;
1594
1595                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1596                 nf_ct_cache[i].use = 1;
1597                 nf_conntrack_unregister_cache(i);
1598         }
1599         kmem_cache_destroy(nf_conntrack_expect_cachep);
1600         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1601                             nf_conntrack_htable_size);
1602
1603         /* free l3proto protocol tables */
1604         for (i = 0; i < PF_MAX; i++)
1605                 if (nf_ct_protos[i]) {
1606                         kfree(nf_ct_protos[i]);
1607                         nf_ct_protos[i] = NULL;
1608                 }
1609 }
1610
1611 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1612 {
1613         struct list_head *hash;
1614         unsigned int i;
1615
1616         *vmalloced = 0; 
1617         hash = (void*)__get_free_pages(GFP_KERNEL, 
1618                                        get_order(sizeof(struct list_head)
1619                                                  * size));
1620         if (!hash) { 
1621                 *vmalloced = 1;
1622                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1623                 hash = vmalloc(sizeof(struct list_head) * size);
1624         }
1625
1626         if (hash)
1627                 for (i = 0; i < size; i++) 
1628                         INIT_LIST_HEAD(&hash[i]);
1629
1630         return hash;
1631 }
1632
1633 static int set_hashsize(const char *val, struct kernel_param *kp)
1634 {
1635         int i, bucket, hashsize, vmalloced;
1636         int old_vmalloced, old_size;
1637         int rnd;
1638         struct list_head *hash, *old_hash;
1639         struct nf_conntrack_tuple_hash *h;
1640
1641         /* On boot, we can set this without any fancy locking. */
1642         if (!nf_conntrack_htable_size)
1643                 return param_set_uint(val, kp);
1644
1645         hashsize = simple_strtol(val, NULL, 0);
1646         if (!hashsize)
1647                 return -EINVAL;
1648
1649         hash = alloc_hashtable(hashsize, &vmalloced);
1650         if (!hash)
1651                 return -ENOMEM;
1652
1653         /* We have to rehash for the new table anyway, so we can also
1654          * use a new random seed */
1655         get_random_bytes(&rnd, 4);
1656
1657         write_lock_bh(&nf_conntrack_lock);
1658         for (i = 0; i < nf_conntrack_htable_size; i++) {
1659                 while (!list_empty(&nf_conntrack_hash[i])) {
1660                         h = list_entry(nf_conntrack_hash[i].next,
1661                                        struct nf_conntrack_tuple_hash, list);
1662                         list_del(&h->list);
1663                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1664                         list_add_tail(&h->list, &hash[bucket]);
1665                 }
1666         }
1667         old_size = nf_conntrack_htable_size;
1668         old_vmalloced = nf_conntrack_vmalloc;
1669         old_hash = nf_conntrack_hash;
1670
1671         nf_conntrack_htable_size = hashsize;
1672         nf_conntrack_vmalloc = vmalloced;
1673         nf_conntrack_hash = hash;
1674         nf_conntrack_hash_rnd = rnd;
1675         write_unlock_bh(&nf_conntrack_lock);
1676
1677         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1678         return 0;
1679 }
1680
1681 module_param_call(hashsize, set_hashsize, param_get_uint,
1682                   &nf_conntrack_htable_size, 0600);
1683
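/* Illustrative usage note (assumption, not part of the original file): with
 * the 0600 permission above, the hash size can be changed by root either at
 * module load/boot time or at runtime through sysfs, e.g.:
 *
 *      modprobe nf_conntrack hashsize=16384
 *      echo 16384 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * Writes after boot funnel through set_hashsize(), which rehashes every
 * existing entry into the new table under nf_conntrack_lock.
 */
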
1684 int __init nf_conntrack_init(void)
1685 {
1686         unsigned int i;
1687         int ret;
1688
1689         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1690          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1691         if (!nf_conntrack_htable_size) {
1692                 nf_conntrack_htable_size
1693                         = (((num_physpages << PAGE_SHIFT) / 16384)
1694                            / sizeof(struct list_head));
1695                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1696                         nf_conntrack_htable_size = 8192;
1697                 if (nf_conntrack_htable_size < 16)
1698                         nf_conntrack_htable_size = 16;
1699         }
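        /*
         * Worked example (illustrative, matching the comment above): on a
         * 32 MB i386 box, 32 MB / 16384 = 2048 bytes of hash table, and with
         * sizeof(struct list_head) == 8 that is 256 buckets; the cap below
         * then allows 8 * 256 = 2048 tracked connections.  Machines with
         * more than 1 GB are clamped to 8192 buckets (65536 connections).
         */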
1700         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1701
1702         printk("nf_conntrack version %s (%u buckets, %d max)\n",
1703                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1704                nf_conntrack_max);
1705
1706         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1707                                             &nf_conntrack_vmalloc);
1708         if (!nf_conntrack_hash) {
1709                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1710                 goto err_out;
1711         }
1712
1713         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1714                                           sizeof(struct nf_conn));
1715         if (ret < 0) {
1716                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1717                 goto err_free_hash;
1718         }
1719
1720         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1721                                         sizeof(struct nf_conntrack_expect),
1722                                         0, 0, NULL, NULL);
1723         if (!nf_conntrack_expect_cachep) {
1724                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1725                 goto err_free_conntrack_slab;
1726         }
1727
1728         /* Don't NEED lock here, but good form anyway. */
1729         write_lock_bh(&nf_conntrack_lock);
1730         for (i = 0; i < PF_MAX; i++)
1731                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1732         write_unlock_bh(&nf_conntrack_lock);
1733
1734         /* For use by REJECT target */
1735         ip_ct_attach = __nf_conntrack_attach;
1736
1737         /* Set up fake conntrack:
1738             - to never be deleted, not in any hashes */
1739         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1740         /*  - and make it look like a confirmed connection */
1741         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1742
1743         return ret;
1744
1745 err_free_conntrack_slab:
1746         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1747 err_free_hash:
1748         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1749                             nf_conntrack_htable_size);
1750 err_out:
1751         return -ENOMEM;
1752 }