[NETFILTER] nf_conntrack: clean up to reduce size of 'struct nf_conn'
net/netfilter/nf_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License version 2 as
11  * published by the Free Software Foundation.
12  *
13  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
14  *      - new API and handling of conntrack/nat helpers
15  *      - now capable of multiple expectations for one master
16  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
17  *      - add usage/reference counts to ip_conntrack_expect
18  *      - export ip_conntrack[_expect]_{find_get,put} functions
19  * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
20  *      - generalize the L3 protocol dependent part.
21  * 23 Mar 2004: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
22  *      - add support for various sizes of conntrack structures.
23  * 26 Jan 2006: Harald Welte <laforge@netfilter.org>
24  *      - restructure nf_conn (introduce nf_conn_help)
25  *      - redesign 'features' how they were originally intended
26  *
27  * Derived from net/ipv4/netfilter/ip_conntrack_core.c
28  */
29
30 #include <linux/config.h>
31 #include <linux/types.h>
32 #include <linux/netfilter.h>
33 #include <linux/module.h>
34 #include <linux/skbuff.h>
35 #include <linux/proc_fs.h>
36 #include <linux/vmalloc.h>
37 #include <linux/stddef.h>
38 #include <linux/slab.h>
39 #include <linux/random.h>
40 #include <linux/jhash.h>
41 #include <linux/err.h>
42 #include <linux/percpu.h>
43 #include <linux/moduleparam.h>
44 #include <linux/notifier.h>
45 #include <linux/kernel.h>
46 #include <linux/netdevice.h>
47 #include <linux/socket.h>
48
49 /* This rwlock protects the main hash table, protocol/helper/expected
50    registrations, conntrack timers */
51 #define ASSERT_READ_LOCK(x)
52 #define ASSERT_WRITE_LOCK(x)
53
54 #include <net/netfilter/nf_conntrack.h>
55 #include <net/netfilter/nf_conntrack_l3proto.h>
56 #include <net/netfilter/nf_conntrack_protocol.h>
57 #include <net/netfilter/nf_conntrack_helper.h>
58 #include <net/netfilter/nf_conntrack_core.h>
59 #include <linux/netfilter_ipv4/listhelp.h>
60
61 #define NF_CONNTRACK_VERSION    "0.5.0"
62
63 #if 0
64 #define DEBUGP printk
65 #else
66 #define DEBUGP(format, args...)
67 #endif
68
69 DEFINE_RWLOCK(nf_conntrack_lock);
70
71 /* nf_conntrack_standalone needs this */
72 atomic_t nf_conntrack_count = ATOMIC_INIT(0);
73
74 void (*nf_conntrack_destroyed)(struct nf_conn *conntrack) = NULL;
75 LIST_HEAD(nf_conntrack_expect_list);
76 struct nf_conntrack_protocol **nf_ct_protos[PF_MAX];
77 struct nf_conntrack_l3proto *nf_ct_l3protos[PF_MAX];
78 static LIST_HEAD(helpers);
79 unsigned int nf_conntrack_htable_size = 0;
80 int nf_conntrack_max;
81 struct list_head *nf_conntrack_hash;
82 static kmem_cache_t *nf_conntrack_expect_cachep;
83 struct nf_conn nf_conntrack_untracked;
84 unsigned int nf_ct_log_invalid;
85 static LIST_HEAD(unconfirmed);
86 static int nf_conntrack_vmalloc;
87
88 static unsigned int nf_conntrack_next_id = 1;
89 static unsigned int nf_conntrack_expect_next_id = 1;
90 #ifdef CONFIG_NF_CONNTRACK_EVENTS
91 struct notifier_block *nf_conntrack_chain;
92 struct notifier_block *nf_conntrack_expect_chain;
93
94 DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache);
95
96 /* deliver cached events and clear cache entry - must be called with locally
97  * disabled softirqs */
98 static inline void
99 __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
100 {
101         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
102         if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
103             && ecache->events)
104                 notifier_call_chain(&nf_conntrack_chain, ecache->events,
105                                     ecache->ct);
106
107         ecache->events = 0;
108         nf_ct_put(ecache->ct);
109         ecache->ct = NULL;
110 }
111
112 /* Deliver all cached events for a particular conntrack. This is called
113  * by code prior to async packet handling for freeing the skb */
114 void nf_ct_deliver_cached_events(const struct nf_conn *ct)
115 {
116         struct nf_conntrack_ecache *ecache;
117
118         local_bh_disable();
119         ecache = &__get_cpu_var(nf_conntrack_ecache);
120         if (ecache->ct == ct)
121                 __nf_ct_deliver_cached_events(ecache);
122         local_bh_enable();
123 }
124
125 /* Deliver cached events for old pending events, if current conntrack != old */
126 void __nf_ct_event_cache_init(struct nf_conn *ct)
127 {
128         struct nf_conntrack_ecache *ecache;
129         
130         /* take care of delivering potentially old events */
131         ecache = &__get_cpu_var(nf_conntrack_ecache);
132         BUG_ON(ecache->ct == ct);
133         if (ecache->ct)
134                 __nf_ct_deliver_cached_events(ecache);
135         /* initialize for this conntrack/packet */
136         ecache->ct = ct;
137         nf_conntrack_get(&ct->ct_general);
138 }
139
140 /* flush the event cache - touches other CPU's data and must not be called
141  * while packets are still passing through the code */
142 static void nf_ct_event_cache_flush(void)
143 {
144         struct nf_conntrack_ecache *ecache;
145         int cpu;
146
147         for_each_cpu(cpu) {
148                 ecache = &per_cpu(nf_conntrack_ecache, cpu);
149                 if (ecache->ct)
150                         nf_ct_put(ecache->ct);
151         }
152 }
153 #else
154 static inline void nf_ct_event_cache_flush(void) {}
155 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
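/*
 * Usage sketch (illustrative, not part of the original file): events are
 * recorded into the per-cpu cache while a packet is being processed and
 * delivered per conntrack once processing of that conntrack is done, e.g.:
 *
 *	nf_conntrack_event_cache(IPCT_STATUS, *pskb);
 *	...
 *	nf_ct_deliver_cached_events(ct);
 *
 * nf_conntrack_event_cache() is used this way elsewhere in this file; the
 * exact delivery point depends on the hook wiring and is only sketched here.
 */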
156
157 DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat);
158 EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat);
159
160 /*
161  * This scheme offers various sizes of "struct nf_conn" depending on
162  * the features (helper, nat, ...)
163  */
164
165 #define NF_CT_FEATURES_NAMELEN  256
166 static struct {
167         /* name of the slab cache, printed in /proc/slabinfo */
168         char *name;
169
170         /* size of slab cache */
171         size_t size;
172
173         /* slab cache pointer */
174         kmem_cache_t *cachep;
175
176         /* allocated slab cache + modules which use this slab cache */
177         int use;
178
179         /* Initialization */
180         int (*init_conntrack)(struct nf_conn *, u_int32_t);
181
182 } nf_ct_cache[NF_CT_F_NUM];
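/*
 * Illustrative note: the "features" bitmask (NF_CT_F_BASIC, NF_CT_F_HELP, ...)
 * indexes this table, so each feature combination gets its own slab cache
 * sized for the extra data it carries.  For example, the helper-capable cache
 * registered later in this file is sized as
 *
 *	sizeof(struct nf_conn)
 *	+ sizeof(struct nf_conn_help)
 *	+ __alignof__(struct nf_conn_help)
 *
 * and __nf_conntrack_alloc() below allocates from
 * nf_ct_cache[features].cachep.
 */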
183
184 /* protect members of nf_ct_cache except "use" */
185 DEFINE_RWLOCK(nf_ct_cache_lock);
186
187 /* This avoids calling kmem_cache_create() with the same name simultaneously */
188 DECLARE_MUTEX(nf_ct_cache_mutex);
189
190 extern struct nf_conntrack_protocol nf_conntrack_generic_protocol;
191 struct nf_conntrack_protocol *
192 __nf_ct_proto_find(u_int16_t l3proto, u_int8_t protocol)
193 {
194         if (unlikely(l3proto >= AF_MAX || nf_ct_protos[l3proto] == NULL))
195                 return &nf_conntrack_generic_protocol;
196
197         return nf_ct_protos[l3proto][protocol];
198 }
199
200 /* this is guaranteed to always return a valid protocol helper, since
201  * it falls back to generic_protocol */
202 struct nf_conntrack_protocol *
203 nf_ct_proto_find_get(u_int16_t l3proto, u_int8_t protocol)
204 {
205         struct nf_conntrack_protocol *p;
206
207         preempt_disable();
208         p = __nf_ct_proto_find(l3proto, protocol);
209         if (p) {
210                 if (!try_module_get(p->me))
211                         p = &nf_conntrack_generic_protocol;
212         }
213         preempt_enable();
214         
215         return p;
216 }
217
218 void nf_ct_proto_put(struct nf_conntrack_protocol *p)
219 {
220         module_put(p->me);
221 }
222
223 struct nf_conntrack_l3proto *
224 nf_ct_l3proto_find_get(u_int16_t l3proto)
225 {
226         struct nf_conntrack_l3proto *p;
227
228         preempt_disable();
229         p = __nf_ct_l3proto_find(l3proto);
230         if (p) {
231                 if (!try_module_get(p->me))
232                         p = &nf_conntrack_generic_l3proto;
233         }
234         preempt_enable();
235
236         return p;
237 }
238
239 void nf_ct_l3proto_put(struct nf_conntrack_l3proto *p)
240 {
241         module_put(p->me);
242 }
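/*
 * Usage sketch (illustrative): the _find_get()/_put() pairs take a module
 * reference so the protocol or l3proto module cannot be unloaded while the
 * caller still uses it:
 *
 *	struct nf_conntrack_protocol *proto;
 *
 *	proto = nf_ct_proto_find_get(PF_INET, IPPROTO_TCP);
 *	... use proto ...
 *	nf_ct_proto_put(proto);
 *
 * The __nf_ct_*_find() variants skip the refcounting and are only safe while
 * preemption is disabled or the caller otherwise pins the module.
 */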
243
244 static int nf_conntrack_hash_rnd_initted;
245 static unsigned int nf_conntrack_hash_rnd;
246
247 static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
248                                   unsigned int size, unsigned int rnd)
249 {
250         unsigned int a, b;
251         a = jhash((void *)tuple->src.u3.all, sizeof(tuple->src.u3.all),
252                   ((tuple->src.l3num) << 16) | tuple->dst.protonum);
253         b = jhash((void *)tuple->dst.u3.all, sizeof(tuple->dst.u3.all),
254                         (tuple->src.u.all << 16) | tuple->dst.u.all);
255
256         return jhash_2words(a, b, rnd) % size;
257 }
258
259 static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple)
260 {
261         return __hash_conntrack(tuple, nf_conntrack_htable_size,
262                                 nf_conntrack_hash_rnd);
263 }
264
265 int nf_conntrack_register_cache(u_int32_t features, const char *name,
266                                 size_t size)
267 {
268         int ret = 0;
269         char *cache_name;
270         kmem_cache_t *cachep;
271
272         DEBUGP("nf_conntrack_register_cache: features=0x%x, name=%s, size=%d\n",
273                features, name, size);
274
275         if (features < NF_CT_F_BASIC || features >= NF_CT_F_NUM) {
276                 DEBUGP("nf_conntrack_register_cache: invalid features.: 0x%x\n",
277                         features);
278                 return -EINVAL;
279         }
280
281         down(&nf_ct_cache_mutex);
282
283         write_lock_bh(&nf_ct_cache_lock);
284         /* e.g.: multiple helpers are loaded */
285         if (nf_ct_cache[features].use > 0) {
286                 DEBUGP("nf_conntrack_register_cache: already registered.\n");
287                 if ((!strncmp(nf_ct_cache[features].name, name,
288                               NF_CT_FEATURES_NAMELEN))
289                     && nf_ct_cache[features].size == size) {
290                         DEBUGP("nf_conntrack_register_cache: reusing.\n");
291                         nf_ct_cache[features].use++;
292                         ret = 0;
293                 } else
294                         ret = -EBUSY;
295
296                 write_unlock_bh(&nf_ct_cache_lock);
297                 up(&nf_ct_cache_mutex);
298                 return ret;
299         }
300         write_unlock_bh(&nf_ct_cache_lock);
301
302         /*
303          * The memory space for the name of the slab cache must stay alive
304          * until the cache is destroyed.
305          */
306         cache_name = kmalloc(sizeof(char)*NF_CT_FEATURES_NAMELEN, GFP_ATOMIC);
307         if (cache_name == NULL) {
308                 DEBUGP("nf_conntrack_register_cache: can't alloc cache_name\n");
309                 ret = -ENOMEM;
310                 goto out_up_mutex;
311         }
312
313         if (strlcpy(cache_name, name, NF_CT_FEATURES_NAMELEN)
314                                                 >= NF_CT_FEATURES_NAMELEN) {
315                 printk("nf_conntrack_register_cache: name too long\n");
316                 ret = -EINVAL;
317                 goto out_free_name;
318         }
319
320         cachep = kmem_cache_create(cache_name, size, 0, 0,
321                                    NULL, NULL);
322         if (!cachep) {
323                 printk("nf_conntrack_register_cache: Can't create slab cache "
324                        "for the features = 0x%x\n", features);
325                 ret = -ENOMEM;
326                 goto out_free_name;
327         }
328
329         write_lock_bh(&nf_ct_cache_lock);
330         nf_ct_cache[features].use = 1;
331         nf_ct_cache[features].size = size;
332         nf_ct_cache[features].cachep = cachep;
333         nf_ct_cache[features].name = cache_name;
334         write_unlock_bh(&nf_ct_cache_lock);
335
336         goto out_up_mutex;
337
338 out_free_name:
339         kfree(cache_name);
340 out_up_mutex:
341         up(&nf_ct_cache_mutex);
342         return ret;
343 }
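/*
 * Usage sketch (illustrative, mirroring nf_conntrack_helper_register()
 * further down): a module that needs room for a helper extension registers
 * the larger cache and drops it again on unload:
 *
 *	ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
 *					  sizeof(struct nf_conn)
 *					  + sizeof(struct nf_conn_help)
 *					  + __alignof__(struct nf_conn_help));
 *	...
 *	nf_conntrack_unregister_cache(NF_CT_F_HELP);
 */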
344
345 /* FIXME: Currently, only nf_conntrack_cleanup() can call this function. */
346 void nf_conntrack_unregister_cache(u_int32_t features)
347 {
348         kmem_cache_t *cachep;
349         char *name;
350
351         /*
352          * This ensures that kmem_cache_create() isn't called while the
353          * slab cache is being destroyed.
354          */
355         DEBUGP("nf_conntrack_unregister_cache: 0x%04x\n", features);
356         down(&nf_ct_cache_mutex);
357
358         write_lock_bh(&nf_ct_cache_lock);
359         if (--nf_ct_cache[features].use > 0) {
360                 write_unlock_bh(&nf_ct_cache_lock);
361                 up(&nf_ct_cache_mutex);
362                 return;
363         }
364         cachep = nf_ct_cache[features].cachep;
365         name = nf_ct_cache[features].name;
366         nf_ct_cache[features].cachep = NULL;
367         nf_ct_cache[features].name = NULL;
368         nf_ct_cache[features].size = 0;
369         write_unlock_bh(&nf_ct_cache_lock);
370
371         synchronize_net();
372
373         kmem_cache_destroy(cachep);
374         kfree(name);
375
376         up(&nf_ct_cache_mutex);
377 }
378
379 int
380 nf_ct_get_tuple(const struct sk_buff *skb,
381                 unsigned int nhoff,
382                 unsigned int dataoff,
383                 u_int16_t l3num,
384                 u_int8_t protonum,
385                 struct nf_conntrack_tuple *tuple,
386                 const struct nf_conntrack_l3proto *l3proto,
387                 const struct nf_conntrack_protocol *protocol)
388 {
389         NF_CT_TUPLE_U_BLANK(tuple);
390
391         tuple->src.l3num = l3num;
392         if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
393                 return 0;
394
395         tuple->dst.protonum = protonum;
396         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
397
398         return protocol->pkt_to_tuple(skb, dataoff, tuple);
399 }
400
401 int
402 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
403                    const struct nf_conntrack_tuple *orig,
404                    const struct nf_conntrack_l3proto *l3proto,
405                    const struct nf_conntrack_protocol *protocol)
406 {
407         NF_CT_TUPLE_U_BLANK(inverse);
408
409         inverse->src.l3num = orig->src.l3num;
410         if (l3proto->invert_tuple(inverse, orig) == 0)
411                 return 0;
412
413         inverse->dst.dir = !orig->dst.dir;
414
415         inverse->dst.protonum = orig->dst.protonum;
416         return protocol->invert_tuple(inverse, orig);
417 }
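/*
 * Illustrative note: most callers don't pass the l3proto/proto pointers
 * themselves; nf_ct_invert_tuplepr() near the end of this file looks them up
 * from the registered tables:
 *
 *	struct nf_conntrack_tuple reply;
 *
 *	if (!nf_ct_invert_tuplepr(&reply, &orig))
 *		... inversion failed ...
 */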
418
419 /* nf_conntrack_expect helper functions */
420 void nf_ct_unlink_expect(struct nf_conntrack_expect *exp)
421 {
422         struct nf_conn_help *master_help = nfct_help(exp->master);
423
424         NF_CT_ASSERT(master_help);
425         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
426         NF_CT_ASSERT(!timer_pending(&exp->timeout));
427
428         list_del(&exp->list);
429         NF_CT_STAT_INC(expect_delete);
430         master_help->expecting--;
431         nf_conntrack_expect_put(exp);
432 }
433
434 static void expectation_timed_out(unsigned long ul_expect)
435 {
436         struct nf_conntrack_expect *exp = (void *)ul_expect;
437
438         write_lock_bh(&nf_conntrack_lock);
439         nf_ct_unlink_expect(exp);
440         write_unlock_bh(&nf_conntrack_lock);
441         nf_conntrack_expect_put(exp);
442 }
443
444 struct nf_conntrack_expect *
445 __nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
446 {
447         struct nf_conntrack_expect *i;
448         
449         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
450                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
451                         atomic_inc(&i->use);
452                         return i;
453                 }
454         }
455         return NULL;
456 }
457
458 /* Just find an expectation corresponding to a tuple. */
459 struct nf_conntrack_expect *
460 nf_conntrack_expect_find(const struct nf_conntrack_tuple *tuple)
461 {
462         struct nf_conntrack_expect *i;
463         
464         read_lock_bh(&nf_conntrack_lock);
465         i = __nf_conntrack_expect_find(tuple);
466         read_unlock_bh(&nf_conntrack_lock);
467
468         return i;
469 }
470
471 /* If an expectation for this connection is found, it gets deleted from
472  * the global list and then returned. */
473 static struct nf_conntrack_expect *
474 find_expectation(const struct nf_conntrack_tuple *tuple)
475 {
476         struct nf_conntrack_expect *i;
477
478         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
479         /* If master is not in hash table yet (ie. packet hasn't left
480            this machine yet), how can other end know about expected?
481            Hence these are not the droids you are looking for (if
482            master ct never got confirmed, we'd hold a reference to it
483            and weird things would happen to future packets). */
484                 if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
485                     && nf_ct_is_confirmed(i->master)) {
486                         if (i->flags & NF_CT_EXPECT_PERMANENT) {
487                                 atomic_inc(&i->use);
488                                 return i;
489                         } else if (del_timer(&i->timeout)) {
490                                 nf_ct_unlink_expect(i);
491                                 return i;
492                         }
493                 }
494         }
495         return NULL;
496 }
497
498 /* delete all expectations for this conntrack */
499 void nf_ct_remove_expectations(struct nf_conn *ct)
500 {
501         struct nf_conntrack_expect *i, *tmp;
502         struct nf_conn_help *help = nfct_help(ct);
503
504         /* Optimization: most connections never expect any others. */
505         if (!help || help->expecting == 0)
506                 return;
507
508         list_for_each_entry_safe(i, tmp, &nf_conntrack_expect_list, list) {
509                 if (i->master == ct && del_timer(&i->timeout)) {
510                         nf_ct_unlink_expect(i);
511                         nf_conntrack_expect_put(i);
512                 }
513         }
514 }
515
516 static void
517 clean_from_lists(struct nf_conn *ct)
518 {
519         unsigned int ho, hr;
520         
521         DEBUGP("clean_from_lists(%p)\n", ct);
522         ASSERT_WRITE_LOCK(&nf_conntrack_lock);
523
524         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
525         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
526         LIST_DELETE(&nf_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
527         LIST_DELETE(&nf_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
528
529         /* Destroy all pending expectations */
530         nf_ct_remove_expectations(ct);
531 }
532
533 static void
534 destroy_conntrack(struct nf_conntrack *nfct)
535 {
536         struct nf_conn *ct = (struct nf_conn *)nfct;
537         struct nf_conntrack_l3proto *l3proto;
538         struct nf_conntrack_protocol *proto;
539
540         DEBUGP("destroy_conntrack(%p)\n", ct);
541         NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
542         NF_CT_ASSERT(!timer_pending(&ct->timeout));
543
544         nf_conntrack_event(IPCT_DESTROY, ct);
545         set_bit(IPS_DYING_BIT, &ct->status);
546
547         /* To make sure we don't get any weird locking issues here:
548          * destroy_conntrack() MUST NOT be called with a write lock
549          * to nf_conntrack_lock!!! -HW */
550         l3proto = __nf_ct_l3proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num);
551         if (l3proto && l3proto->destroy)
552                 l3proto->destroy(ct);
553
554         proto = __nf_ct_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.l3num, ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
555         if (proto && proto->destroy)
556                 proto->destroy(ct);
557
558         if (nf_conntrack_destroyed)
559                 nf_conntrack_destroyed(ct);
560
561         write_lock_bh(&nf_conntrack_lock);
562         /* Expectations will have been removed in clean_from_lists,
563          * except TFTP can create an expectation on the first packet,
564          * before connection is in the list, so we need to clean here,
565          * too. */
566         nf_ct_remove_expectations(ct);
567
568         /* We overload first tuple to link into unconfirmed list. */
569         if (!nf_ct_is_confirmed(ct)) {
570                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
571                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
572         }
573
574         NF_CT_STAT_INC(delete);
575         write_unlock_bh(&nf_conntrack_lock);
576
577         if (ct->master)
578                 nf_ct_put(ct->master);
579
580         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
581         nf_conntrack_free(ct);
582 }
583
584 static void death_by_timeout(unsigned long ul_conntrack)
585 {
586         struct nf_conn *ct = (void *)ul_conntrack;
587
588         write_lock_bh(&nf_conntrack_lock);
589         /* Inside lock so preempt is disabled on module removal path.
590          * Otherwise we can get spurious warnings. */
591         NF_CT_STAT_INC(delete_list);
592         clean_from_lists(ct);
593         write_unlock_bh(&nf_conntrack_lock);
594         nf_ct_put(ct);
595 }
596
597 static inline int
598 conntrack_tuple_cmp(const struct nf_conntrack_tuple_hash *i,
599                     const struct nf_conntrack_tuple *tuple,
600                     const struct nf_conn *ignored_conntrack)
601 {
602         ASSERT_READ_LOCK(&nf_conntrack_lock);
603         return nf_ct_tuplehash_to_ctrack(i) != ignored_conntrack
604                 && nf_ct_tuple_equal(tuple, &i->tuple);
605 }
606
607 struct nf_conntrack_tuple_hash *
608 __nf_conntrack_find(const struct nf_conntrack_tuple *tuple,
609                     const struct nf_conn *ignored_conntrack)
610 {
611         struct nf_conntrack_tuple_hash *h;
612         unsigned int hash = hash_conntrack(tuple);
613
614         ASSERT_READ_LOCK(&nf_conntrack_lock);
615         list_for_each_entry(h, &nf_conntrack_hash[hash], list) {
616                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
617                         NF_CT_STAT_INC(found);
618                         return h;
619                 }
620                 NF_CT_STAT_INC(searched);
621         }
622
623         return NULL;
624 }
625
626 /* Find a connection corresponding to a tuple. */
627 struct nf_conntrack_tuple_hash *
628 nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple,
629                       const struct nf_conn *ignored_conntrack)
630 {
631         struct nf_conntrack_tuple_hash *h;
632
633         read_lock_bh(&nf_conntrack_lock);
634         h = __nf_conntrack_find(tuple, ignored_conntrack);
635         if (h)
636                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
637         read_unlock_bh(&nf_conntrack_lock);
638
639         return h;
640 }
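/*
 * Usage sketch (illustrative): nf_conntrack_find_get() returns the tuple hash
 * with an extra reference on the conntrack, which the caller must drop again:
 *
 *	struct nf_conntrack_tuple_hash *h;
 *
 *	h = nf_conntrack_find_get(&tuple, NULL);
 *	if (h) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *		... use ct ...
 *		nf_ct_put(ct);
 *	}
 */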
641
642 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
643                                        unsigned int hash,
644                                        unsigned int repl_hash) 
645 {
646         ct->id = ++nf_conntrack_next_id;
647         list_prepend(&nf_conntrack_hash[hash],
648                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
649         list_prepend(&nf_conntrack_hash[repl_hash],
650                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
651 }
652
653 void nf_conntrack_hash_insert(struct nf_conn *ct)
654 {
655         unsigned int hash, repl_hash;
656
657         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
658         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
659
660         write_lock_bh(&nf_conntrack_lock);
661         __nf_conntrack_hash_insert(ct, hash, repl_hash);
662         write_unlock_bh(&nf_conntrack_lock);
663 }
664
665 /* Confirm a connection given skb; places it in hash table */
666 int
667 __nf_conntrack_confirm(struct sk_buff **pskb)
668 {
669         unsigned int hash, repl_hash;
670         struct nf_conn *ct;
671         enum ip_conntrack_info ctinfo;
672
673         ct = nf_ct_get(*pskb, &ctinfo);
674
675         /* ipt_REJECT uses nf_conntrack_attach to attach related
676            ICMP/TCP RST packets in other direction.  Actual packet
677            which created connection will be IP_CT_NEW or for an
678            expected connection, IP_CT_RELATED. */
679         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
680                 return NF_ACCEPT;
681
682         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
683         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
684
685         /* We're not in hash table, and we refuse to set up related
686            connections for unconfirmed conns.  But packet copies and
687            REJECT will give spurious warnings here. */
688         /* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
689
690         /* No external references means no one else could have
691            confirmed us. */
692         NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
693         DEBUGP("Confirming conntrack %p\n", ct);
694
695         write_lock_bh(&nf_conntrack_lock);
696
697         /* See if there's one in the list already, including reverse:
698            NAT could have grabbed it without realizing, since we're
699            not in the hash.  If there is, we lost race. */
700         if (!LIST_FIND(&nf_conntrack_hash[hash],
701                        conntrack_tuple_cmp,
702                        struct nf_conntrack_tuple_hash *,
703                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
704             && !LIST_FIND(&nf_conntrack_hash[repl_hash],
705                           conntrack_tuple_cmp,
706                           struct nf_conntrack_tuple_hash *,
707                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
708                 struct nf_conn_help *help;
709                 /* Remove from unconfirmed list */
710                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
711
712                 __nf_conntrack_hash_insert(ct, hash, repl_hash);
713                 /* Timer relative to confirmation time, not original
714                    setting time, otherwise we'd get timer wrap in
715                    weird delay cases. */
716                 ct->timeout.expires += jiffies;
717                 add_timer(&ct->timeout);
718                 atomic_inc(&ct->ct_general.use);
719                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
720                 NF_CT_STAT_INC(insert);
721                 write_unlock_bh(&nf_conntrack_lock);
722                 help = nfct_help(ct);
723                 if (help && help->helper)
724                         nf_conntrack_event_cache(IPCT_HELPER, *pskb);
725 #ifdef CONFIG_NF_NAT_NEEDED
726                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
727                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
728                         nf_conntrack_event_cache(IPCT_NATINFO, *pskb);
729 #endif
730                 nf_conntrack_event_cache(master_ct(ct) ?
731                                          IPCT_RELATED : IPCT_NEW, *pskb);
732                 return NF_ACCEPT;
733         }
734
735         NF_CT_STAT_INC(insert_failed);
736         write_unlock_bh(&nf_conntrack_lock);
737         return NF_DROP;
738 }
739
740 /* Returns true if a connection corresponds to the tuple (required
741    for NAT). */
742 int
743 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
744                          const struct nf_conn *ignored_conntrack)
745 {
746         struct nf_conntrack_tuple_hash *h;
747
748         read_lock_bh(&nf_conntrack_lock);
749         h = __nf_conntrack_find(tuple, ignored_conntrack);
750         read_unlock_bh(&nf_conntrack_lock);
751
752         return h != NULL;
753 }
754
755 /* There's a small race here where we may free a just-assured
756    connection.  Too bad: we're in trouble anyway. */
757 static inline int unreplied(const struct nf_conntrack_tuple_hash *i)
758 {
759         return !(test_bit(IPS_ASSURED_BIT,
760                           &nf_ct_tuplehash_to_ctrack(i)->status));
761 }
762
763 static int early_drop(struct list_head *chain)
764 {
765         /* Traverse backwards: gives us oldest, which is roughly LRU */
766         struct nf_conntrack_tuple_hash *h;
767         struct nf_conn *ct = NULL;
768         int dropped = 0;
769
770         read_lock_bh(&nf_conntrack_lock);
771         h = LIST_FIND_B(chain, unreplied, struct nf_conntrack_tuple_hash *);
772         if (h) {
773                 ct = nf_ct_tuplehash_to_ctrack(h);
774                 atomic_inc(&ct->ct_general.use);
775         }
776         read_unlock_bh(&nf_conntrack_lock);
777
778         if (!ct)
779                 return dropped;
780
781         if (del_timer(&ct->timeout)) {
782                 death_by_timeout((unsigned long)ct);
783                 dropped = 1;
784                 NF_CT_STAT_INC(early_drop);
785         }
786         nf_ct_put(ct);
787         return dropped;
788 }
789
790 static inline int helper_cmp(const struct nf_conntrack_helper *i,
791                              const struct nf_conntrack_tuple *rtuple)
792 {
793         return nf_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
794 }
795
796 static struct nf_conntrack_helper *
797 __nf_ct_helper_find(const struct nf_conntrack_tuple *tuple)
798 {
799         return LIST_FIND(&helpers, helper_cmp,
800                          struct nf_conntrack_helper *,
801                          tuple);
802 }
803
804 struct nf_conntrack_helper *
805 nf_ct_helper_find_get(const struct nf_conntrack_tuple *tuple)
806 {
807         struct nf_conntrack_helper *helper;
808
809         /* need nf_conntrack_lock to assure that helper exists until
810          * try_module_get() is called */
811         read_lock_bh(&nf_conntrack_lock);
812
813         helper = __nf_ct_helper_find(tuple);
814         if (helper) {
815                 /* need to increase module usage count to assure helper will
816                  * not go away while the caller is e.g. busy putting a
817                  * conntrack in the hash that uses the helper */
818                 if (!try_module_get(helper->me))
819                         helper = NULL;
820         }
821
822         read_unlock_bh(&nf_conntrack_lock);
823
824         return helper;
825 }
826
827 void nf_ct_helper_put(struct nf_conntrack_helper *helper)
828 {
829         module_put(helper->me);
830 }
831
832 static struct nf_conn *
833 __nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
834                      const struct nf_conntrack_tuple *repl,
835                      const struct nf_conntrack_l3proto *l3proto)
836 {
837         struct nf_conn *conntrack = NULL;
838         u_int32_t features = 0;
839         struct nf_conntrack_helper *helper;
840
841         if (unlikely(!nf_conntrack_hash_rnd_initted)) {
842                 get_random_bytes(&nf_conntrack_hash_rnd, 4);
843                 nf_conntrack_hash_rnd_initted = 1;
844         }
845
846         if (nf_conntrack_max
847             && atomic_read(&nf_conntrack_count) >= nf_conntrack_max) {
848                 unsigned int hash = hash_conntrack(orig);
849                 /* Try dropping from this hash chain. */
850                 if (!early_drop(&nf_conntrack_hash[hash])) {
851                         if (net_ratelimit())
852                                 printk(KERN_WARNING
853                                        "nf_conntrack: table full, dropping"
854                                        " packet.\n");
855                         return ERR_PTR(-ENOMEM);
856                 }
857         }
858
859         /*  find features needed by this conntrack. */
860         features = l3proto->get_features(orig);
861
862         /* FIXME: protect helper list per RCU */
863         read_lock_bh(&nf_conntrack_lock);
864         helper = __nf_ct_helper_find(repl);
865         if (helper)
866                 features |= NF_CT_F_HELP;
867         read_unlock_bh(&nf_conntrack_lock);
868
869         DEBUGP("nf_conntrack_alloc: features=0x%x\n", features);
870
871         read_lock_bh(&nf_ct_cache_lock);
872
873         if (unlikely(!nf_ct_cache[features].use)) {
874                 DEBUGP("nf_conntrack_alloc: not supported features = 0x%x\n",
875                         features);
876                 goto out;
877         }
878
879         conntrack = kmem_cache_alloc(nf_ct_cache[features].cachep, GFP_ATOMIC);
880         if (conntrack == NULL) {
881                 DEBUGP("nf_conntrack_alloc: Can't alloc conntrack from cache\n");
882                 goto out;
883         }
884
885         memset(conntrack, 0, nf_ct_cache[features].size);
886         conntrack->features = features;
887         if (helper) {
888                 struct nf_conn_help *help = nfct_help(conntrack);
889                 NF_CT_ASSERT(help);
890                 help->helper = helper;
891         }
892
893         atomic_set(&conntrack->ct_general.use, 1);
894         conntrack->ct_general.destroy = destroy_conntrack;
895         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
896         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
897         /* Don't set timer yet: wait for confirmation */
898         init_timer(&conntrack->timeout);
899         conntrack->timeout.data = (unsigned long)conntrack;
900         conntrack->timeout.function = death_by_timeout;
901
902         atomic_inc(&nf_conntrack_count);
903 out:
904         read_unlock_bh(&nf_ct_cache_lock);
905         return conntrack;
906 }
907
908 struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig,
909                                    const struct nf_conntrack_tuple *repl)
910 {
911         struct nf_conntrack_l3proto *l3proto;
912
913         l3proto = __nf_ct_l3proto_find(orig->src.l3num);
914         return __nf_conntrack_alloc(orig, repl, l3proto);
915 }
916
917 void nf_conntrack_free(struct nf_conn *conntrack)
918 {
919         u_int32_t features = conntrack->features;
920         NF_CT_ASSERT(features >= NF_CT_F_BASIC && features < NF_CT_F_NUM);
921         DEBUGP("nf_conntrack_free: features = 0x%x, conntrack=%p\n", features,
922                conntrack);
923         kmem_cache_free(nf_ct_cache[features].cachep, conntrack);
924         atomic_dec(&nf_conntrack_count);
925 }
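/*
 * Illustrative note: nf_conntrack_alloc() can return ERR_PTR(-ENOMEM) when
 * the table is full and early_drop() found nothing to evict, so callers must
 * distinguish NULL from an error pointer, as init_conntrack() below does:
 *
 *	ct = nf_conntrack_alloc(&orig, &repl);
 *	if (ct == NULL || IS_ERR(ct))
 *		... bail out ...
 *	...
 *	nf_conntrack_free(ct);
 */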
926
927 /* Allocate a new conntrack: we return -ENOMEM if classification
928    failed due to stress.  Otherwise it really is unclassifiable. */
929 static struct nf_conntrack_tuple_hash *
930 init_conntrack(const struct nf_conntrack_tuple *tuple,
931                struct nf_conntrack_l3proto *l3proto,
932                struct nf_conntrack_protocol *protocol,
933                struct sk_buff *skb,
934                unsigned int dataoff)
935 {
936         struct nf_conn *conntrack;
937         struct nf_conntrack_tuple repl_tuple;
938         struct nf_conntrack_expect *exp;
939
940         if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, protocol)) {
941                 DEBUGP("Can't invert tuple.\n");
942                 return NULL;
943         }
944
945         conntrack = __nf_conntrack_alloc(tuple, &repl_tuple, l3proto);
946         if (conntrack == NULL || IS_ERR(conntrack)) {
947                 DEBUGP("Can't allocate conntrack.\n");
948                 return (struct nf_conntrack_tuple_hash *)conntrack;
949         }
950
951         if (!protocol->new(conntrack, skb, dataoff)) {
952                 nf_conntrack_free(conntrack);
953                 DEBUGP("init conntrack: can't track with proto module\n");
954                 return NULL;
955         }
956
957         write_lock_bh(&nf_conntrack_lock);
958         exp = find_expectation(tuple);
959
960         if (exp) {
961                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
962                         conntrack, exp);
963                 /* Welcome, Mr. Bond.  We've been expecting you... */
964                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
965                 conntrack->master = exp->master;
966 #ifdef CONFIG_NF_CONNTRACK_MARK
967                 conntrack->mark = exp->master->mark;
968 #endif
969                 nf_conntrack_get(&conntrack->master->ct_general);
970                 NF_CT_STAT_INC(expect_new);
971         } else
972                 NF_CT_STAT_INC(new);
973
974         /* Overload tuple linked list to put us in unconfirmed list. */
975         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
976
977         write_unlock_bh(&nf_conntrack_lock);
978
979         if (exp) {
980                 if (exp->expectfn)
981                         exp->expectfn(conntrack, exp);
982                 nf_conntrack_expect_put(exp);
983         }
984
985         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
986 }
987
988 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
989 static inline struct nf_conn *
990 resolve_normal_ct(struct sk_buff *skb,
991                   unsigned int dataoff,
992                   u_int16_t l3num,
993                   u_int8_t protonum,
994                   struct nf_conntrack_l3proto *l3proto,
995                   struct nf_conntrack_protocol *proto,
996                   int *set_reply,
997                   enum ip_conntrack_info *ctinfo)
998 {
999         struct nf_conntrack_tuple tuple;
1000         struct nf_conntrack_tuple_hash *h;
1001         struct nf_conn *ct;
1002
1003         if (!nf_ct_get_tuple(skb, (unsigned int)(skb->nh.raw - skb->data),
1004                              dataoff, l3num, protonum, &tuple, l3proto,
1005                              proto)) {
1006                 DEBUGP("resolve_normal_ct: Can't get tuple\n");
1007                 return NULL;
1008         }
1009
1010         /* look for tuple match */
1011         h = nf_conntrack_find_get(&tuple, NULL);
1012         if (!h) {
1013                 h = init_conntrack(&tuple, l3proto, proto, skb, dataoff);
1014                 if (!h)
1015                         return NULL;
1016                 if (IS_ERR(h))
1017                         return (void *)h;
1018         }
1019         ct = nf_ct_tuplehash_to_ctrack(h);
1020
1021         /* It exists; we have (non-exclusive) reference. */
1022         if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1023                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
1024                 /* Please set reply bit if this packet OK */
1025                 *set_reply = 1;
1026         } else {
1027                 /* Once we've had two way comms, always ESTABLISHED. */
1028                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1029                         DEBUGP("nf_conntrack_in: normal packet for %p\n", ct);
1030                         *ctinfo = IP_CT_ESTABLISHED;
1031                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1032                         DEBUGP("nf_conntrack_in: related packet for %p\n", ct);
1033                         *ctinfo = IP_CT_RELATED;
1034                 } else {
1035                         DEBUGP("nf_conntrack_in: new packet for %p\n", ct);
1036                         *ctinfo = IP_CT_NEW;
1037                 }
1038                 *set_reply = 0;
1039         }
1040         skb->nfct = &ct->ct_general;
1041         skb->nfctinfo = *ctinfo;
1042         return ct;
1043 }
1044
1045 unsigned int
1046 nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff **pskb)
1047 {
1048         struct nf_conn *ct;
1049         enum ip_conntrack_info ctinfo;
1050         struct nf_conntrack_l3proto *l3proto;
1051         struct nf_conntrack_protocol *proto;
1052         unsigned int dataoff;
1053         u_int8_t protonum;
1054         int set_reply = 0;
1055         int ret;
1056
1057         /* Previously seen (loopback or untracked)?  Ignore. */
1058         if ((*pskb)->nfct) {
1059                 NF_CT_STAT_INC(ignore);
1060                 return NF_ACCEPT;
1061         }
1062
1063         l3proto = __nf_ct_l3proto_find((u_int16_t)pf);
1064         if ((ret = l3proto->prepare(pskb, hooknum, &dataoff, &protonum)) <= 0) {
1065                 DEBUGP("not prepared to track yet or error occurred\n");
1066                 return -ret;
1067         }
1068
1069         proto = __nf_ct_proto_find((u_int16_t)pf, protonum);
1070
1071         /* It may be a special packet, error, unclean...
1072          * the inverse of the return code tells the netfilter
1073          * core what to do with the packet. */
1074         if (proto->error != NULL &&
1075             (ret = proto->error(*pskb, dataoff, &ctinfo, pf, hooknum)) <= 0) {
1076                 NF_CT_STAT_INC(error);
1077                 NF_CT_STAT_INC(invalid);
1078                 return -ret;
1079         }
1080
1081         ct = resolve_normal_ct(*pskb, dataoff, pf, protonum, l3proto, proto,
1082                                &set_reply, &ctinfo);
1083         if (!ct) {
1084                 /* Not valid part of a connection */
1085                 NF_CT_STAT_INC(invalid);
1086                 return NF_ACCEPT;
1087         }
1088
1089         if (IS_ERR(ct)) {
1090                 /* Too stressed to deal. */
1091                 NF_CT_STAT_INC(drop);
1092                 return NF_DROP;
1093         }
1094
1095         NF_CT_ASSERT((*pskb)->nfct);
1096
1097         ret = proto->packet(ct, *pskb, dataoff, ctinfo, pf, hooknum);
1098         if (ret < 0) {
1099                 /* Invalid: inverse of the return code tells
1100                  * the netfilter core what to do */
1101                 DEBUGP("nf_conntrack_in: Can't track with proto module\n");
1102                 nf_conntrack_put((*pskb)->nfct);
1103                 (*pskb)->nfct = NULL;
1104                 NF_CT_STAT_INC(invalid);
1105                 return -ret;
1106         }
1107
1108         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1109                 nf_conntrack_event_cache(IPCT_STATUS, *pskb);
1110
1111         return ret;
1112 }
1113
1114 int nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1115                          const struct nf_conntrack_tuple *orig)
1116 {
1117         return nf_ct_invert_tuple(inverse, orig,
1118                                   __nf_ct_l3proto_find(orig->src.l3num),
1119                                   __nf_ct_proto_find(orig->src.l3num,
1120                                                      orig->dst.protonum));
1121 }
1122
1123 /* Would two expected things clash? */
1124 static inline int expect_clash(const struct nf_conntrack_expect *a,
1125                                const struct nf_conntrack_expect *b)
1126 {
1127         /* Part covered by intersection of masks must be unequal,
1128            otherwise they clash */
1129         struct nf_conntrack_tuple intersect_mask;
1130         int count;
1131
1132         intersect_mask.src.l3num = a->mask.src.l3num & b->mask.src.l3num;
1133         intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;
1134         intersect_mask.dst.u.all = a->mask.dst.u.all & b->mask.dst.u.all;
1135         intersect_mask.dst.protonum = a->mask.dst.protonum
1136                                         & b->mask.dst.protonum;
1137
1138         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1139                 intersect_mask.src.u3.all[count] =
1140                         a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
1141         }
1142
1143         for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){
1144                 intersect_mask.dst.u3.all[count] =
1145                         a->mask.dst.u3.all[count] & b->mask.dst.u3.all[count];
1146         }
1147
1148         return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
1149 }
1150
1151 static inline int expect_matches(const struct nf_conntrack_expect *a,
1152                                  const struct nf_conntrack_expect *b)
1153 {
1154         return a->master == b->master
1155                 && nf_ct_tuple_equal(&a->tuple, &b->tuple)
1156                 && nf_ct_tuple_equal(&a->mask, &b->mask);
1157 }
1158
1159 /* Generally a bad idea to call this: could have matched already. */
1160 void nf_conntrack_unexpect_related(struct nf_conntrack_expect *exp)
1161 {
1162         struct nf_conntrack_expect *i;
1163
1164         write_lock_bh(&nf_conntrack_lock);
1165         /* choose the oldest expectation to evict */
1166         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1167                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
1168                         nf_ct_unlink_expect(i);
1169                         write_unlock_bh(&nf_conntrack_lock);
1170                         nf_conntrack_expect_put(i);
1171                         return;
1172                 }
1173         }
1174         write_unlock_bh(&nf_conntrack_lock);
1175 }
1176
1177 /* We don't increase the master conntrack refcount for non-fulfilled
1178  * conntracks. During the conntrack destruction, the expectations are
1179  * always killed before the conntrack itself */
1180 struct nf_conntrack_expect *nf_conntrack_expect_alloc(struct nf_conn *me)
1181 {
1182         struct nf_conntrack_expect *new;
1183
1184         new = kmem_cache_alloc(nf_conntrack_expect_cachep, GFP_ATOMIC);
1185         if (!new) {
1186                 DEBUGP("expect_related: OOM allocating expect\n");
1187                 return NULL;
1188         }
1189         new->master = me;
1190         atomic_set(&new->use, 1);
1191         return new;
1192 }
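/*
 * Usage sketch (illustrative only; the exact field setup depends on the
 * helper): a conntrack helper typically allocates an expectation, fills in
 * the tuple/mask (and optionally expectfn/flags), registers it and drops its
 * own reference:
 *
 *	struct nf_conntrack_expect *exp;
 *
 *	exp = nf_conntrack_expect_alloc(ct);
 *	if (!exp)
 *		... handle allocation failure ...
 *	exp->tuple = ...;	expected reply tuple
 *	exp->mask = ...;	which fields must match
 *	exp->expectfn = NULL;
 *	exp->flags = 0;
 *	if (nf_conntrack_expect_related(exp) != 0)
 *		... handle -EBUSY etc. ...
 *	nf_conntrack_expect_put(exp);
 */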
1193
1194 void nf_conntrack_expect_put(struct nf_conntrack_expect *exp)
1195 {
1196         if (atomic_dec_and_test(&exp->use))
1197                 kmem_cache_free(nf_conntrack_expect_cachep, exp);
1198 }
1199
1200 static void nf_conntrack_expect_insert(struct nf_conntrack_expect *exp)
1201 {
1202         struct nf_conn_help *master_help = nfct_help(exp->master);
1203
1204         atomic_inc(&exp->use);
1205         master_help->expecting++;
1206         list_add(&exp->list, &nf_conntrack_expect_list);
1207
1208         init_timer(&exp->timeout);
1209         exp->timeout.data = (unsigned long)exp;
1210         exp->timeout.function = expectation_timed_out;
1211         exp->timeout.expires = jiffies + master_help->helper->timeout * HZ;
1212         add_timer(&exp->timeout);
1213
1214         exp->id = ++nf_conntrack_expect_next_id;
1215         atomic_inc(&exp->use);
1216         NF_CT_STAT_INC(expect_create);
1217 }
1218
1219 /* Race with expectations being used means we could have none to find; OK. */
1220 static void evict_oldest_expect(struct nf_conn *master)
1221 {
1222         struct nf_conntrack_expect *i;
1223
1224         list_for_each_entry_reverse(i, &nf_conntrack_expect_list, list) {
1225                 if (i->master == master) {
1226                         if (del_timer(&i->timeout)) {
1227                                 nf_ct_unlink_expect(i);
1228                                 nf_conntrack_expect_put(i);
1229                         }
1230                         break;
1231                 }
1232         }
1233 }
1234
1235 static inline int refresh_timer(struct nf_conntrack_expect *i)
1236 {
1237         struct nf_conn_help *master_help = nfct_help(i->master);
1238
1239         if (!del_timer(&i->timeout))
1240                 return 0;
1241
1242         i->timeout.expires = jiffies + master_help->helper->timeout*HZ;
1243         add_timer(&i->timeout);
1244         return 1;
1245 }
1246
1247 int nf_conntrack_expect_related(struct nf_conntrack_expect *expect)
1248 {
1249         struct nf_conntrack_expect *i;
1250         struct nf_conn *master = expect->master;
1251         struct nf_conn_help *master_help = nfct_help(master);
1252         int ret;
1253
1254         NF_CT_ASSERT(master_help);
1255
1256         DEBUGP("nf_conntrack_expect_related %p\n", expect);
1257         DEBUGP("tuple: "); NF_CT_DUMP_TUPLE(&expect->tuple);
1258         DEBUGP("mask:  "); NF_CT_DUMP_TUPLE(&expect->mask);
1259
1260         write_lock_bh(&nf_conntrack_lock);
1261         list_for_each_entry(i, &nf_conntrack_expect_list, list) {
1262                 if (expect_matches(i, expect)) {
1263                         /* Refresh timer: if it's dying, ignore.. */
1264                         if (refresh_timer(i)) {
1265                                 ret = 0;
1266                                 goto out;
1267                         }
1268                 } else if (expect_clash(i, expect)) {
1269                         ret = -EBUSY;
1270                         goto out;
1271                 }
1272         }
1273         /* Will be over limit? */
1274         if (master_help->helper->max_expected &&
1275             master_help->expecting >= master_help->helper->max_expected)
1276                 evict_oldest_expect(master);
1277
1278         nf_conntrack_expect_insert(expect);
1279         nf_conntrack_expect_event(IPEXP_NEW, expect);
1280         ret = 0;
1281 out:
1282         write_unlock_bh(&nf_conntrack_lock);
1283         return ret;
1284 }
1285
1286 int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
1287 {
1288         int ret;
1289         BUG_ON(me->timeout == 0);
1290
1291         ret = nf_conntrack_register_cache(NF_CT_F_HELP, "nf_conntrack:help",
1292                                           sizeof(struct nf_conn)
1293                                           + sizeof(struct nf_conn_help)
1294                                           + __alignof__(struct nf_conn_help));
1295         if (ret < 0) {
1296                 printk(KERN_ERR "nf_conntrack_helper_register: Unable to create slab cache for conntracks\n");
1297                 return ret;
1298         }
1299         write_lock_bh(&nf_conntrack_lock);
1300         list_prepend(&helpers, me);
1301         write_unlock_bh(&nf_conntrack_lock);
1302
1303         return 0;
1304 }
1305
1306 struct nf_conntrack_helper *
1307 __nf_conntrack_helper_find_byname(const char *name)
1308 {
1309         struct nf_conntrack_helper *h;
1310
1311         list_for_each_entry(h, &helpers, list) {
1312                 if (!strcmp(h->name, name))
1313                         return h;
1314         }
1315
1316         return NULL;
1317 }
1318
1319 static inline int unhelp(struct nf_conntrack_tuple_hash *i,
1320                          const struct nf_conntrack_helper *me)
1321 {
1322         struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
1323         struct nf_conn_help *help = nfct_help(ct);
1324
1325         if (help && help->helper == me) {
1326                 nf_conntrack_event(IPCT_HELPER, ct);
1327                 help->helper = NULL;
1328         }
1329         return 0;
1330 }
1331
1332 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
1333 {
1334         unsigned int i;
1335         struct nf_conntrack_expect *exp, *tmp;
1336
1337         /* Need write lock here, to delete helper. */
1338         write_lock_bh(&nf_conntrack_lock);
1339         LIST_DELETE(&helpers, me);
1340
1341         /* Get rid of expectations */
1342         list_for_each_entry_safe(exp, tmp, &nf_conntrack_expect_list, list) {
1343                 struct nf_conn_help *help = nfct_help(exp->master);
1344                 if (help->helper == me && del_timer(&exp->timeout)) {
1345                         nf_ct_unlink_expect(exp);
1346                         nf_conntrack_expect_put(exp);
1347                 }
1348         }
1349
1350         /* Get rid of expecteds, set helpers to NULL. */
1351         LIST_FIND_W(&unconfirmed, unhelp, struct nf_conntrack_tuple_hash*, me);
1352         for (i = 0; i < nf_conntrack_htable_size; i++)
1353                 LIST_FIND_W(&nf_conntrack_hash[i], unhelp,
1354                             struct nf_conntrack_tuple_hash *, me);
1355         write_unlock_bh(&nf_conntrack_lock);
1356
1357         /* Someone could be still looking at the helper in a bh. */
1358         synchronize_net();
1359 }
1360
1361 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1362 void __nf_ct_refresh_acct(struct nf_conn *ct,
1363                           enum ip_conntrack_info ctinfo,
1364                           const struct sk_buff *skb,
1365                           unsigned long extra_jiffies,
1366                           int do_acct)
1367 {
1368         int event = 0;
1369
1370         NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1371         NF_CT_ASSERT(skb);
1372
1373         write_lock_bh(&nf_conntrack_lock);
1374
1375         /* If not in hash table, timer will not be active yet */
1376         if (!nf_ct_is_confirmed(ct)) {
1377                 ct->timeout.expires = extra_jiffies;
1378                 event = IPCT_REFRESH;
1379         } else {
1380                 /* Need del_timer for race avoidance (may already be dying). */
1381                 if (del_timer(&ct->timeout)) {
1382                         ct->timeout.expires = jiffies + extra_jiffies;
1383                         add_timer(&ct->timeout);
1384                         event = IPCT_REFRESH;
1385                 }
1386         }
1387
1388 #ifdef CONFIG_NF_CT_ACCT
1389         if (do_acct) {
1390                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1391                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1392                         skb->len - (unsigned int)(skb->nh.raw - skb->data);
1393                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1394                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1395                         event |= IPCT_COUNTER_FILLING;
1396         }
1397 #endif
1398
1399         write_unlock_bh(&nf_conntrack_lock);
1400
1401         /* must be unlocked when calling event cache */
1402         if (event)
1403                 nf_conntrack_event_cache(event, skb);
1404 }
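/*
 * Usage sketch (illustrative): protocol trackers extend the timeout of a
 * connection after a valid packet; with do_acct == 0 only the timer is
 * refreshed, with do_acct == 1 the per-direction counters are also updated
 * (when CONFIG_NF_CT_ACCT is enabled):
 *
 *	__nf_ct_refresh_acct(ct, ctinfo, skb, 30 * HZ, 1);
 */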
1405
1406 #if defined(CONFIG_NF_CT_NETLINK) || \
1407     defined(CONFIG_NF_CT_NETLINK_MODULE)
1408
1409 #include <linux/netfilter/nfnetlink.h>
1410 #include <linux/netfilter/nfnetlink_conntrack.h>
1411
1412 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1413  * in nf_conntrack_core, since we don't want the protocols to autoload
1414  * or depend on ctnetlink */
1415 int nf_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1416                                const struct nf_conntrack_tuple *tuple)
1417 {
1418         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1419                 &tuple->src.u.tcp.port);
1420         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1421                 &tuple->dst.u.tcp.port);
1422         return 0;
1423
1424 nfattr_failure:
1425         return -1;
1426 }
1427
1428 static const size_t cta_min_proto[CTA_PROTO_MAX] = {
1429         [CTA_PROTO_SRC_PORT-1]  = sizeof(u_int16_t),
1430         [CTA_PROTO_DST_PORT-1]  = sizeof(u_int16_t)
1431 };
1432
1433 int nf_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1434                                struct nf_conntrack_tuple *t)
1435 {
1436         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1437                 return -EINVAL;
1438
1439         if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto))
1440                 return -EINVAL;
1441
1442         t->src.u.tcp.port =
1443                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1444         t->dst.u.tcp.port =
1445                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1446
1447         return 0;
1448 }
1449 #endif
1450
1451 /* Used by ipt_REJECT and ip6t_REJECT. */
1452 void __nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1453 {
1454         struct nf_conn *ct;
1455         enum ip_conntrack_info ctinfo;
1456
1457         /* This ICMP is in reverse direction to the packet which caused it */
1458         ct = nf_ct_get(skb, &ctinfo);
1459         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1460                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1461         else
1462                 ctinfo = IP_CT_RELATED;
1463
1464         /* Attach to new skbuff, and increment count */
1465         nskb->nfct = &ct->ct_general;
1466         nskb->nfctinfo = ctinfo;
1467         nf_conntrack_get(nskb->nfct);
1468 }
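/* Illustrative sketch (hypothetical): the REJECT targets do not call
 * __nf_conntrack_attach() directly; they typically go through the
 * nf_ct_attach() wrapper, which dereferences the ip_ct_attach pointer set
 * in nf_conntrack_init() below.  Roughly: */
#if 0	/* illustrative only */
	/* nskb is the locally generated RST/ICMP error, oldskb the
	 * offending packet; tie nskb to oldskb's conntrack entry so the
	 * reply is treated as RELATED traffic. */
	nf_ct_attach(nskb, oldskb);
#endif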
1469
1470 static inline int
1471 do_iter(const struct nf_conntrack_tuple_hash *i,
1472         int (*iter)(struct nf_conn *i, void *data),
1473         void *data)
1474 {
1475         return iter(nf_ct_tuplehash_to_ctrack(i), data);
1476 }
1477
1478 /* Bring out ya dead! */
1479 static struct nf_conntrack_tuple_hash *
1480 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
1481                 void *data, unsigned int *bucket)
1482 {
1483         struct nf_conntrack_tuple_hash *h = NULL;
1484
1485         write_lock_bh(&nf_conntrack_lock);
1486         for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1487                 h = LIST_FIND_W(&nf_conntrack_hash[*bucket], do_iter,
1488                                 struct nf_conntrack_tuple_hash *, iter, data);
1489                 if (h)
1490                         break;
1491         }
1492         if (!h)
1493                 h = LIST_FIND_W(&unconfirmed, do_iter,
1494                                 struct nf_conntrack_tuple_hash *, iter, data);
1495         if (h)
1496                 atomic_inc(&nf_ct_tuplehash_to_ctrack(h)->ct_general.use);
1497         write_unlock_bh(&nf_conntrack_lock);
1498
1499         return h;
1500 }
1501
1502 void
1503 nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data)
1504 {
1505         struct nf_conntrack_tuple_hash *h;
1506         unsigned int bucket = 0;
1507
1508         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1509                 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
1510                 /* Time to push up daisies... */
1511                 if (del_timer(&ct->timeout))
1512                         death_by_timeout((unsigned long)ct);
1513                 /* ... else the timer will get him soon. */
1514
1515                 nf_ct_put(ct);
1516         }
1517 }
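/* Illustrative sketch (hypothetical): users of nf_ct_iterate_cleanup()
 * supply a predicate that returns non-zero for every conntrack entry that
 * should be killed.  For example, flushing all entries whose
 * original-direction source address matches a given IPv4 address: */
#if 0	/* illustrative only */
static int kill_by_saddr(struct nf_conn *i, void *data)
{
	u_int32_t saddr = *(u_int32_t *)data;

	return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == saddr;
}

/* ... then: nf_ct_iterate_cleanup(kill_by_saddr, &saddr); */
#endif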
1518
1519 static int kill_all(struct nf_conn *i, void *data)
1520 {
1521         return 1;
1522 }
1523
1524 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1525 {
1526         if (vmalloced)
1527                 vfree(hash);
1528         else
1529                 free_pages((unsigned long)hash, 
1530                            get_order(sizeof(struct list_head) * size));
1531 }
1532
1533 void nf_conntrack_flush(void)
1534 {
1535         nf_ct_iterate_cleanup(kill_all, NULL);
1536 }
1537
1538 /* Mishearing the voices in his head, our hero wonders how he's
1539    supposed to kill the mall. */
1540 void nf_conntrack_cleanup(void)
1541 {
1542         int i;
1543
1544         ip_ct_attach = NULL;
1545
1546         /* This makes sure all current packets have passed through
1547            the netfilter framework.  Roll on, two-stage module
1548            delete... */
1549         synchronize_net();
1550
1551         nf_ct_event_cache_flush();
1552  i_see_dead_people:
1553         nf_conntrack_flush();
1554         if (atomic_read(&nf_conntrack_count) != 0) {
1555                 schedule();
1556                 goto i_see_dead_people;
1557         }
1558         /* wait until all references to nf_conntrack_untracked are dropped */
1559         while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
1560                 schedule();
1561
1562         for (i = 0; i < NF_CT_F_NUM; i++) {
1563                 if (nf_ct_cache[i].use == 0)
1564                         continue;
1565
1566                 NF_CT_ASSERT(nf_ct_cache[i].use == 1);
1567                 nf_ct_cache[i].use = 1;
1568                 nf_conntrack_unregister_cache(i);
1569         }
1570         kmem_cache_destroy(nf_conntrack_expect_cachep);
1571         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1572                             nf_conntrack_htable_size);
1573
1574         /* free l3proto protocol tables */
1575         for (i = 0; i < PF_MAX; i++)
1576                 if (nf_ct_protos[i]) {
1577                         kfree(nf_ct_protos[i]);
1578                         nf_ct_protos[i] = NULL;
1579                 }
1580 }
1581
1582 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1583 {
1584         struct list_head *hash;
1585         unsigned int i;
1586
1587         *vmalloced = 0; 
1588         hash = (void*)__get_free_pages(GFP_KERNEL, 
1589                                        get_order(sizeof(struct list_head)
1590                                                  * size));
1591         if (!hash) { 
1592                 *vmalloced = 1;
1593                 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1594                 hash = vmalloc(sizeof(struct list_head) * size);
1595         }
1596
1597         if (hash)
1598                 for (i = 0; i < size; i++) 
1599                         INIT_LIST_HEAD(&hash[i]);
1600
1601         return hash;
1602 }
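/* Worked example (illustrative, assuming 4kB pages): a table of 16384
 * buckets needs 16384 * sizeof(struct list_head) = 128kB, i.e. an order-5
 * page allocation; only if that contiguous allocation fails does the code
 * above fall back to vmalloc(). */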
1603
1604 int set_hashsize(const char *val, struct kernel_param *kp)
1605 {
1606         int i, bucket, hashsize, vmalloced;
1607         int old_vmalloced, old_size;
1608         int rnd;
1609         struct list_head *hash, *old_hash;
1610         struct nf_conntrack_tuple_hash *h;
1611
1612         /* On boot, we can set this without any fancy locking. */
1613         if (!nf_conntrack_htable_size)
1614                 return param_set_uint(val, kp);
1615
1616         hashsize = simple_strtol(val, NULL, 0);
1617         if (!hashsize)
1618                 return -EINVAL;
1619
1620         hash = alloc_hashtable(hashsize, &vmalloced);
1621         if (!hash)
1622                 return -ENOMEM;
1623
1624         /* We have to rehash for the new table anyway, so we can also
1625          * use a new random seed. */
1626         get_random_bytes(&rnd, 4);
1627
1628         write_lock_bh(&nf_conntrack_lock);
1629         for (i = 0; i < nf_conntrack_htable_size; i++) {
1630                 while (!list_empty(&nf_conntrack_hash[i])) {
1631                         h = list_entry(nf_conntrack_hash[i].next,
1632                                        struct nf_conntrack_tuple_hash, list);
1633                         list_del(&h->list);
1634                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1635                         list_add_tail(&h->list, &hash[bucket]);
1636                 }
1637         }
1638         old_size = nf_conntrack_htable_size;
1639         old_vmalloced = nf_conntrack_vmalloc;
1640         old_hash = nf_conntrack_hash;
1641
1642         nf_conntrack_htable_size = hashsize;
1643         nf_conntrack_vmalloc = vmalloced;
1644         nf_conntrack_hash = hash;
1645         nf_conntrack_hash_rnd = rnd;
1646         write_unlock_bh(&nf_conntrack_lock);
1647
1648         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1649         return 0;
1650 }
1651
1652 module_param_call(hashsize, set_hashsize, param_get_uint,
1653                   &nf_conntrack_htable_size, 0600);
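/* Usage note (assuming the module is built as nf_conntrack.ko): with the
 * 0600 mode above, a privileged user can resize the table at runtime by
 * writing to /sys/module/nf_conntrack/parameters/hashsize; the write lands
 * in set_hashsize(), which allocates the new table, rehashes every entry
 * under nf_conntrack_lock and frees the old table.  At boot, the same
 * parameter simply sets nf_conntrack_htable_size before init runs. */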
1654
1655 int __init nf_conntrack_init(void)
1656 {
1657         unsigned int i;
1658         int ret;
1659
1660         /* Idea from tcp.c: use 1/16384 of memory.  On i386 a 32MB
1661          * machine has 256 buckets; >= 1GB machines have 8192 buckets. */
1662         if (!nf_conntrack_htable_size) {
1663                 nf_conntrack_htable_size
1664                         = (((num_physpages << PAGE_SHIFT) / 16384)
1665                            / sizeof(struct list_head));
1666                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1667                         nf_conntrack_htable_size = 8192;
1668                 if (nf_conntrack_htable_size < 16)
1669                         nf_conntrack_htable_size = 16;
1670         }
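        /* Worked example of the sizing above (illustrative): with
         * sizeof(struct list_head) == 8 on i386, a 32MB machine gets
         * (32MB / 16384) / 8 = 256 buckets; a 1GB machine would compute
         * (1GB / 16384) / 8 = 8192, which is also the cap applied above,
         * and the floor is 16 buckets. */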
1671         nf_conntrack_max = 8 * nf_conntrack_htable_size;
1672
1673         printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1674                NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1675                nf_conntrack_max);
1676
1677         nf_conntrack_hash = alloc_hashtable(nf_conntrack_htable_size,
1678                                             &nf_conntrack_vmalloc);
1679         if (!nf_conntrack_hash) {
1680                 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
1681                 goto err_out;
1682         }
1683
1684         ret = nf_conntrack_register_cache(NF_CT_F_BASIC, "nf_conntrack:basic",
1685                                           sizeof(struct nf_conn));
1686         if (ret < 0) {
1687                 printk(KERN_ERR "Unable to create nf_conn slab cache\n");
1688                 goto err_free_hash;
1689         }
1690
1691         nf_conntrack_expect_cachep = kmem_cache_create("nf_conntrack_expect",
1692                                         sizeof(struct nf_conntrack_expect),
1693                                         0, 0, NULL, NULL);
1694         if (!nf_conntrack_expect_cachep) {
1695                 printk(KERN_ERR "Unable to create nf_expect slab cache\n");
1696                 goto err_free_conntrack_slab;
1697         }
1698
1699         /* Don't NEED lock here, but good form anyway. */
1700         write_lock_bh(&nf_conntrack_lock);
1701         for (i = 0; i < PF_MAX; i++)
1702                 nf_ct_l3protos[i] = &nf_conntrack_generic_l3proto;
1703         write_unlock_bh(&nf_conntrack_lock);
1704
1705         /* For use by REJECT target */
1706         ip_ct_attach = __nf_conntrack_attach;
1707
1708         /* Set up fake conntrack:
1709             - to never be deleted, not in any hashes */
1710         atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
1711         /*  - and make it look like a confirmed connection */
1712         set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);
1713
1714         return ret;
1715
1716 err_free_conntrack_slab:
1717         nf_conntrack_unregister_cache(NF_CT_F_BASIC);
1718 err_free_hash:
1719         free_conntrack_hash(nf_conntrack_hash, nf_conntrack_vmalloc,
1720                             nf_conntrack_htable_size);
1721 err_out:
1722         return -ENOMEM;
1723 }