[NETFILTER]: kill listhelp.h
net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50
51 #define IP_CONNTRACK_VERSION    "2.4"
52
53 #if 0
54 #define DEBUGP printk
55 #else
56 #define DEBUGP(format, args...)
57 #endif
58
59 DEFINE_RWLOCK(ip_conntrack_lock);
60
61 /* ip_conntrack_standalone needs this */
62 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
63
64 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
65 LIST_HEAD(ip_conntrack_expect_list);
66 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
67 static LIST_HEAD(helpers);
68 unsigned int ip_conntrack_htable_size __read_mostly = 0;
69 int ip_conntrack_max __read_mostly;
70 struct list_head *ip_conntrack_hash;
71 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
72 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
73 struct ip_conntrack ip_conntrack_untracked;
74 unsigned int ip_ct_log_invalid __read_mostly;
75 static LIST_HEAD(unconfirmed);
76 static int ip_conntrack_vmalloc;
77
78 static unsigned int ip_conntrack_next_id;
79 static unsigned int ip_conntrack_expect_next_id;
80 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
81 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
83
84 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
85
86 /* deliver cached events and clear cache entry - must be called with locally
87  * disabled softirqs */
88 static inline void
89 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
90 {
91         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
92         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
93                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
94                                     ecache->ct);
95         ecache->events = 0;
96         ip_conntrack_put(ecache->ct);
97         ecache->ct = NULL;
98 }
99
100 /* Deliver all cached events for a particular conntrack. This is called
101  * by code prior to async packet handling or freeing the skb */
102 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
103 {
104         struct ip_conntrack_ecache *ecache;
105         
106         local_bh_disable();
107         ecache = &__get_cpu_var(ip_conntrack_ecache);
108         if (ecache->ct == ct)
109                 __ip_ct_deliver_cached_events(ecache);
110         local_bh_enable();
111 }
112
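/* Start caching events for a new conntrack on the current CPU; any events
 * still pending for a previously cached conntrack are delivered first. */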
113 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
114 {
115         struct ip_conntrack_ecache *ecache;
116
117         /* take care of delivering potentially old events */
118         ecache = &__get_cpu_var(ip_conntrack_ecache);
119         BUG_ON(ecache->ct == ct);
120         if (ecache->ct)
121                 __ip_ct_deliver_cached_events(ecache);
122         /* initialize for this conntrack/packet */
123         ecache->ct = ct;
124         nf_conntrack_get(&ct->ct_general);
125 }
126
127 /* flush the event cache - touches other CPUs' data and must not be called while
128  * packets are still passing through the code */
129 static void ip_ct_event_cache_flush(void)
130 {
131         struct ip_conntrack_ecache *ecache;
132         int cpu;
133
134         for_each_possible_cpu(cpu) {
135                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
136                 if (ecache->ct)
137                         ip_conntrack_put(ecache->ct);
138         }
139 }
140 #else
141 static inline void ip_ct_event_cache_flush(void) {}
142 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
143
144 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
145
146 static int ip_conntrack_hash_rnd_initted;
147 static unsigned int ip_conntrack_hash_rnd;
148
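/* Hash a tuple into the conntrack table: jhash_3words() mixes the source
 * address, the destination address xor'd with the protocol number, and both
 * layer-4 port/id fields (destination shifted into the high 16 bits), keyed
 * by the random seed; the result is reduced modulo the table size. */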
149 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
150                             unsigned int size, unsigned int rnd)
151 {
152         return (jhash_3words(tuple->src.ip,
153                              (tuple->dst.ip ^ tuple->dst.protonum),
154                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
155                              rnd) % size);
156 }
157
158 static u_int32_t
159 hash_conntrack(const struct ip_conntrack_tuple *tuple)
160 {
161         return __hash_conntrack(tuple, ip_conntrack_htable_size,
162                                 ip_conntrack_hash_rnd);
163 }
164
165 int
166 ip_ct_get_tuple(const struct iphdr *iph,
167                 const struct sk_buff *skb,
168                 unsigned int dataoff,
169                 struct ip_conntrack_tuple *tuple,
170                 const struct ip_conntrack_protocol *protocol)
171 {
172         /* Never happens */
173         if (iph->frag_off & htons(IP_OFFSET)) {
174                 printk("ip_conntrack_core: Frag of proto %u.\n",
175                        iph->protocol);
176                 return 0;
177         }
178
179         tuple->src.ip = iph->saddr;
180         tuple->dst.ip = iph->daddr;
181         tuple->dst.protonum = iph->protocol;
182         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
183
184         return protocol->pkt_to_tuple(skb, dataoff, tuple);
185 }
186
187 int
188 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
189                    const struct ip_conntrack_tuple *orig,
190                    const struct ip_conntrack_protocol *protocol)
191 {
192         inverse->src.ip = orig->dst.ip;
193         inverse->dst.ip = orig->src.ip;
194         inverse->dst.protonum = orig->dst.protonum;
195         inverse->dst.dir = !orig->dst.dir;
196
197         return protocol->invert_tuple(inverse, orig);
198 }
199
200
201 /* ip_conntrack_expect helper functions */
202 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
203 {
204         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
205         IP_NF_ASSERT(!timer_pending(&exp->timeout));
206         list_del(&exp->list);
207         CONNTRACK_STAT_INC(expect_delete);
208         exp->master->expecting--;
209         ip_conntrack_expect_put(exp);
210 }
211
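/* Timer callback for an expectation that was never matched: unlink it from
 * the global list under the lock (dropping the list's reference), then drop
 * the reference held by the timer. */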
212 static void expectation_timed_out(unsigned long ul_expect)
213 {
214         struct ip_conntrack_expect *exp = (void *)ul_expect;
215
216         write_lock_bh(&ip_conntrack_lock);
217         ip_ct_unlink_expect(exp);
218         write_unlock_bh(&ip_conntrack_lock);
219         ip_conntrack_expect_put(exp);
220 }
221
222 struct ip_conntrack_expect *
223 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
224 {
225         struct ip_conntrack_expect *i;
226         
227         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
228                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
229                         atomic_inc(&i->use);
230                         return i;
231                 }
232         }
233         return NULL;
234 }
235
236 /* Just find an expectation corresponding to a tuple. */
237 struct ip_conntrack_expect *
238 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
239 {
240         struct ip_conntrack_expect *i;
241         
242         read_lock_bh(&ip_conntrack_lock);
243         i = __ip_conntrack_expect_find(tuple);
244         read_unlock_bh(&ip_conntrack_lock);
245
246         return i;
247 }
248
249 /* If an expectation for this connection is found, it is deleted from the
250  * global list and then returned. */
251 static struct ip_conntrack_expect *
252 find_expectation(const struct ip_conntrack_tuple *tuple)
253 {
254         struct ip_conntrack_expect *i;
255
256         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
257                 /* If master is not in hash table yet (ie. packet hasn't left
258                    this machine yet), how can other end know about expected?
259                    Hence these are not the droids you are looking for (if
260                    master ct never got confirmed, we'd hold a reference to it
261                    and weird things would happen to future packets). */
262                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
263                     && is_confirmed(i->master)) {
264                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
265                                 atomic_inc(&i->use);
266                                 return i;
267                         } else if (del_timer(&i->timeout)) {
268                                 ip_ct_unlink_expect(i);
269                                 return i;
270                         }
271                 }
272         }
273         return NULL;
274 }
275
276 /* delete all expectations for this conntrack */
277 void ip_ct_remove_expectations(struct ip_conntrack *ct)
278 {
279         struct ip_conntrack_expect *i, *tmp;
280
281         /* Optimization: most connections never expect any others. */
282         if (ct->expecting == 0)
283                 return;
284
285         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
286                 if (i->master == ct && del_timer(&i->timeout)) {
287                         ip_ct_unlink_expect(i);
288                         ip_conntrack_expect_put(i);
289                 }
290         }
291 }
292
293 static void
294 clean_from_lists(struct ip_conntrack *ct)
295 {
296         DEBUGP("clean_from_lists(%p)\n", ct);
297         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
298         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
299         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
300
301         /* Destroy all pending expectations */
302         ip_ct_remove_expectations(ct);
303 }
304
305 static void
306 destroy_conntrack(struct nf_conntrack *nfct)
307 {
308         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
309         struct ip_conntrack_protocol *proto;
310
311         DEBUGP("destroy_conntrack(%p)\n", ct);
312         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
313         IP_NF_ASSERT(!timer_pending(&ct->timeout));
314
315         ip_conntrack_event(IPCT_DESTROY, ct);
316         set_bit(IPS_DYING_BIT, &ct->status);
317
318         /* To make sure we don't get any weird locking issues here:
319          * destroy_conntrack() MUST NOT be called with a write lock
320          * to ip_conntrack_lock!!! -HW */
321         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322         if (proto && proto->destroy)
323                 proto->destroy(ct);
324
325         if (ip_conntrack_destroyed)
326                 ip_conntrack_destroyed(ct);
327
328         write_lock_bh(&ip_conntrack_lock);
329         /* Expectations will have been removed in clean_from_lists,
330          * except that TFTP can create an expectation on the first packet,
331          * before the connection is in the list, so we need to clean here,
332          * too. */
333         ip_ct_remove_expectations(ct);
334
335         /* We overload first tuple to link into unconfirmed list. */
336         if (!is_confirmed(ct)) {
337                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339         }
340
341         CONNTRACK_STAT_INC(delete);
342         write_unlock_bh(&ip_conntrack_lock);
343
344         if (ct->master)
345                 ip_conntrack_put(ct->master);
346
347         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
348         ip_conntrack_free(ct);
349 }
350
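/* Timer callback: take the conntrack off the hash lists under the lock and
 * drop the reference the hash table held; the final put ends up in
 * destroy_conntrack(). */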
351 static void death_by_timeout(unsigned long ul_conntrack)
352 {
353         struct ip_conntrack *ct = (void *)ul_conntrack;
354
355         write_lock_bh(&ip_conntrack_lock);
356         /* Inside lock so preempt is disabled on module removal path.
357          * Otherwise we can get spurious warnings. */
358         CONNTRACK_STAT_INC(delete_list);
359         clean_from_lists(ct);
360         write_unlock_bh(&ip_conntrack_lock);
361         ip_conntrack_put(ct);
362 }
363
364 struct ip_conntrack_tuple_hash *
365 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366                     const struct ip_conntrack *ignored_conntrack)
367 {
368         struct ip_conntrack_tuple_hash *h;
369         unsigned int hash = hash_conntrack(tuple);
370
371         ASSERT_READ_LOCK(&ip_conntrack_lock);
372         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
373                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
374                     ip_ct_tuple_equal(tuple, &h->tuple)) {
375                         CONNTRACK_STAT_INC(found);
376                         return h;
377                 }
378                 CONNTRACK_STAT_INC(searched);
379         }
380
381         return NULL;
382 }
383
384 /* Find a connection corresponding to a tuple. */
385 struct ip_conntrack_tuple_hash *
386 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
387                       const struct ip_conntrack *ignored_conntrack)
388 {
389         struct ip_conntrack_tuple_hash *h;
390
391         read_lock_bh(&ip_conntrack_lock);
392         h = __ip_conntrack_find(tuple, ignored_conntrack);
393         if (h)
394                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
395         read_unlock_bh(&ip_conntrack_lock);
396
397         return h;
398 }
399
400 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
401                                         unsigned int hash,
402                                         unsigned int repl_hash) 
403 {
404         ct->id = ++ip_conntrack_next_id;
405         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
406                  &ip_conntrack_hash[hash]);
407         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
408                  &ip_conntrack_hash[repl_hash]);
409 }
410
411 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
412 {
413         unsigned int hash, repl_hash;
414
415         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
416         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
417
418         write_lock_bh(&ip_conntrack_lock);
419         __ip_conntrack_hash_insert(ct, hash, repl_hash);
420         write_unlock_bh(&ip_conntrack_lock);
421 }
422
423 /* Confirm a connection given skb; places it in hash table */
424 int
425 __ip_conntrack_confirm(struct sk_buff **pskb)
426 {
427         unsigned int hash, repl_hash;
428         struct ip_conntrack_tuple_hash *h;
429         struct ip_conntrack *ct;
430         enum ip_conntrack_info ctinfo;
431
432         ct = ip_conntrack_get(*pskb, &ctinfo);
433
434         /* ipt_REJECT uses ip_conntrack_attach to attach related
435            ICMP/TCP RST packets in the other direction.  The actual packet
436            which created the connection will be IP_CT_NEW or, for an
437            expected connection, IP_CT_RELATED. */
438         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
439                 return NF_ACCEPT;
440
441         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
442         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
443
444         /* We're not in hash table, and we refuse to set up related
445            connections for unconfirmed conns.  But packet copies and
446            REJECT will give spurious warnings here. */
447         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
448
449         /* No external references means no one else could have
450            confirmed us. */
451         IP_NF_ASSERT(!is_confirmed(ct));
452         DEBUGP("Confirming conntrack %p\n", ct);
453
454         write_lock_bh(&ip_conntrack_lock);
455
456         /* See if there's one in the list already, including reverse:
457            NAT could have grabbed it without realizing, since we're
458            not in the hash.  If there is, we lost the race. */
459         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
460                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
461                                       &h->tuple))
462                         goto out;
463         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
464                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
465                                       &h->tuple))
466                         goto out;
467
468         /* Remove from unconfirmed list */
469         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
470
471         __ip_conntrack_hash_insert(ct, hash, repl_hash);
472         /* Timer relative to confirmation time, not original
473            setting time, otherwise we'd get timer wrap in
474            weird delay cases. */
475         ct->timeout.expires += jiffies;
476         add_timer(&ct->timeout);
477         atomic_inc(&ct->ct_general.use);
478         set_bit(IPS_CONFIRMED_BIT, &ct->status);
479         CONNTRACK_STAT_INC(insert);
480         write_unlock_bh(&ip_conntrack_lock);
481         if (ct->helper)
482                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
483 #ifdef CONFIG_IP_NF_NAT_NEEDED
484         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
485             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
486                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
487 #endif
488         ip_conntrack_event_cache(master_ct(ct) ?
489                                  IPCT_RELATED : IPCT_NEW, *pskb);
490
491         return NF_ACCEPT;
492
493 out:
494         CONNTRACK_STAT_INC(insert_failed);
495         write_unlock_bh(&ip_conntrack_lock);
496         return NF_DROP;
497 }
498
499 /* Returns true if a connection corresponds to the tuple (required
500    for NAT). */
501 int
502 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
503                          const struct ip_conntrack *ignored_conntrack)
504 {
505         struct ip_conntrack_tuple_hash *h;
506
507         read_lock_bh(&ip_conntrack_lock);
508         h = __ip_conntrack_find(tuple, ignored_conntrack);
509         read_unlock_bh(&ip_conntrack_lock);
510
511         return h != NULL;
512 }
513
514 /* There's a small race here where we may free a just-assured
515    connection.  Too bad: we're in trouble anyway. */
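/* Drop one unassured conntrack from this hash chain to make room for a new
 * entry; returns 1 if a connection was dropped. */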
516 static int early_drop(struct list_head *chain)
517 {
518         /* Traverse backwards: gives us oldest, which is roughly LRU */
519         struct ip_conntrack_tuple_hash *h;
520         struct ip_conntrack *ct = NULL, *tmp;
521         int dropped = 0;
522
523         read_lock_bh(&ip_conntrack_lock);
524         list_for_each_entry_reverse(h, chain, list) {
525                 tmp = tuplehash_to_ctrack(h);
526                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
527                         ct = tmp;
528                         atomic_inc(&ct->ct_general.use);
529                         break;
530                 }
531         }
532         read_unlock_bh(&ip_conntrack_lock);
533
534         if (!ct)
535                 return dropped;
536
537         if (del_timer(&ct->timeout)) {
538                 death_by_timeout((unsigned long)ct);
539                 dropped = 1;
540                 CONNTRACK_STAT_INC(early_drop);
541         }
542         ip_conntrack_put(ct);
543         return dropped;
544 }
545
546 static struct ip_conntrack_helper *
547 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
548 {
549         struct ip_conntrack_helper *h;
550
551         list_for_each_entry(h, &helpers, list) {
552                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
553                         return h;
554         }
555         return NULL;
556 }
557
558 struct ip_conntrack_helper *
559 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
560 {
561         struct ip_conntrack_helper *helper;
562
563         /* need ip_conntrack_lock to assure that helper exists until
564          * try_module_get() is called */
565         read_lock_bh(&ip_conntrack_lock);
566
567         helper = __ip_conntrack_helper_find(tuple);
568         if (helper) {
569                 /* need to increase module usage count to assure helper will
570                  * not go away while the caller is e.g. busy putting a
571                  * conntrack in the hash that uses the helper */
572                 if (!try_module_get(helper->me))
573                         helper = NULL;
574         }
575
576         read_unlock_bh(&ip_conntrack_lock);
577
578         return helper;
579 }
580
581 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
582 {
583         module_put(helper->me);
584 }
585
586 struct ip_conntrack_protocol *
587 __ip_conntrack_proto_find(u_int8_t protocol)
588 {
589         return ip_ct_protos[protocol];
590 }
591
592 /* this is guaranteed to always return a valid protocol helper, since
593  * it falls back to generic_protocol */
594 struct ip_conntrack_protocol *
595 ip_conntrack_proto_find_get(u_int8_t protocol)
596 {
597         struct ip_conntrack_protocol *p;
598
599         preempt_disable();
600         p = __ip_conntrack_proto_find(protocol);
601         if (p) {
602                 if (!try_module_get(p->me))
603                         p = &ip_conntrack_generic_protocol;
604         }
605         preempt_enable();
606         
607         return p;
608 }
609
610 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
611 {
612         module_put(p->me);
613 }
614
615 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
616                                         struct ip_conntrack_tuple *repl)
617 {
618         struct ip_conntrack *conntrack;
619
620         if (!ip_conntrack_hash_rnd_initted) {
621                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
622                 ip_conntrack_hash_rnd_initted = 1;
623         }
624
625         if (ip_conntrack_max
626             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
627                 unsigned int hash = hash_conntrack(orig);
628                 /* Try dropping from this hash chain. */
629                 if (!early_drop(&ip_conntrack_hash[hash])) {
630                         if (net_ratelimit())
631                                 printk(KERN_WARNING
632                                        "ip_conntrack: table full, dropping"
633                                        " packet.\n");
634                         return ERR_PTR(-ENOMEM);
635                 }
636         }
637
638         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
639         if (!conntrack) {
640                 DEBUGP("Can't allocate conntrack.\n");
641                 return ERR_PTR(-ENOMEM);
642         }
643
644         memset(conntrack, 0, sizeof(*conntrack));
645         atomic_set(&conntrack->ct_general.use, 1);
646         conntrack->ct_general.destroy = destroy_conntrack;
647         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
648         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
649         /* Don't set timer yet: wait for confirmation */
650         init_timer(&conntrack->timeout);
651         conntrack->timeout.data = (unsigned long)conntrack;
652         conntrack->timeout.function = death_by_timeout;
653
654         atomic_inc(&ip_conntrack_count);
655
656         return conntrack;
657 }
658
659 void
660 ip_conntrack_free(struct ip_conntrack *conntrack)
661 {
662         atomic_dec(&ip_conntrack_count);
663         kmem_cache_free(ip_conntrack_cachep, conntrack);
664 }
665
666 /* Allocate a new conntrack: we return -ENOMEM if classification
667  * failed due to stress.   Otherwise it really is unclassifiable */
668 static struct ip_conntrack_tuple_hash *
669 init_conntrack(struct ip_conntrack_tuple *tuple,
670                struct ip_conntrack_protocol *protocol,
671                struct sk_buff *skb)
672 {
673         struct ip_conntrack *conntrack;
674         struct ip_conntrack_tuple repl_tuple;
675         struct ip_conntrack_expect *exp;
676
677         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
678                 DEBUGP("Can't invert tuple.\n");
679                 return NULL;
680         }
681
682         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
683         if (conntrack == NULL || IS_ERR(conntrack))
684                 return (struct ip_conntrack_tuple_hash *)conntrack;
685
686         if (!protocol->new(conntrack, skb)) {
687                 ip_conntrack_free(conntrack);
688                 return NULL;
689         }
690
691         write_lock_bh(&ip_conntrack_lock);
692         exp = find_expectation(tuple);
693
694         if (exp) {
695                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
696                         conntrack, exp);
697                 /* Welcome, Mr. Bond.  We've been expecting you... */
698                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
699                 conntrack->master = exp->master;
700 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
701                 conntrack->mark = exp->master->mark;
702 #endif
703 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
704     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
705                 /* this is ugly, but there is no other place to put it */
706                 conntrack->nat.masq_index = exp->master->nat.masq_index;
707 #endif
708 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
709                 conntrack->secmark = exp->master->secmark;
710 #endif
711                 nf_conntrack_get(&conntrack->master->ct_general);
712                 CONNTRACK_STAT_INC(expect_new);
713         } else {
714                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
715
716                 CONNTRACK_STAT_INC(new);
717         }
718
719         /* Overload tuple linked list to put us in unconfirmed list. */
720         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
721
722         write_unlock_bh(&ip_conntrack_lock);
723
724         if (exp) {
725                 if (exp->expectfn)
726                         exp->expectfn(conntrack, exp);
727                 ip_conntrack_expect_put(exp);
728         }
729
730         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
731 }
732
733 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
734 static inline struct ip_conntrack *
735 resolve_normal_ct(struct sk_buff *skb,
736                   struct ip_conntrack_protocol *proto,
737                   int *set_reply,
738                   unsigned int hooknum,
739                   enum ip_conntrack_info *ctinfo)
740 {
741         struct ip_conntrack_tuple tuple;
742         struct ip_conntrack_tuple_hash *h;
743         struct ip_conntrack *ct;
744
745         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
746
747         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
748                                 &tuple, proto))
749                 return NULL;
750
751         /* look for tuple match */
752         h = ip_conntrack_find_get(&tuple, NULL);
753         if (!h) {
754                 h = init_conntrack(&tuple, proto, skb);
755                 if (!h)
756                         return NULL;
757                 if (IS_ERR(h))
758                         return (void *)h;
759         }
760         ct = tuplehash_to_ctrack(h);
761
762         /* It exists; we have (non-exclusive) reference. */
763         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
764                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
765                 /* Please set reply bit if this packet OK */
766                 *set_reply = 1;
767         } else {
768                 /* Once we've had two way comms, always ESTABLISHED. */
769                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
770                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
771                                ct);
772                         *ctinfo = IP_CT_ESTABLISHED;
773                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
774                         DEBUGP("ip_conntrack_in: related packet for %p\n",
775                                ct);
776                         *ctinfo = IP_CT_RELATED;
777                 } else {
778                         DEBUGP("ip_conntrack_in: new packet for %p\n",
779                                ct);
780                         *ctinfo = IP_CT_NEW;
781                 }
782                 *set_reply = 0;
783         }
784         skb->nfct = &ct->ct_general;
785         skb->nfctinfo = *ctinfo;
786         return ct;
787 }
788
789 /* Netfilter hook itself. */
790 unsigned int ip_conntrack_in(unsigned int hooknum,
791                              struct sk_buff **pskb,
792                              const struct net_device *in,
793                              const struct net_device *out,
794                              int (*okfn)(struct sk_buff *))
795 {
796         struct ip_conntrack *ct;
797         enum ip_conntrack_info ctinfo;
798         struct ip_conntrack_protocol *proto;
799         int set_reply = 0;
800         int ret;
801
802         /* Previously seen (loopback or untracked)?  Ignore. */
803         if ((*pskb)->nfct) {
804                 CONNTRACK_STAT_INC(ignore);
805                 return NF_ACCEPT;
806         }
807
808         /* Never happens */
809         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
810                 if (net_ratelimit()) {
811                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
812                                (*pskb)->nh.iph->protocol, hooknum);
813                 }
814                 return NF_DROP;
815         }
816
817 /* Doesn't cover locally-generated broadcast, so not worth it. */
818 #if 0
819         /* Ignore broadcast: no `connection'. */
820         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
821                 printk("Broadcast packet!\n");
822                 return NF_ACCEPT;
823         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
824                    == htonl(0x000000FF)) {
825                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
826                        NIPQUAD((*pskb)->nh.iph->saddr),
827                        NIPQUAD((*pskb)->nh.iph->daddr),
828                        (*pskb)->sk, (*pskb)->pkt_type);
829         }
830 #endif
831
832         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
833
834         /* It may be a special packet: error, unclean, etc.
835          * The inverse of the return code tells the netfilter
836          * core what to do with the packet. */
837         if (proto->error != NULL 
838             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
839                 CONNTRACK_STAT_INC(error);
840                 CONNTRACK_STAT_INC(invalid);
841                 return -ret;
842         }
843
844         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
845                 /* Not valid part of a connection */
846                 CONNTRACK_STAT_INC(invalid);
847                 return NF_ACCEPT;
848         }
849
850         if (IS_ERR(ct)) {
851                 /* Too stressed to deal. */
852                 CONNTRACK_STAT_INC(drop);
853                 return NF_DROP;
854         }
855
856         IP_NF_ASSERT((*pskb)->nfct);
857
858         ret = proto->packet(ct, *pskb, ctinfo);
859         if (ret < 0) {
860                 /* Invalid: the inverse of the return code tells
861                  * the netfilter core what to do */
862                 nf_conntrack_put((*pskb)->nfct);
863                 (*pskb)->nfct = NULL;
864                 CONNTRACK_STAT_INC(invalid);
865                 return -ret;
866         }
867
868         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
869                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
870
871         return ret;
872 }
873
874 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
875                    const struct ip_conntrack_tuple *orig)
876 {
877         return ip_ct_invert_tuple(inverse, orig, 
878                                   __ip_conntrack_proto_find(orig->dst.protonum));
879 }
880
881 /* Would two expected things clash? */
882 static inline int expect_clash(const struct ip_conntrack_expect *a,
883                                const struct ip_conntrack_expect *b)
884 {
885         /* The part covered by the intersection of the masks must differ,
886            otherwise the expectations clash */
887         struct ip_conntrack_tuple intersect_mask
888                 = { { a->mask.src.ip & b->mask.src.ip,
889                       { a->mask.src.u.all & b->mask.src.u.all } },
890                     { a->mask.dst.ip & b->mask.dst.ip,
891                       { a->mask.dst.u.all & b->mask.dst.u.all },
892                       a->mask.dst.protonum & b->mask.dst.protonum } };
893
894         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
895 }
896
897 static inline int expect_matches(const struct ip_conntrack_expect *a,
898                                  const struct ip_conntrack_expect *b)
899 {
900         return a->master == b->master
901                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
902                 && ip_ct_tuple_equal(&a->mask, &b->mask);
903 }
904
905 /* Generally a bad idea to call this: could have matched already. */
906 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
907 {
908         struct ip_conntrack_expect *i;
909
910         write_lock_bh(&ip_conntrack_lock);
911         /* choose the oldest expectation to evict */
912         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
913                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
914                         ip_ct_unlink_expect(i);
915                         write_unlock_bh(&ip_conntrack_lock);
916                         ip_conntrack_expect_put(i);
917                         return;
918                 }
919         }
920         write_unlock_bh(&ip_conntrack_lock);
921 }
922
923 /* We don't increase the master conntrack refcount for non-fulfilled
924  * expectations. During conntrack destruction, the expectations are
925  * always killed before the conntrack itself. */
926 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
927 {
928         struct ip_conntrack_expect *new;
929
930         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
931         if (!new) {
932                 DEBUGP("expect_related: OOM allocating expect\n");
933                 return NULL;
934         }
935         new->master = me;
936         atomic_set(&new->use, 1);
937         return new;
938 }
939
940 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
941 {
942         if (atomic_dec_and_test(&exp->use))
943                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
944 }
945
946 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
947 {
948         atomic_inc(&exp->use);
949         exp->master->expecting++;
950         list_add(&exp->list, &ip_conntrack_expect_list);
951
952         init_timer(&exp->timeout);
953         exp->timeout.data = (unsigned long)exp;
954         exp->timeout.function = expectation_timed_out;
955         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
956         add_timer(&exp->timeout);
957
958         exp->id = ++ip_conntrack_expect_next_id;
959         atomic_inc(&exp->use);
960         CONNTRACK_STAT_INC(expect_create);
961 }
962
963 /* Race with expectations being used means we could have none to find; OK. */
964 static void evict_oldest_expect(struct ip_conntrack *master)
965 {
966         struct ip_conntrack_expect *i;
967
968         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
969                 if (i->master == master) {
970                         if (del_timer(&i->timeout)) {
971                                 ip_ct_unlink_expect(i);
972                                 ip_conntrack_expect_put(i);
973                         }
974                         break;
975                 }
976         }
977 }
978
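/* Restart an expectation's timeout; returns 0 if the timer had already
 * expired, i.e. the expectation is already being torn down. */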
979 static inline int refresh_timer(struct ip_conntrack_expect *i)
980 {
981         if (!del_timer(&i->timeout))
982                 return 0;
983
984         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
985         add_timer(&i->timeout);
986         return 1;
987 }
988
989 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
990 {
991         struct ip_conntrack_expect *i;
992         int ret;
993
994         DEBUGP("ip_conntrack_expect_related %p\n", expect);
995         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
996         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
997
998         write_lock_bh(&ip_conntrack_lock);
999         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1000                 if (expect_matches(i, expect)) {
1001                         /* Refresh timer: if it's dying, ignore it. */
1002                         if (refresh_timer(i)) {
1003                                 ret = 0;
1004                                 goto out;
1005                         }
1006                 } else if (expect_clash(i, expect)) {
1007                         ret = -EBUSY;
1008                         goto out;
1009                 }
1010         }
1011
1012         /* Will be over limit? */
1013         if (expect->master->helper->max_expected && 
1014             expect->master->expecting >= expect->master->helper->max_expected)
1015                 evict_oldest_expect(expect->master);
1016
1017         ip_conntrack_expect_insert(expect);
1018         ip_conntrack_expect_event(IPEXP_NEW, expect);
1019         ret = 0;
1020 out:
1021         write_unlock_bh(&ip_conntrack_lock);
1022         return ret;
1023 }
1024
1025 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1026    implicitly racy: see __ip_conntrack_confirm */
1027 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1028                               const struct ip_conntrack_tuple *newreply)
1029 {
1030         write_lock_bh(&ip_conntrack_lock);
1031         /* Should be unconfirmed, so not in hash table yet */
1032         IP_NF_ASSERT(!is_confirmed(conntrack));
1033
1034         DEBUGP("Altering reply tuple of %p to ", conntrack);
1035         DUMP_TUPLE(newreply);
1036
1037         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1038         if (!conntrack->master && conntrack->expecting == 0)
1039                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1040         write_unlock_bh(&ip_conntrack_lock);
1041 }
1042
1043 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1044 {
1045         BUG_ON(me->timeout == 0);
1046         write_lock_bh(&ip_conntrack_lock);
1047         list_add(&me->list, &helpers);
1048         write_unlock_bh(&ip_conntrack_lock);
1049
1050         return 0;
1051 }
1052
1053 struct ip_conntrack_helper *
1054 __ip_conntrack_helper_find_byname(const char *name)
1055 {
1056         struct ip_conntrack_helper *h;
1057
1058         list_for_each_entry(h, &helpers, list) {
1059                 if (!strcmp(h->name, name))
1060                         return h;
1061         }
1062
1063         return NULL;
1064 }
1065
1066 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1067                           const struct ip_conntrack_helper *me)
1068 {
1069         if (tuplehash_to_ctrack(i)->helper == me) {
1070                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1071                 tuplehash_to_ctrack(i)->helper = NULL;
1072         }
1073 }
1074
1075 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1076 {
1077         unsigned int i;
1078         struct ip_conntrack_tuple_hash *h;
1079         struct ip_conntrack_expect *exp, *tmp;
1080
1081         /* Need write lock here, to delete helper. */
1082         write_lock_bh(&ip_conntrack_lock);
1083         list_del(&me->list);
1084
1085         /* Get rid of expectations */
1086         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1087                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1088                         ip_ct_unlink_expect(exp);
1089                         ip_conntrack_expect_put(exp);
1090                 }
1091         }
1092         /* Clear the helper pointer on all remaining conntracks. */
1093         list_for_each_entry(h, &unconfirmed, list)
1094                 unhelp(h, me);
1095         for (i = 0; i < ip_conntrack_htable_size; i++) {
1096                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1097                         unhelp(h, me);
1098         }
1099         write_unlock_bh(&ip_conntrack_lock);
1100
1101         /* Someone could be still looking at the helper in a bh. */
1102         synchronize_net();
1103 }
1104
1105 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1106 void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
1107                         enum ip_conntrack_info ctinfo,
1108                         const struct sk_buff *skb,
1109                         unsigned long extra_jiffies,
1110                         int do_acct)
1111 {
1112         int event = 0;
1113
1114         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1115         IP_NF_ASSERT(skb);
1116
1117         write_lock_bh(&ip_conntrack_lock);
1118
1119         /* Only update if this is not a fixed timeout */
1120         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1121                 write_unlock_bh(&ip_conntrack_lock);
1122                 return;
1123         }
1124
1125         /* If not in hash table, timer will not be active yet */
1126         if (!is_confirmed(ct)) {
1127                 ct->timeout.expires = extra_jiffies;
1128                 event = IPCT_REFRESH;
1129         } else {
1130                 /* Need del_timer for race avoidance (may already be dying). */
1131                 if (del_timer(&ct->timeout)) {
1132                         ct->timeout.expires = jiffies + extra_jiffies;
1133                         add_timer(&ct->timeout);
1134                         event = IPCT_REFRESH;
1135                 }
1136         }
1137
1138 #ifdef CONFIG_IP_NF_CT_ACCT
1139         if (do_acct) {
1140                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1141                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1142                                                 ntohs(skb->nh.iph->tot_len);
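                /* Once a counter crosses the high bit, flag the event so it
                 * can be reported before the counter wraps. */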
1143                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1144                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1145                         event |= IPCT_COUNTER_FILLING;
1146         }
1147 #endif
1148
1149         write_unlock_bh(&ip_conntrack_lock);
1150
1151         /* must be unlocked when calling event cache */
1152         if (event)
1153                 ip_conntrack_event_cache(event, skb);
1154 }
1155
1156 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1157     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1158 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1159  * in ip_conntrack_core, since we don't want the protocols to autoload
1160  * or depend on ctnetlink */
1161 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1162                                const struct ip_conntrack_tuple *tuple)
1163 {
1164         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1165                 &tuple->src.u.tcp.port);
1166         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1167                 &tuple->dst.u.tcp.port);
1168         return 0;
1169
1170 nfattr_failure:
1171         return -1;
1172 }
1173
1174 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1175                                struct ip_conntrack_tuple *t)
1176 {
1177         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1178                 return -EINVAL;
1179
1180         t->src.u.tcp.port =
1181                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1182         t->dst.u.tcp.port =
1183                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1184
1185         return 0;
1186 }
1187 #endif
1188
1189 /* Returns new sk_buff, or NULL */
1190 struct sk_buff *
1191 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1192 {
1193         skb_orphan(skb);
1194
1195         local_bh_disable(); 
1196         skb = ip_defrag(skb, user);
1197         local_bh_enable();
1198
1199         if (skb)
1200                 ip_send_check(skb->nh.iph);
1201         return skb;
1202 }
1203
1204 /* Used by ipt_REJECT. */
1205 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1206 {
1207         struct ip_conntrack *ct;
1208         enum ip_conntrack_info ctinfo;
1209
1210         /* This ICMP is in the reverse direction to the packet which caused it */
1211         ct = ip_conntrack_get(skb, &ctinfo);
1212         
1213         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1214                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1215         else
1216                 ctinfo = IP_CT_RELATED;
1217
1218         /* Attach to new skbuff, and increment count */
1219         nskb->nfct = &ct->ct_general;
1220         nskb->nfctinfo = ctinfo;
1221         nf_conntrack_get(nskb->nfct);
1222 }
1223
1224 /* Bring out ya dead! */
1225 static struct ip_conntrack *
1226 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1227                 void *data, unsigned int *bucket)
1228 {
1229         struct ip_conntrack_tuple_hash *h;
1230         struct ip_conntrack *ct;
1231
1232         write_lock_bh(&ip_conntrack_lock);
1233         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1234                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1235                         ct = tuplehash_to_ctrack(h);
1236                         if (iter(ct, data))
1237                                 goto found;
1238                 }
1239         }
1240         list_for_each_entry(h, &unconfirmed, list) {
1241                 ct = tuplehash_to_ctrack(h);
1242                 if (iter(ct, data))
1243                         goto found;
1244         }
1245         write_unlock_bh(&ip_conntrack_lock);
1246         return NULL;
1247
1248 found:
1249         atomic_inc(&ct->ct_general.use);
1250         write_unlock_bh(&ip_conntrack_lock);
1251         return ct;
1252 }
1253
1254 void
1255 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1256 {
1257         struct ip_conntrack *ct;
1258         unsigned int bucket = 0;
1259
1260         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1261                 /* Time to push up daisies... */
1262                 if (del_timer(&ct->timeout))
1263                         death_by_timeout((unsigned long)ct);
1264                 /* ... else the timer will get him soon. */
1265
1266                 ip_conntrack_put(ct);
1267         }
1268 }
1269
1270 /* Fast function for those who don't want to parse /proc (and I don't
1271    blame them). */
1272 /* Reversing the socket's dst/src point of view gives us the reply
1273    mapping. */
1274 static int
1275 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1276 {
1277         struct inet_sock *inet = inet_sk(sk);
1278         struct ip_conntrack_tuple_hash *h;
1279         struct ip_conntrack_tuple tuple;
1280         
1281         IP_CT_TUPLE_U_BLANK(&tuple);
1282         tuple.src.ip = inet->rcv_saddr;
1283         tuple.src.u.tcp.port = inet->sport;
1284         tuple.dst.ip = inet->daddr;
1285         tuple.dst.u.tcp.port = inet->dport;
1286         tuple.dst.protonum = IPPROTO_TCP;
1287
1288         /* We only do TCP at the moment: is there a better way? */
1289         if (strcmp(sk->sk_prot->name, "TCP")) {
1290                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1291                 return -ENOPROTOOPT;
1292         }
1293
1294         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1295                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1296                        *len, sizeof(struct sockaddr_in));
1297                 return -EINVAL;
1298         }
1299
1300         h = ip_conntrack_find_get(&tuple, NULL);
1301         if (h) {
1302                 struct sockaddr_in sin;
1303                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1304
1305                 sin.sin_family = AF_INET;
1306                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1307                         .tuple.dst.u.tcp.port;
1308                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309                         .tuple.dst.ip;
1310                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1311
1312                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1313                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1314                 ip_conntrack_put(ct);
1315                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1316                         return -EFAULT;
1317                 else
1318                         return 0;
1319         }
1320         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1321                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1322                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1323         return -ENOENT;
1324 }
1325
1326 static struct nf_sockopt_ops so_getorigdst = {
1327         .pf             = PF_INET,
1328         .get_optmin     = SO_ORIGINAL_DST,
1329         .get_optmax     = SO_ORIGINAL_DST+1,
1330         .get            = &getorigdst,
1331 };
1332
1333 static int kill_all(struct ip_conntrack *i, void *data)
1334 {
1335         return 1;
1336 }
1337
1338 void ip_conntrack_flush(void)
1339 {
1340         ip_ct_iterate_cleanup(kill_all, NULL);
1341 }
1342
1343 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1344 {
1345         if (vmalloced)
1346                 vfree(hash);
1347         else
1348                 free_pages((unsigned long)hash, 
1349                            get_order(sizeof(struct list_head) * size));
1350 }
1351
1352 /* Mishearing the voices in his head, our hero wonders how he's
1353    supposed to kill the mall. */
1354 void ip_conntrack_cleanup(void)
1355 {
1356         ip_ct_attach = NULL;
1357
1358         /* This makes sure all current packets have passed through
1359            netfilter framework.  Roll on, two-stage module
1360            delete... */
1361         synchronize_net();
1362
1363         ip_ct_event_cache_flush();
1364  i_see_dead_people:
1365         ip_conntrack_flush();
1366         if (atomic_read(&ip_conntrack_count) != 0) {
1367                 schedule();
1368                 goto i_see_dead_people;
1369         }
1370         /* wait until all references to ip_conntrack_untracked are dropped */
1371         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1372                 schedule();
1373
1374         kmem_cache_destroy(ip_conntrack_cachep);
1375         kmem_cache_destroy(ip_conntrack_expect_cachep);
1376         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1377                             ip_conntrack_htable_size);
1378         nf_unregister_sockopt(&so_getorigdst);
1379 }
1380
1381 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1382 {
1383         struct list_head *hash;
1384         unsigned int i;
1385
1386         *vmalloced = 0; 
1387         hash = (void*)__get_free_pages(GFP_KERNEL, 
1388                                        get_order(sizeof(struct list_head)
1389                                                  * size));
1390         if (!hash) { 
1391                 *vmalloced = 1;
1392                 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1393                 hash = vmalloc(sizeof(struct list_head) * size);
1394         }
1395
1396         if (hash)
1397                 for (i = 0; i < size; i++)
1398                         INIT_LIST_HEAD(&hash[i]);
1399
1400         return hash;
1401 }
1402
1403 static int set_hashsize(const char *val, struct kernel_param *kp)
1404 {
1405         int i, bucket, hashsize, vmalloced;
1406         int old_vmalloced, old_size;
1407         int rnd;
1408         struct list_head *hash, *old_hash;
1409         struct ip_conntrack_tuple_hash *h;
1410
1411         /* On boot, we can set this without any fancy locking. */
1412         if (!ip_conntrack_htable_size)
1413                 return param_set_int(val, kp);
1414
1415         hashsize = simple_strtol(val, NULL, 0);
1416         if (!hashsize)
1417                 return -EINVAL;
1418
1419         hash = alloc_hashtable(hashsize, &vmalloced);
1420         if (!hash)
1421                 return -ENOMEM;
1422
1423         /* We have to rehash for the new table anyway, so we can also
1424          * use a new random seed */
1425         get_random_bytes(&rnd, 4);
1426
1427         write_lock_bh(&ip_conntrack_lock);
1428         for (i = 0; i < ip_conntrack_htable_size; i++) {
1429                 while (!list_empty(&ip_conntrack_hash[i])) {
1430                         h = list_entry(ip_conntrack_hash[i].next,
1431                                        struct ip_conntrack_tuple_hash, list);
1432                         list_del(&h->list);
1433                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1434                         list_add_tail(&h->list, &hash[bucket]);
1435                 }
1436         }
1437         old_size = ip_conntrack_htable_size;
1438         old_vmalloced = ip_conntrack_vmalloc;
1439         old_hash = ip_conntrack_hash;
1440
1441         ip_conntrack_htable_size = hashsize;
1442         ip_conntrack_vmalloc = vmalloced;
1443         ip_conntrack_hash = hash;
1444         ip_conntrack_hash_rnd = rnd;
1445         write_unlock_bh(&ip_conntrack_lock);
1446
1447         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1448         return 0;
1449 }
1450
1451 module_param_call(hashsize, set_hashsize, param_get_uint,
1452                   &ip_conntrack_htable_size, 0600);
1453
1454 int __init ip_conntrack_init(void)
1455 {
1456         unsigned int i;
1457         int ret;
1458
1459         /* Idea from tcp.c: use 1/16384 of memory.  On i386 a 32MB
1460          * machine has 256 buckets; >= 1GB machines get 8192 buckets. */
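        /* e.g. 32MB / 16384 = 2048 bytes of table budget; with 8-byte list
         * heads that is 256 buckets, and the >= 1GB case is capped at 8192
         * by the check below. */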
1461         if (!ip_conntrack_htable_size) {
1462                 ip_conntrack_htable_size
1463                         = (((num_physpages << PAGE_SHIFT) / 16384)
1464                            / sizeof(struct list_head));
1465                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1466                         ip_conntrack_htable_size = 8192;
1467                 if (ip_conntrack_htable_size < 16)
1468                         ip_conntrack_htable_size = 16;
1469         }
1470         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1471
1472         printk("ip_conntrack version %s (%u buckets, %d max)"
1473                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1474                ip_conntrack_htable_size, ip_conntrack_max,
1475                sizeof(struct ip_conntrack));
1476
1477         ret = nf_register_sockopt(&so_getorigdst);
1478         if (ret != 0) {
1479                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1480                 return ret;
1481         }
1482
1483         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1484                                             &ip_conntrack_vmalloc);
1485         if (!ip_conntrack_hash) {
1486                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1487                 goto err_unreg_sockopt;
1488         }
1489
1490         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1491                                                 sizeof(struct ip_conntrack), 0,
1492                                                 0, NULL, NULL);
1493         if (!ip_conntrack_cachep) {
1494                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1495                 goto err_free_hash;
1496         }
1497
1498         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1499                                         sizeof(struct ip_conntrack_expect),
1500                                         0, 0, NULL, NULL);
1501         if (!ip_conntrack_expect_cachep) {
1502                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1503                 goto err_free_conntrack_slab;
1504         }
1505
1506         /* Don't NEED lock here, but good form anyway. */
1507         write_lock_bh(&ip_conntrack_lock);
1508         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1509                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1510         /* Sew in builtin protocols. */
1511         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1512         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1513         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1514         write_unlock_bh(&ip_conntrack_lock);
1515
1516         /* For use by ipt_REJECT */
1517         ip_ct_attach = ip_conntrack_attach;
1518
1519         /* Set up fake conntrack:
1520             - to never be deleted, not in any hashes */
1521         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1522         /*  - and make it look like a confirmed connection */
1523         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1524
1525         return ret;
1526
1527 err_free_conntrack_slab:
1528         kmem_cache_destroy(ip_conntrack_cachep);
1529 err_free_hash:
1530         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1531                             ip_conntrack_htable_size);
1532 err_unreg_sockopt:
1533         nf_unregister_sockopt(&so_getorigdst);
1534
1535         return -ENOMEM;
1536 }